In [1]:
## import packages
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.simplefilter('ignore')

In [2]:
## load data
# Training frame is read unchanged from the pre-processed CSV.
train = pl.read_csv('contents/train_kaggle_2.csv')

# Test frame is read and, in the same expression, given a constant Int64
# 'Response' column of zeros (a placeholder value, not a real label).
# NOTE(review): presumably added so test mirrors the train schema - confirm.
test = pl.read_csv('contents/test_kaggle.csv').with_columns(
    pl.lit(0).cast(pl.Int64).alias('Response')
)

In [3]:
# Convert the polars training frame to pandas for the modelling cells below.
# The isinstance guard keeps this cell idempotent: once `train` has been rebound
# to a pandas DataFrame, re-running the cell would otherwise fail because pandas
# frames have no `.to_pandas()` method.
if isinstance(train, pl.DataFrame):
    train = train.to_pandas()
train.head(5)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,...,Previously_Insured_Vehicle_Damage,Gender_Driving_License_Previously_Insured,Gender_Driving_License_Vehicle_Damage,Gender_Previously_Insured_Vehicle_Damage,Driving_License_Previously_Insured_Vehicle_Damage,Gender_Driving_License_Previously_Insured_Vehicle_Damage,Age_by_Vehicle_Age,Age_by_Driving_License,Age_by_Vehicle_Damage,Vehicle_Age_by_Vehicle_Damage
0,0,1.035,1,19,0,0.5,0,-0.04166,25,-0.5547,...,0,10,10,0,100,100,0.5176,1.035,0.0,0.0
1,0,0.4197,1,23,0,0.49,1,-0.003643,155,0.9517,...,1,10,11,1,101,101,0.2057,0.4197,0.4197,0.49
2,0,0.3218,1,13,1,0.5,0,-0.06604,14,-0.2001,...,10,11,10,10,110,110,0.1609,0.3218,0.0,0.0
3,0,0.08124,1,2,0,0.4336,1,-0.01257,154,0.3538,...,1,10,11,1,101,101,0.03525,0.08124,0.08124,0.4336
4,0,1.344,1,11,1,0.5,1,-0.00341,4,0.477,...,11,11,11,11,111,111,0.672,1.344,1.344,0.5


In [4]:
# Convert the polars test frame to pandas for the modelling cells below.
# The isinstance guard keeps this cell idempotent: once `test` has been rebound
# to a pandas DataFrame, re-running the cell would otherwise fail because pandas
# frames have no `.to_pandas()` method.
if isinstance(test, pl.DataFrame):
    test = test.to_pandas()
test.head(5)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,...,Gender_Driving_License_Previously_Insured,Gender_Driving_License_Vehicle_Damage,Gender_Previously_Insured_Vehicle_Damage,Driving_License_Previously_Insured_Vehicle_Damage,Gender_Driving_License_Previously_Insured_Vehicle_Damage,Age_by_Vehicle_Age,Age_by_Driving_License,Age_by_Vehicle_Damage,Vehicle_Age_by_Vehicle_Damage,Response
0,0,-1.503906,1,47,0,0.0,0,-0.229478,160,0.801483,...,10,10,0,100,100,-0.0,-1.503906,-0.0,0.0,0
1,1,0.719727,1,28,0,0.5,1,0.040216,124,-0.511353,...,110,111,101,101,1101,0.359863,0.719727,0.719727,0.5,0
2,1,0.719727,1,43,0,0.5,1,-0.229478,26,1.339121,...,110,111,101,101,1101,0.359863,0.719727,0.719727,0.5,0
3,0,-1.253906,1,47,1,0.0,0,-0.05218,152,-0.611378,...,11,10,10,110,110,-0.0,-1.253906,-0.0,0.0,0
4,1,0.933594,1,19,0,0.5,0,0.016722,124,-0.198773,...,110,110,100,100,1100,0.466797,0.933594,0.0,0.0,0


In [9]:
# Stratified K-fold training of a CatBoost classifier.
# Outputs consumed by later cells:
#   aucs  - validation AUC of each fold
#   preds - per-fold positive-class probabilities for the test set
aucs = []
preds = []

# Cross-validation settings, named once so the splitter and the model share them.
N_SPLITS = 5
SEED = 42

# Identifying categorical features explicitly
categorical_features = [c for c in train.columns if train[c].dtype == 'object' or train[c].dtype == 'category']
# Every remaining column is a numeric feature, except the identifier and the target.
non_numeric = set(categorical_features) | {'id', 'Response'}
numeric_features = [c for c in train.columns if c not in non_numeric]
feature_cols = numeric_features + categorical_features

# Ensure categorical features are treated as strings.
# Done once on the full frames, so the per-fold slices below need no further casting.
if categorical_features:
    train[categorical_features] = train[categorical_features].astype(str)
    test[categorical_features] = test[categorical_features].astype(str)

# skf.split yields positional row indices, so rows are selected with .iloc;
# label-based .loc would only be correct while `train` has a default RangeIndex.
feature_pos = train.columns.get_indexer(feature_cols)
target_pos = train.columns.get_loc('Response')

# The test features do not depend on the fold: build the matrix and Pool once
# instead of rebuilding them on every iteration.
X_test = test[feature_cols]
X_test_pool = Pool(X_test, cat_features=categorical_features)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    print(f'### Fold {fold+1} Training ###')

    X_train = train.iloc[train_idx, feature_pos]
    y_train = train.iloc[train_idx, target_pos]
    X_valid = train.iloc[valid_idx, feature_pos]
    y_valid = train.iloc[valid_idx, target_pos]

    # Create Pool objects for CatBoost
    X_train_pool = Pool(X_train, y_train, cat_features=categorical_features)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=categorical_features)

    # Initialize and fit the model
    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=0.05,
        iterations=500,
        depth=9,
        random_strength=0,
        l2_leaf_reg=0.5,
        task_type='GPU',
        random_seed=SEED,
        verbose=False
    )

    # The validation fold doubles as the early-stopping set.
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=100, early_stopping_rounds=200)

    # Predictions and evaluation (column 1 = probability of the positive class)
    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    preds.append(model.predict_proba(X_test_pool)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

# Mean and spread of the per-fold validation AUCs.
print(f'\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}')

### Fold 1 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8196584	best: 0.8196584 (0)	total: 74.5ms	remaining: 6m 12s
100:	test: 0.9455897	best: 0.9455897 (100)	total: 6.93s	remaining: 5m 36s


KeyboardInterrupt: 

In [7]:
# Read the original test file with polars and convert it to pandas in one step;
# the submission cell takes its 'id' column from this frame.
test_id = pl.read_csv('contents/test.csv').to_pandas()

In [8]:
## create submission
# Refuse to write a submission when no fold finished (e.g. the training cell was
# interrupted or never run): averaging an empty list would silently yield NaN.
if len(preds) == 0:
    raise RuntimeError(
        'No fold predictions in `preds`; run the cross-validation cell to completion first.'
    )

# Average the per-fold test probabilities row-wise and attach them to the ids.
# .assign builds a new frame, avoiding assignment into a column-subset of test_id.
submission = test_id[['id']].assign(Response=np.mean(preds, axis=0))

submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id,Response
0,11504798,0.997118
1,11504799,0.987900
2,11504800,0.997053
3,11504801,0.999526
4,11504802,0.994716
...,...,...
7669861,19174659,0.983807
7669862,19174660,0.999005
7669863,19174661,0.007442
7669864,19174662,0.999990
