In [None]:
# Mount Google Drive at /content/drive. The dataset and submission paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
## import packages
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and library warnings that
# could point at real problems are hidden too -- consider narrowing it.
warnings.simplefilter('ignore')

In [None]:
## load data
# Read the competition CSVs from the mounted Drive folder.
train = pl.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/insurance_selling/datasets/train.csv')

# The test frame gets a placeholder Int64 `Response` column filled with zeros, added
# right after it is read, so that it can later be concatenated with `train`.
test = (
    pl.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/insurance_selling/datasets/test.csv')
    .with_columns(pl.lit(0, dtype=pl.Int64).alias('Response'))
)

In [None]:
## prepare data
# Stack train on top of test so every encoding below is computed over both at once.
df = pl.concat([train, test])

# Replace the string categories with integer codes and cast the remaining
# listed columns to integers.
df = df.with_columns(
    pl.col('Gender').replace({'Male': 0, 'Female': 1}).cast(pl.Int32),
    pl.col('Region_Code').cast(pl.Int64),
    pl.col('Vehicle_Age').replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace({'No': 0, 'Yes': 1}).cast(pl.Int32),
    pl.col('Annual_Premium').cast(pl.Int64),
    pl.col('Policy_Sales_Channel').cast(pl.Int64),
)

# Interaction features: concatenate Previously_Insured with a partner column as text
# and label-encode the combined string with pd.factorize (codes are assigned in
# order of first appearance). One new column is produced per partner, in this order.
df = df.with_columns([
    pl.Series(
        name=f'Previously_Insured_{partner}',
        values=pd.factorize(
            (df['Previously_Insured'].cast(pl.Utf8) + df[partner].cast(pl.Utf8)).to_numpy()
        )[0],
    )
    for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage')
])

# Split the stacked frame back apart by row count and hand pandas frames to the
# modelling cells: the first len(train) rows are train, everything after is test.
train = df.head(train.shape[0]).to_pandas()
test = df.slice(train.shape[0]).to_pandas()

train

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Previously_Insured_Annual_Premium,Previously_Insured_Vehicle_Age,Previously_Insured_Vehicle_Damage,Previously_Insured_Vintage
0,0,0,21,1,35,0,1,1,65101,124,187,0,0,0,0,0
1,1,0,43,1,28,0,2,1,58911,26,288,1,1,1,0,1
2,2,1,25,1,14,1,0,0,38043,152,254,0,2,2,1,2
3,3,1,35,1,1,0,1,1,2630,156,76,0,3,0,0,3
4,4,1,36,1,15,1,1,0,31951,152,294,0,4,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11504793,11504793,0,48,1,6,0,1,1,27412,26,218,0,5210,0,0,144
11504794,11504794,1,26,1,36,0,0,1,29509,152,115,1,23274,4,0,176
11504795,11504795,1,29,1,32,1,0,0,2630,152,189,0,18,2,1,456
11504796,11504796,1,51,1,28,0,1,1,48443,26,274,1,14121,0,0,124


In [None]:
## train model
# Per-fold validation AUCs and per-fold test-set probabilities (averaged later).
aucs = []
preds = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The feature list and the test Pool do not depend on the fold, so they are built
# once here instead of being re-created on every iteration of the loop.
feature_cols = [c for c in train.columns if c not in ['id', 'Response']]
X_test = test[feature_cols]
# Every feature column is declared categorical to CatBoost (here and in the
# train/valid Pools below).
X_test_pool = Pool(X_test, cat_features=X_test.columns.values)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    print(f'### Fold {fold+1} Training ###')

    # skf.split yields positional indices; .loc works with them here because
    # `train` carries the default 0..n-1 index.
    X_train = train.loc[train_idx, feature_cols]
    y_train = train.loc[train_idx, 'Response']
    X_valid = train.loc[valid_idx, feature_cols]
    y_valid = train.loc[valid_idx, 'Response']

    X_train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=X_valid.columns.values)

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=0.05,
        iterations=5000,
        depth=9,
        random_strength=0,
        l2_leaf_reg=0.5,
        task_type='GPU',
        random_seed=42,
        verbose=False
    )

    # The validation fold is the eval set for early stopping. The verbose=1000 given
    # here is what the recorded log reflects (one progress line per 1000 iterations).
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=200)

    # Positive-class probabilities: validation for scoring, test for the ensemble.
    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    preds.append(model.predict_proba(X_test_pool)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

# Reported figure is the mean and standard deviation of the per-fold AUCs.
print(f'\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}')

### Fold 1 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8757758	best: 0.8757758 (0)	total: 799ms	remaining: 1h 6m 33s
1000:	test: 0.8946300	best: 0.8946300 (1000)	total: 14m 25s	remaining: 57m 39s
2000:	test: 0.8949861	best: 0.8949866 (1994)	total: 28m 41s	remaining: 42m 59s
3000:	test: 0.8950861	best: 0.8950871 (2993)	total: 43m	remaining: 28m 38s
4000:	test: 0.8951309	best: 0.8951325 (3957)	total: 57m 20s	remaining: 14m 19s
bestTest = 0.8951368332
bestIteration = 4189
Shrink model to first 4190 iterations.
Fold 1 AUC: 0.89514

### Fold 2 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8751965	best: 0.8751965 (0)	total: 794ms	remaining: 1h 6m 7s
1000:	test: 0.8942412	best: 0.8942412 (1000)	total: 14m 25s	remaining: 57m 36s
2000:	test: 0.8946090	best: 0.8946090 (1999)	total: 28m 33s	remaining: 42m 48s
3000:	test: 0.8947293	best: 0.8947300 (2982)	total: 42m 50s	remaining: 28m 32s
4000:	test: 0.8947675	best: 0.8947684 (3989)	total: 56m 57s	remaining: 14m 13s
bestTest = 0.8947761953
bestIteration = 4229
Shrink model to first 4230 iterations.
Fold 2 AUC: 0.89478

### Fold 3 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8754973	best: 0.8754973 (0)	total: 792ms	remaining: 1h 5m 58s
1000:	test: 0.8945488	best: 0.8945488 (1000)	total: 14m 24s	remaining: 57m 32s
2000:	test: 0.8948848	best: 0.8948848 (2000)	total: 28m 38s	remaining: 42m 55s
3000:	test: 0.8949862	best: 0.8949864 (2999)	total: 42m 55s	remaining: 28m 35s
bestTest = 0.8950245976
bestIteration = 3746
Shrink model to first 3747 iterations.
Fold 3 AUC: 0.89502

### Fold 4 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8751477	best: 0.8751477 (0)	total: 957ms	remaining: 1h 19m 44s
1000:	test: 0.8943317	best: 0.8943319 (999)	total: 14m 27s	remaining: 57m 45s
2000:	test: 0.8946844	best: 0.8946850 (1990)	total: 28m 41s	remaining: 42m 59s
3000:	test: 0.8947928	best: 0.8947928 (3000)	total: 42m 50s	remaining: 28m 32s
4000:	test: 0.8948307	best: 0.8948310 (3944)	total: 57m	remaining: 14m 14s
bestTest = 0.8948370516
bestIteration = 4174
Shrink model to first 4175 iterations.
Fold 4 AUC: 0.89484

### Fold 5 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8762398	best: 0.8762398 (0)	total: 794ms	remaining: 1h 6m 11s
1000:	test: 0.8950258	best: 0.8950258 (1000)	total: 14m 19s	remaining: 57m 15s
2000:	test: 0.8953809	best: 0.8953809 (1998)	total: 28m 30s	remaining: 42m 43s
3000:	test: 0.8954781	best: 0.8954783 (2999)	total: 42m 47s	remaining: 28m 30s
4000:	test: 0.8955048	best: 0.8955050 (3999)	total: 56m 57s	remaining: 14m 13s
bestTest = 0.8955051303
bestIteration = 4029
Shrink model to first 4030 iterations.
Fold 5 AUC: 0.89551


Overall AUC: 0.89506 +/- 0.00026


In [None]:
## create submission
# Average the per-fold test probabilities into one prediction per test row.
# .assign returns a new frame, so no column is written into a selection taken from
# `test` (the chained-assignment pattern that triggers SettingWithCopyWarning).
submission = test[['id']].assign(Response=np.mean(preds, axis=0))

submission.to_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/insurance_selling/submission.csv', index=False)
submission

Unnamed: 0,id,Response
0,11504798,0.004739
1,11504799,0.667146
2,11504800,0.237916
3,11504801,0.000084
4,11504802,0.246370
...,...,...
7669861,19174659,0.197856
7669862,19174660,0.000147
7669863,19174661,0.000491
7669864,19174662,0.589127
