In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [10]:
## import packages
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.simplefilter('ignore')

In [13]:
## load data
train = pl.read_csv(r'playground-series-s4e7\train.csv')
test = pl.read_csv(r'playground-series-s4e7\test.csv')
test = test.with_columns(pl.lit(0).cast(pl.Int64).alias('Response'))

In [21]:
# 데이터 준비
train_data = train.clone()
test_data = test.clone()

# train_data와 test_data 합치기
df = pl.concat([train_data, test_data])

df = df.with_columns([
    pl.col('Gender').replace({'Male': 0, 'Female': 1}).cast(pl.Int32),
    pl.col('Region_Code').cast(int),
    pl.col('Vehicle_Age').replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace({'No': 0, 'Yes': 1}).cast(pl.Int32),
    pl.col('Annual_Premium').cast(int),
    pl.col('Policy_Sales_Channel').cast(int)
])

df = df.with_columns([
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Annual_Premium'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Annual_Premium'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Age'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vehicle_Age'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Damage'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vehicle_Damage'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vintage'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vintage')
])

minmax_scaler = MinMaxScaler()
df[['Annual_Premium_MinMax', 'Vintage_MinMax']] = minmax_scaler.fit_transform(df[['Annual_Premium', 'Vintage']])

train_df = df[:train_data.shape[0]].to_pandas()
test_df = df[train_data.shape[0]:].to_pandas()

cat_features = ['Gender', 'Region_Code', 'Vehicle_Age', 'Vehicle_Damage', 'Previously_Insured', 'Policy_Sales_Channel', 
                'Previously_Insured_Annual_Premium', 'Previously_Insured_Vehicle_Age', 'Previously_Insured_Vehicle_Damage', 'Previously_Insured_Vintage']

train_df

id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
i64,str,i64,i64,f64,i64,str,str,f64,f64,i64,i64
0,"""Male""",21,1,35.0,0,"""1-2 Year""","""Yes""",65101.0,124.0,187,0
1,"""Male""",43,1,28.0,0,"""> 2 Years""","""Yes""",58911.0,26.0,288,1
2,"""Female""",25,1,14.0,1,"""< 1 Year""","""No""",38043.0,152.0,254,0
3,"""Female""",35,1,1.0,0,"""1-2 Year""","""Yes""",2630.0,156.0,76,0
4,"""Female""",36,1,15.0,1,"""1-2 Year""","""No""",31951.0,152.0,294,0
…,…,…,…,…,…,…,…,…,…,…,…
11504793,"""Male""",48,1,6.0,0,"""1-2 Year""","""Yes""",27412.0,26.0,218,0
11504794,"""Female""",26,1,36.0,0,"""< 1 Year""","""Yes""",29509.0,152.0,115,1
11504795,"""Female""",29,1,32.0,1,"""< 1 Year""","""No""",2630.0,152.0,189,0
11504796,"""Female""",51,1,28.0,0,"""1-2 Year""","""Yes""",48443.0,26.0,274,1


In [22]:
## train model
aucs = []
preds = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train_df, train_df['Response'])):
    print(f'### Fold {fold+1} Training ###')

    X_train = train_df.loc[train_idx, [c for c in train_df.columns if c not in ['id', 'Response']]]
    y_train = train_df.loc[train_idx, 'Response']
    X_valid = train_df.loc[valid_idx, X_train.columns]
    y_valid = train_df.loc[valid_idx, 'Response']
    X_test = test_df[X_train.columns]

    X_train_pool = Pool(X_train, y_train, cat_features=cat_features)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)
    X_test_pool = Pool(X_test, cat_features=cat_features)

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=0.05,
        iterations=5000,
        depth=9,
        random_strength=0,
        l2_leaf_reg=0.5,
        task_type='GPU',
        random_seed=42,
        verbose=False
    )

    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=200)

    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    preds.append(model.predict_proba(X_test_pool)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

print(f'\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}')

### Fold 1 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8765283	best: 0.8765283 (0)	total: 758ms	remaining: 1h 3m 7s
1000:	test: 0.8943628	best: 0.8943628 (1000)	total: 13m 51s	remaining: 55m 20s
2000:	test: 0.8947250	best: 0.8947250 (2000)	total: 27m 41s	remaining: 41m 30s
3000:	test: 0.8948618	best: 0.8948620 (2995)	total: 48m 43s	remaining: 32m 27s
bestTest = 0.8949000239
bestIteration = 3549
Shrink model to first 3550 iterations.
Fold 1 AUC: 0.89490

### Fold 2 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8757312	best: 0.8757312 (0)	total: 563ms	remaining: 46m 56s
1000:	test: 0.8939885	best: 0.8939885 (1000)	total: 18m 43s	remaining: 1h 14m 50s
2000:	test: 0.8943479	best: 0.8943481 (1999)	total: 43m 10s	remaining: 1h 4m 43s
3000:	test: 0.8944547	best: 0.8944553 (2957)	total: 1h 6m 46s	remaining: 44m 28s
4000:	test: 0.8945054	best: 0.8945063 (3993)	total: 1h 27m 4s	remaining: 21m 44s
bestTest = 0.8945185542
bestIteration = 4722
Shrink model to first 4723 iterations.
Fold 2 AUC: 0.89452

### Fold 3 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8763183	best: 0.8763183 (0)	total: 643ms	remaining: 53m 32s
1000:	test: 0.8942968	best: 0.8942968 (1000)	total: 14m 9s	remaining: 56m 35s
2000:	test: 0.8946223	best: 0.8946223 (2000)	total: 32m 40s	remaining: 48m 57s
3000:	test: 0.8947343	best: 0.8947343 (3000)	total: 57m 4s	remaining: 38m
4000:	test: 0.8947803	best: 0.8947819 (3969)	total: 1h 14m 13s	remaining: 18m 31s
bestTest = 0.8947911859
bestIteration = 4339
Shrink model to first 4340 iterations.
Fold 3 AUC: 0.89479

### Fold 4 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8763419	best: 0.8763419 (0)	total: 838ms	remaining: 1h 9m 49s
1000:	test: 0.8941040	best: 0.8941040 (1000)	total: 14m 29s	remaining: 57m 53s
2000:	test: 0.8944540	best: 0.8944540 (2000)	total: 34m 6s	remaining: 51m 6s
3000:	test: 0.8945985	best: 0.8945985 (3000)	total: 50m 37s	remaining: 33m 43s
4000:	test: 0.8946546	best: 0.8946548 (3995)	total: 1h 6m 46s	remaining: 16m 40s
4999:	test: 0.8946835	best: 0.8946884 (4921)	total: 1h 22m 52s	remaining: 0us
bestTest = 0.8946884274
bestIteration = 4921
Shrink model to first 4922 iterations.
Fold 4 AUC: 0.89469

### Fold 5 Training ###


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8768645	best: 0.8768645 (0)	total: 629ms	remaining: 52m 26s
1000:	test: 0.8947490	best: 0.8947490 (999)	total: 13m 2s	remaining: 52m 5s
2000:	test: 0.8951185	best: 0.8951187 (1999)	total: 26m 22s	remaining: 39m 32s
3000:	test: 0.8952327	best: 0.8952327 (3000)	total: 40m 3s	remaining: 26m 40s
4000:	test: 0.8952767	best: 0.8952767 (4000)	total: 53m 40s	remaining: 13m 23s
bestTest = 0.8952852488
bestIteration = 4334
Shrink model to first 4335 iterations.
Fold 5 AUC: 0.89529


Overall AUC: 0.89484 +/- 0.00026


In [23]:
## create submission
submission = test[['id']]
submission['Response'] = np.mean(preds, axis=0)

submission.to_csv('submission_240717_2.csv', index=False)
submission

TypeError: DataFrame object does not support `Series` assignment by index

Use `DataFrame.with_columns`.