In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [28]:
## import packages
import os
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.simplefilter('ignore')

os.environ["CUDA_VISIBLE_DEVICES"]= "0"

In [29]:
## load data
train = pl.read_csv(r'playground-series-s4e7\train.csv')
test = pl.read_csv(r'playground-series-s4e7\test.csv')
test = test.with_columns(pl.lit(0).cast(pl.Int64).alias('Response'))

In [32]:
# 데이터 준비
train_data = train.clone()
test_data = test.clone()

# train_data와 test_data 합치기
df = pl.concat([train_data, test_data])

df = df.with_columns([
    pl.col('Gender').replace({'Male': 0, 'Female': 1}).cast(pl.Int32),
    pl.col('Region_Code').cast(int),
    pl.col('Vehicle_Age').replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace({'No': 0, 'Yes': 1}).cast(pl.Int32),
    pl.col('Annual_Premium').cast(int),
    pl.col('Policy_Sales_Channel').cast(int)
])

df = df.with_columns([
    pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Annual_Premium'].cast(str)).to_numpy())[0]).alias('Previously_Insured_Annual_Premium'),
    pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Age'].cast(str)).to_numpy())[0]).alias('Previously_Insured_Vehicle_Age'),
    pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Damage'].cast(str)).to_numpy())[0]).alias('Previously_Insured_Vehicle_Damage'),
    pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vintage'].cast(str)).to_numpy())[0]).alias('Previously_Insured_Vintage')
])

train_df = df[:train_data.shape[0]].to_pandas()
test_df = df[train_data.shape[0]:].to_pandas()

cat_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Previously_Insured']


In [34]:
# 모델 학습 및 예측
aucs = []
preds = []

X = train_df.drop(['id', 'Response'], axis=1)
y = train_df['Response']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f'### Fold {fold+1} Training ###')

    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_valid = X.iloc[valid_idx]
    y_valid = y.iloc[valid_idx]
    X_test = test_df[X.columns]

    # 랜덤 포레스트 모델 학습
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_pred = rf_model.predict_proba(X_valid)[:, 1]

    # XGBoost 모델 학습
    xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.05, random_state=42, tree_method='gpu_hist')
    xgb_model.fit(X_train, y_train)
    xgb_pred = xgb_model.predict_proba(X_valid)[:, 1]

    # LightGBM 모델 학습
    lgbm_model = LGBMClassifier(n_estimators=100, learning_rate=0.05, random_state=42, device='gpu')
    lgbm_model.fit(X_train, y_train)
    lgbm_pred = lgbm_model.predict_proba(X_valid)[:, 1]

    # CatBoost 모델 학습
    X_train_pool = Pool(X_train, y_train, cat_features=cat_features)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)
    cat_model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=0.05,
        iterations=5000,
        depth=9,
        random_strength=0,
        l2_leaf_reg=0.5,
        task_type='GPU',
        random_seed=42,
        verbose=False
    )
    cat_model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=200)
    cat_pred = cat_model.predict_proba(X_valid_pool)[:, 1]

    # 앙상블 예측 (평균)
    ensemble_pred = (rf_pred + xgb_pred + lgbm_pred + cat_pred) / 4

    # AUC 계산
    auc = roc_auc_score(y_valid, ensemble_pred)
    aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

print(f'\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}')

### Fold 1 Training ###
[LightGBM] [Info] Number of positive: 1132047, number of negative: 8071791
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1254
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 14
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 13 dense feature groups (140.44 MB) transferred to GPU in 0.068940 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122997 -> initscore=-1.964348
[LightGBM] [Info] Start training from score -1.964348


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8531035	best: 0.8531035 (0)	total: 402ms	remaining: 33m 32s
1000:	test: 0.8797110	best: 0.8797110 (1000)	total: 4m 26s	remaining: 17m 45s
2000:	test: 0.8814330	best: 0.8814330 (2000)	total: 8m 50s	remaining: 13m 15s
3000:	test: 0.8822491	best: 0.8822491 (3000)	total: 13m 14s	remaining: 8m 49s
4000:	test: 0.8828071	best: 0.8828071 (4000)	total: 17m 43s	remaining: 4m 25s
4999:	test: 0.8831635	best: 0.8831637 (4998)	total: 22m 11s	remaining: 0us
bestTest = 0.8831636906
bestIteration = 4998
Shrink model to first 4999 iterations.
Fold 1 AUC: 0.87733

### Fold 2 Training ###
[LightGBM] [Info] Number of positive: 1132047, number of negative: 8071791
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1254
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 14
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Inf

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8527654	best: 0.8527654 (0)	total: 395ms	remaining: 32m 54s
1000:	test: 0.8796447	best: 0.8796447 (1000)	total: 4m 29s	remaining: 17m 57s
2000:	test: 0.8813141	best: 0.8813141 (1999)	total: 9m	remaining: 13m 29s
3000:	test: 0.8821759	best: 0.8821768 (2999)	total: 13m 27s	remaining: 8m 57s
4000:	test: 0.8827033	best: 0.8827033 (4000)	total: 17m 54s	remaining: 4m 28s
4999:	test: 0.8830464	best: 0.8830464 (4999)	total: 22m 25s	remaining: 0us
bestTest = 0.8830463886
bestIteration = 4999
Fold 2 AUC: 0.87723

### Fold 3 Training ###
[LightGBM] [Info] Number of positive: 1132047, number of negative: 8071791
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1255
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 14
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] 

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8526164	best: 0.8526164 (0)	total: 275ms	remaining: 22m 53s
1000:	test: 0.8798692	best: 0.8798692 (1000)	total: 6m 45s	remaining: 26m 59s
2000:	test: 0.8815796	best: 0.8815796 (2000)	total: 14m 45s	remaining: 22m 7s
3000:	test: 0.8823916	best: 0.8823921 (2998)	total: 22m 42s	remaining: 15m 7s
4000:	test: 0.8829439	best: 0.8829439 (4000)	total: 30m 42s	remaining: 7m 40s
4999:	test: 0.8832889	best: 0.8832891 (4997)	total: 38m 46s	remaining: 0us
bestTest = 0.8832890987
bestIteration = 4997
Shrink model to first 4998 iterations.
Fold 3 AUC: 0.87741

### Fold 4 Training ###
[LightGBM] [Info] Number of positive: 1132048, number of negative: 8071791
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 9203839, number of used features: 14
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Inf

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8525919	best: 0.8525919 (0)	total: 303ms	remaining: 25m 14s
1000:	test: 0.8796156	best: 0.8796156 (1000)	total: 4m 14s	remaining: 16m 54s
2000:	test: 0.8812644	best: 0.8812644 (1999)	total: 8m 29s	remaining: 12m 43s
3000:	test: 0.8820998	best: 0.8820998 (3000)	total: 12m 46s	remaining: 8m 30s
4000:	test: 0.8825966	best: 0.8825968 (3998)	total: 17m 5s	remaining: 4m 15s
4999:	test: 0.8829225	best: 0.8829225 (4999)	total: 21m 25s	remaining: 0us
bestTest = 0.8829224706
bestIteration = 4999
Fold 4 AUC: 0.87716

### Fold 5 Training ###
[LightGBM] [Info] Number of positive: 1132047, number of negative: 8071792
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1253
[LightGBM] [Info] Number of data points in the train set: 9203839, number of used features: 14
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGB

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8534671	best: 0.8534671 (0)	total: 263ms	remaining: 21m 53s
1000:	test: 0.8803353	best: 0.8803353 (1000)	total: 4m 14s	remaining: 16m 54s
2000:	test: 0.8819634	best: 0.8819634 (2000)	total: 8m 29s	remaining: 12m 43s
3000:	test: 0.8828566	best: 0.8828566 (3000)	total: 12m 46s	remaining: 8m 30s
4000:	test: 0.8833607	best: 0.8833607 (4000)	total: 17m 6s	remaining: 4m 16s
4999:	test: 0.8837300	best: 0.8837301 (4998)	total: 21m 26s	remaining: 0us
bestTest = 0.8837300539
bestIteration = 4998
Shrink model to first 4999 iterations.
Fold 5 AUC: 0.87780


Overall AUC: 0.87739 +/- 0.00022


In [35]:
## create submission
submission = test_df[['id']]
submission['Response'] = np.mean(preds, axis=0)

submission.to_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/insurance_selling/submission_240716.csv', index=False)
submission

OSError: Cannot save file into a non-existent directory: '\content\drive\MyDrive\Colab Notebooks\kaggle\insurance_selling'