In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import auc as auc_calc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from ucimlrepo import fetch_ucirepo 
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier,RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

from catboost import CatBoostRegressor, Pool, CatBoostClassifier

# データ整備

In [2]:
# Read the raw train/test CSV files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(train.shape,test.shape)

target = "Response"

# Tag every row with its origin so the two splits can be separated again
# after they have been processed together.
train["data"] = "train"
test["data"] = "test"

# The test file has no label column; fill it with the placeholder -1 so both
# frames end up with the same set of columns.
test[target] = -1

# Stack train on top of test under one fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)


(11504798, 12) (7669866, 11)


In [3]:
# Columns handled as categorical: label-encoded in new_features() and passed
# to CatBoost as cat_features when the pools are built.
cat_col = [
    "Gender",
    "Driving_License",
    "Region_Code",
    "Previously_Insured",
    "Vehicle_Age",
    "Vehicle_Damage",
    "Policy_Sales_Channel",
]
def new_features(data_input):
    """Return an encoded copy of ``data_input``; the input frame is not modified.

    Every column listed in the module-level ``cat_col`` is replaced by integer
    codes from a freshly fitted ``LabelEncoder``. The encoder is fitted on
    whatever rows are passed in, so rows encoded in the same call share one
    mapping per column.

    ``Age``, ``Annual_Premium``, ``Vintage`` and ``Response`` are cast to
    ``int``; any fractional part is discarded by the cast.

    Parameters
    ----------
    data_input : pandas.DataFrame
        Frame containing all ``cat_col`` columns and the four numeric columns
        named above. Other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``data_input``.
    """
    num_col=["Age","Annual_Premium","Vintage","Response"]
    df=data_input.copy()
    # One encoder per column. The fitted encoders are not kept because nothing
    # in this notebook maps the codes back to the original labels.
    for col in cat_col:
        df[col] = LabelEncoder().fit_transform(df[col])
    # Cast all numeric columns in one call; astype returns a new frame.
    return df.astype({col: int for col in num_col})
# Encode the combined train+test frame in one call so that both splits are
# label-encoded with the same fitted mapping per column.
data_new_feature=new_features(data)

In [4]:
# Split the encoded frame back into its two origins using the "data" tag
# column, then drop the tag (and, for the test part, the placeholder label).
df_train = (
    data_new_feature
    .loc[lambda frame: frame["data"].eq("train")]
    .drop(columns="data")
)
df_test = (
    data_new_feature
    .loc[lambda frame: frame["data"].eq("test")]
    .drop(columns=["data", "Response"])
)

In [5]:
# Sanity check of the encoded training frame; the rendered output is a
# truncated view (first and last rows), not the full 11.5M rows.
df_train

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,1,21,1,35,0,0,1,65101,119,187,0
1,1,1,43,1,28,0,2,1,58911,25,288,1
2,2,0,25,1,14,1,1,0,38043,144,254,0
3,3,0,35,1,1,0,0,1,2630,148,76,0
4,4,0,36,1,15,1,0,0,31951,144,294,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11504793,11504793,1,48,1,6,0,0,1,27412,25,218,0
11504794,11504794,0,26,1,36,0,1,1,29509,144,115,1
11504795,11504795,0,29,1,32,1,1,0,2630,144,189,0
11504796,11504796,0,51,1,28,0,0,1,48443,25,274,1


In [6]:
# Label vector and feature matrix for training. The row identifier carries no
# signal and the label must not leak into the features, so both are dropped.
y = df_train["Response"]
X = df_train.drop(columns=["id", "Response"])

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# OOF

In [7]:
# X_input=X.head(10000).copy()
# y_input=y.head(10000).copy()
# test_input=df_test.head(10000).copy().drop(columns=["id"])
# n_splits=2


In [8]:
# Collect out-of-fold (OOF) predictions from stratified K-fold CatBoost models.
def get_out_of_fold_predictions(X_input, y_input,test_input,params_input,n_splits=5):
    """Train one CatBoost classifier per stratified fold and collect predictions.

    For every fold a new ``CatBoostClassifier(**params_input)`` is fitted on
    the training part, evaluated on the held-out part, and used to predict the
    full test set. After the last fold three figures are shown: the ROC curve
    of the out-of-fold predictions, and the Logloss learning curve and feature
    importances of the last fold's model.

    Categorical columns are taken from the module-level ``cat_col`` list.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    y_input : pandas.Series
        Binary training labels aligned with ``X_input``.
    test_input : pandas.DataFrame
        Test features with the same columns as ``X_input``.
    params_input : dict
        Keyword arguments forwarded to ``CatBoostClassifier``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    oof_predictions : numpy.ndarray, shape (len(X_input),)
        Positive-class probability for each training row, predicted by the
        model that did not see that row.
    oof_test : numpy.ndarray, shape (len(test_input), n_splits)
        Positive-class probability for each test row, one column per fold.
    """
    import gc  # local import: only used to release per-fold memory below

    # Size the buffer from the argument rather than a notebook-level global so
    # the function also works when called on a subset of the data.
    oof_predictions = np.zeros(X_input.shape[0])
    oof_test = np.zeros((test_input.shape[0], n_splits))
    aucs = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # The test pool does not depend on the fold: build it once and reuse it
    # for every fold's test prediction instead of rebuilding it each time.
    test_pool = Pool(data=test_input, cat_features=cat_col)

    model = None
    for fold, (train_idx, val_index) in enumerate(skf.split(X_input, y_input)):
        print(f"Fold {fold + 1}")
        # Split the data for this fold.
        X_train, X_val = X_input.iloc[train_idx], X_input.iloc[val_index]
        y_train, y_val = y_input.iloc[train_idx], y_input.iloc[val_index]
        train_pool = Pool(data=X_train, label=y_train, cat_features=cat_col)
        val_pool = Pool(data=X_val, label=y_val, cat_features=cat_col)

        # Fit a fresh model. The train pool is passed as the first eval set
        # only so that its metrics appear in the learning curve; the held-out
        # pool is last (NOTE(review): the recorded training log shows `best`
        # tracking the last eval set - confirm against the CatBoost docs).
        model = CatBoostClassifier(**params_input)
        model.fit(train_pool, eval_set=[train_pool,val_pool], use_best_model=True, plot=True)

        # Predict from the already-built pools rather than the raw frames.
        y_pred = model.predict_proba(val_pool)[:,1]

        # Per-fold validation AUC.
        auc = roc_auc_score(y_val, y_pred)
        print(f'AUC: {auc:.4f}')
        aucs.append(auc)

        oof_predictions[val_index] = y_pred
        oof_test[:,fold]=model.predict_proba(test_pool)[:,1]

        # Free this fold's frames and pools before the next fold allocates
        # its own, to keep peak memory down on large inputs.
        del X_train, X_val, y_train, y_val, train_pool, val_pool, y_pred
        gc.collect()

    # ROC curve over all out-of-fold predictions (every training row is
    # scored exactly once by a model that did not train on it).
    fpr, tpr, thresholds = roc_curve(y_input, oof_predictions)
    roc_auc = auc_calc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'OOF ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

    # Logloss learning curve of the last fold's model. 'validation_0' is the
    # train pool passed as eval_set[0]; 'validation_1' is the held-out pool.
    results = model.get_evals_result()
    plt.figure(figsize=(10, 5))
    plt.plot(results['learn']['Logloss'], label='Train Logloss')
    plt.plot(results['validation_0']['Logloss'], label='Train (eval set) Logloss')
    plt.plot(results['validation_1']['Logloss'], label='Validation Logloss')
    plt.xlabel('Iterations')
    plt.ylabel('Logloss')
    plt.title('Learning Curve (last fold)')
    plt.legend()
    plt.grid()
    plt.show()

    # Feature importances of the last fold's model, largest first.
    feature_importances = model.get_feature_importance(type='FeatureImportance')
    feature_names = X_input.columns
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances}).sort_values("Importance",ascending=False)
    plt.figure(figsize=(12, 6))
    plt.title('Feature Importance')
    plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.gca().invert_yaxis()
    plt.show()

    # Report the per-fold AUCs and their mean.
    mean_auc = np.mean(aucs)
    print(aucs)
    print(f'Mean AUC: {mean_auc:.4f}')
    return oof_predictions,oof_test

In [9]:
# Test-set features: everything except the row identifier, as an independent copy.
test_x = df_test.drop(columns=["id"]).copy()

# CatBoost settings forwarded unchanged to CatBoostClassifier.
params = dict(
    iterations=1000,
    learning_rate=0.5,
    depth=10,
    eval_metric='AUC',
    verbose=100,
    early_stopping_rounds=50,
    random_state=42,
    # task_type='GPU',
)

# Run the cross-validated training and collect out-of-fold / test predictions.
oof_train, oof_test = get_out_of_fold_predictions(X, y, test_x, params)

Fold 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.8445460	test1: 0.8447087	best: 0.8447087 (0)	total: 14.6s	remaining: 4h 3m 12s
100:	test: 0.8800886	test1: 0.8790657	best: 0.8790657 (100)	total: 11m 28s	remaining: 1h 42m 7s
200:	test: 0.8821057	test1: 0.8799110	best: 0.8799110 (200)	total: 22m 4s	remaining: 1h 27m 46s
300:	test: 0.8834533	test1: 0.8799893	best: 0.8799901 (299)	total: 33m 35s	remaining: 1h 17m 59s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8800234787
bestIteration = 317

Shrink model to first 318 iterations.
AUC: 0.8800
Fold 2


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.8446078	test1: 0.8447239	best: 0.8447239 (0)	total: 5.68s	remaining: 1h 34m 32s
100:	test: 0.8803359	test1: 0.8790211	best: 0.8790211 (100)	total: 10m 3s	remaining: 1h 29m 31s


CatBoostError: bad allocation

In [None]:
# df_oof_train=pd.DataFrame(oof_train,columns=["y_pred_lgbm"])
# df_oof_train.to_csv(r"y_pred_train_lgbm_0704_lr01.csv")

In [None]:
# Average the per-fold test probabilities into a single prediction per row.
df_oof_test = pd.DataFrame({"y_pred_catd": oof_test.mean(axis=1)})
# Attach the test ids by matching on the row index of both frames
# (inner join, so only index labels present on both sides are kept).
df_oof_test = test[["id"]].join(df_oof_test, how="inner")
df_oof_test.to_csv("y_pred_test_cat_0706.csv", index=False)

# ハイパーパラメータチューニング

In [None]:

# import optuna

# X_optuna=X.copy()
# y_optuna=y.copy()

# # Objective関数の定義
# def objective(trial):
#     params_optuna = {
#         'objective': 'binary',
#         'metric': 'auc',  # AUCを評価指標として設定
#         'boosting_type': 'gbdt',
#         'num_leaves': trial.suggest_int('num_leaves', 10, 100),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
#         "n_estimators":1000,
#         'verbose': -1
#     }
#     n_splits=5
#     aucs = []
#     skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
#     # 各分割のデータインデックスを出力
#     for fold, (train_idx, val_index) in enumerate(skf.split(X_optuna, y_optuna)):
#         print(f"Fold {fold + 1}")
        
#         # データを分割
#         X_train, X_val = X_optuna.iloc[train_idx], X_optuna.iloc[val_index]
#         y_train, y_val = y_optuna.iloc[train_idx], y_optuna.iloc[val_index]
#         train_data = lgb.Dataset(X_train, label=y_train)
#         val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

#         # LGBMClassifierの初期化
#         num_round = 100
#         evals_result = {} 
#         model = LGBMClassifier(**params_optuna, importance_type='gain')
#         # モデルのトレーニング
#         model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_val, y_val)],eval_names=["train","valid"],eval_metric='auc',callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)]) 
#         # モデルのトレーニング
#         y_pred = model.predict_proba(X_val)[:,1]
#         # 精度の計算
#         auc = roc_auc_score(y_val, y_pred)
#         print(f'AUC: {auc:.4f}')
#         aucs.append(auc)
#     return np.mean(aucs)

# # Optunaでハイパーパラメータの最適化を実行
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

# # 最適化されたハイパーパラメータを取得
# best_params = study.best_params