<a href="https://colab.research.google.com/github/jihwanK/practice/blob/master/AIML/kaggle/playground/s4e7/kaggle_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
# Write the Kaggle API credential file without embedding secrets in the notebook.
# KAGGLE_USERNAME and KAGGLE_KEY must already be present in the environment
# (for example, assigned to os.environ from a secrets store in an earlier cell).
# Any key that was ever saved inside this notebook should be regenerated on Kaggle.
set -euo pipefail

: "${KAGGLE_USERNAME:?KAGGLE_USERNAME is not set}"
: "${KAGGLE_KEY:?KAGGLE_KEY is not set}"

# -p keeps the cell re-runnable when the directory already exists.
mkdir -p /root/.kaggle

# Restrictive umask so the key file is created unreadable by other users.
umask 077
printf '{"username":"%s","key":"%s"}\n' "$KAGGLE_USERNAME" "$KAGGLE_KEY" > /root/.kaggle/kaggle.json
chmod 600 /root/.kaggle/kaggle.json

# Preprocess

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV
from imblearn.over_sampling import SMOTE

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

# import cupy as cp
import xgboost as xgb

In [None]:
# Directory holding the competition CSVs; change it here if the files move.
DATA_DIR = "/content/drive/MyDrive/Colab/practice/dataset/kaggle"

# Training data: every column except the row identifier.
train_df = pd.read_csv(f"{DATA_DIR}/s4e7_train.csv", usecols=lambda col: col != "id")

# Parse the test file once, then separate the identifiers (needed for the
# submission files) from the feature columns.
test_df = pd.read_csv(f"{DATA_DIR}/s4e7_test.csv")
test_id = test_df[["id"]]
X_test = test_df.drop(columns="id")
del test_df  # the combined frame is no longer needed; release its memory

In [None]:
# Separate the target column from the predictor columns.
y = train_df["Response"]
X = train_df.drop(columns="Response")

In [None]:
# Integer codes assigned to each Vehicle_Age label.
VEHICLE_AGE_ORDER = {"< 1 Year": 0, "1-2 Year": 1, "> 2 Years": 2}


def encode_categoricals(df):
    """Encode the string-valued columns of ``df`` as integers, in place.

    - ``Vehicle_Damage``: 1 where the value is "Yes", 0 otherwise.
    - ``Gender``: 1 where the value is "Male", 0 otherwise.
    - ``Vehicle_Age``: mapped through ``VEHICLE_AGE_ORDER``; missing values
      stay missing.

    Args:
        df: DataFrame containing the three columns above.

    Returns:
        The same DataFrame object, modified in place (no copy is made).

    Raises:
        ValueError: if ``Vehicle_Age`` contains a non-null value that is not a
            key of ``VEHICLE_AGE_ORDER``. This also fires when the cell is run
            a second time on already-encoded data, instead of silently
            overwriting the columns with NaN / zeros.
    """
    # Validate before touching any column so a failure leaves ``df`` unchanged.
    vehicle_age = df["Vehicle_Age"].map(VEHICLE_AGE_ORDER)
    unmapped = df.loc[vehicle_age.isna() & df["Vehicle_Age"].notna(), "Vehicle_Age"]
    if not unmapped.empty:
        raise ValueError(
            f"Unexpected Vehicle_Age values: {sorted(unmapped.unique().tolist(), key=str)}"
        )

    # Vectorized comparisons instead of a Python-level lambda per row.
    df["Vehicle_Damage"] = df["Vehicle_Damage"].eq("Yes").astype(int)
    df["Gender"] = df["Gender"].eq("Male").astype(int)
    df["Vehicle_Age"] = vehicle_age
    return df


# Train and test features go through the identical encoding.
X = encode_categoricals(X)
X_test = encode_categoricals(X_test)

In [None]:
# Columns that are standardized before modelling.
numerical_features = [
    "Age",
    "Annual_Premium",
    "Vintage",
]

# Columns treated as categorical (kept for reference).
categorical_features = [
    "Gender",
    "Vehicle_Age",
    "Vehicle_Damage",
    "Region_Code",
    "Policy_Sales_Channel",
]

In [None]:
# Hold out a fixed-size validation set first, so the scaler below never sees
# validation rows (fitting it on all of X would leak validation statistics).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, shuffle=True, test_size=30000, random_state=42
)

# Fit the standardization statistics on the training rows only, then apply the
# same transform to the validation and test features. X itself is left unscaled.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Balanced Random Forest

In [None]:
# Candidate values for a hyper-parameter search; not consumed by the fit below.
hyperparameters = {
    'sampling_strategy': ["auto", "all"],
    'replacement': [False, True],
    'bootstrap': [True, False],
}

# Seeded so the validation metrics and the submission are reproducible
# (the train/validation split above uses the same seed).
model = BalancedRandomForestClassifier(n_estimators=300, n_jobs=4, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out validation rows: hard labels for F1 and the
# confusion matrix, positive-class probabilities for ROC-AUC.
print("\n[Validation Result]")
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

print(f"F1 score: {f1_score(y_val, y_pred)}")
print(f"ROC-AUC score: {roc_auc_score(y_val, y_prob)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}")

# Submission file: test ids alongside the predicted positive-class probability.
test_proba = model.predict_proba(X_test)[:, 1]
result_df = pd.concat([test_id, pd.DataFrame(test_proba, columns=["Response"])], axis=1)
result_df.to_csv("b_random_forest.csv", index=False)


# XGBoost

In [13]:
# Gradient-boosted trees with the positive class up-weighted.
xgb_params = dict(
    device='gpu',
    n_estimators=1000,
    learning_rate=0.01,
    scale_pos_weight=7,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Predictions on the held-out validation rows.
print("\n[Validation Result]")
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

# Label-based metrics use y_pred; ROC-AUC uses the positive-class probability.
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

scalar_metrics = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("ROC-AUC", roc_auc),
    ("F1 Score", f1),
)
for label, value in scalar_metrics:
    print(f'{label}: {value:.5f}')
print(f'Confusion Matrix:\n {conf_matrix}')
print(f'Classification Report:\n {class_report} \n')

# Submission file: test ids alongside the predicted positive-class probability.
test_proba = model.predict_proba(X_test)[:, 1]
response_col = pd.DataFrame({"Response": test_proba})
result_df = pd.concat([test_id, response_col], axis=1)
result_df.to_csv("xgb-est3k-lr05-wght.csv", index=False)



[Validation Result]
Accuracy: 0.7265
Precision: 0.2959
ROC-AUC: 0.8780
F1 Score: 0.4476
Confusion Matrix:
 [[18470  7908]
 [  298  3324]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.70      0.82     26378
           1       0.30      0.92      0.45      3622

    accuracy                           0.73     30000
   macro avg       0.64      0.81      0.63     30000
weighted avg       0.90      0.73      0.77     30000
 



# CatBoost

In [None]:
# Imported in this cell because the import in the shared import cell is
# commented out; keeping it here means a missing catboost installation only
# affects this section.
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    iterations=10_000,
    early_stopping_rounds=100,
    learning_rate=0.1,
    depth=7,
    verbose=500,
    eval_metric="AUC",
    task_type="GPU",
    devices='0',
)

# early_stopping_rounds needs a validation set to monitor; without eval_set
# there is nothing for it to act on. Note that the validation rows now also
# select the stopping iteration, so the scores printed below are slightly
# optimistic.
model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
)

# Predictions on the held-out validation rows.
print("\n[Validation Result]")
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_prob)
precision = precision_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f'Accuracy: {accuracy:.5f}')
print(f'Precision: {precision:.5f}')
print(f'ROC-AUC: {roc_auc:.5f}')
print(f'F1 Score: {f1:.5f}')
print(f'Confusion Matrix:\n {conf_matrix}')
print(f'Classification Report:\n {class_report} \n')

# Submission file: test ids alongside the predicted positive-class probability.
# Written under its own name so it does not overwrite the XGBoost submission.
test_proba = model.predict_proba(X_test)[:, 1]
result_df = pd.concat([test_id, pd.DataFrame(test_proba, columns=["Response"])], axis=1)
result_df.to_csv("catboost-lr01-depth7.csv", index=False)
