## Load the path

In [3]:
import os


class Config:
    def __init__(self):
        """Dataset
        Load and process the data.
        
        """
        self.train_path: str = os.path.join(os.getcwd(), "contents", "train.csv")
        self.test_path: str = os.path.join(os.getcwd(), "contents", "test.csv")
        self.submission_path: str = os.path.join(os.getcwd(), "contents", "sample_submission.csv")

## Preprocessing data

In [4]:
import pandas as pd


class Dataset:
    def __init__(self, path):
        """Dataset
        
        Arg:
            path (str): Data path
        """
        print('Load the data...')
        # read the data
        self.df = pd.read_csv(path)
    
    def head(self, n):
        """Return the first n rows of the DataFrame"""
        return self.df.head(n)

    def preprocessing(self):
        """Preprocess the data
        
        Returns:
            Dataframe: Preprocessed data
        """
        print('Preprocess the data')
        # str to integer; not category
        # TODO: we need to use category
        preprocessed_data = self.df.copy()
        preprocessed_data['Gender'] = preprocessed_data['Gender'].replace(['Female', 'Male'], [0, 1])
        preprocessed_data['Vehicle_Age'] = preprocessed_data['Vehicle_Age'].replace(['< 1 Year', '1-2 Year', '> 2 Years'], [0, 1, 2])
        preprocessed_data['Vehicle_Damage'] = preprocessed_data['Vehicle_Damage'].replace(['No', 'Yes'], [0, 1])

        # data quantization
        try:
            preprocessed_data[['Gender', 'Age', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Response', 'Policy_Sales_Channel']] = preprocessed_data[['Gender', 'Age', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Response', 'Policy_Sales_Channel']].astype('uint8')
        except:
            preprocessed_data[['Gender', 'Age', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel']] = preprocessed_data[['Gender', 'Age', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel']].astype('uint8')

        preprocessed_data[['Vintage', 'Annual_Premium']] = preprocessed_data[['Vintage', 'Annual_Premium']].astype('uint16')
        preprocessed_data['Region_Code'] = preprocessed_data['Region_Code'] * 10
        preprocessed_data['Region_Code'] = preprocessed_data['Region_Code'].astype('uint8')
        print('Finished!')
        
        return preprocessed_data
    
    def preprocessing_category(self):
        """Preprocess the data
        
        Returns:
            Dataframe: Preprocessed data
        """
        print('Preprocess the data')
        preprocessed_data = self.df.copy()

        # Define categorical columns
        categorical_columns = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

        # Convert the columns to categorical type
        for col in categorical_columns:
            preprocessed_data[col] = preprocessed_data[col].astype('category')

        # Convert numeric columns to appropriate types
        preprocessed_data[['Vintage', 'Annual_Premium']] = preprocessed_data[['Vintage', 'Annual_Premium']].astype('uint16')
        
        # Scaling region code by 10 and converting to uint8
        preprocessed_data['Region_Code'] = (preprocessed_data['Region_Code'] * 10).astype('uint8')

        print('Finished!')
        return preprocessed_data, categorical_columns

## Load train data

In [6]:
# Load the data path
config = Config()

# Preprocessing the data
train_data = Dataset(config.train_path)
train_df = train_data.preprocessing()

Load the data...
Preprocess the data
Finished!


## Print data

In [None]:
train_df.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,1,21,1,94,0,1,1,65101,124,187,0
1,1,1,43,1,24,0,2,1,58911,26,288,1
2,2,0,25,1,140,1,0,0,38043,152,254,0
3,3,0,35,1,10,0,1,1,2630,156,76,0
4,4,0,36,1,150,1,1,0,31951,152,294,0


## Split data

In [None]:
from sklearn.model_selection import train_test_split
train_df = train_df.drop('id', axis=1)

y = train_df['Response']
X = train_df.drop('Response', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

## Train the LightGBM

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

lgbm_model = LGBMClassifier(random_state=41, n_estimators=500)

evals = [(X_test, y_test)]

# 모델 학습
lgbm_model.fit(X_train, 
               y_train,                  
            #    early_stopping_rounds = 100, 
               eval_metric = "logloss", 
               eval_set = evals,
               # random_state=1, 
               verbose=True)


y_pred = lgbm_model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(f"roc auc value: {auc}")



[1]	valid_0's binary_logloss: 0.354153
[2]	valid_0's binary_logloss: 0.340275
[3]	valid_0's binary_logloss: 0.329157
[4]	valid_0's binary_logloss: 0.319923
[5]	valid_0's binary_logloss: 0.312375
[6]	valid_0's binary_logloss: 0.305916
[7]	valid_0's binary_logloss: 0.300472
[8]	valid_0's binary_logloss: 0.295756
[9]	valid_0's binary_logloss: 0.291669
[10]	valid_0's binary_logloss: 0.288058
[11]	valid_0's binary_logloss: 0.284902
[12]	valid_0's binary_logloss: 0.282047
[13]	valid_0's binary_logloss: 0.279649
[14]	valid_0's binary_logloss: 0.277573
[15]	valid_0's binary_logloss: 0.275476
[16]	valid_0's binary_logloss: 0.273849
[17]	valid_0's binary_logloss: 0.272207
[18]	valid_0's binary_logloss: 0.270874
[19]	valid_0's binary_logloss: 0.269638
[20]	valid_0's binary_logloss: 0.268453
[21]	valid_0's binary_logloss: 0.267493
[22]	valid_0's binary_logloss: 0.266502
[23]	valid_0's binary_logloss: 0.265674
[24]	valid_0's binary_logloss: 0.26494
[25]	valid_0's binary_logloss: 0.264165
[26]	valid

## Save the model

In [None]:
import joblib

joblib.dump(lgbm_model, 'models/lgbm_model.pkl') 

['models/lgbm_model.pkl']

In [7]:
import joblib

lgbm_model = joblib.load('models/lgbm_model.pkl') 

## Train the Xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Define the XGBoost classifier model
xgboost_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=16,
    learning_rate=0.01,
    random_state=41, 
    objective='binary:logistic',
)

# Prepare the evaluation metric and monitoring set
eval_set = [(X_train, y_train), (X_test, y_test)]

# Fit the model with evaluation metrics
xgboost_model.fit(
    X_train, 
    y_train, 
    eval_set=eval_set,
    # eval_metric="logloss",  # Use a single string instead of a list
    verbose=True  # Prints evaluation results after every boosting round
)

# Evaluate the model on the test data (optional)
y_pred = xgboost_model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(f"roc auc value: {auc}")

[0]	validation_0-logloss:0.38378	validation_1-logloss:0.38325
[1]	validation_0-logloss:0.38202	validation_1-logloss:0.38153
[2]	validation_0-logloss:0.38030	validation_1-logloss:0.37985
[3]	validation_0-logloss:0.37860	validation_1-logloss:0.37819
[4]	validation_0-logloss:0.37694	validation_1-logloss:0.37657
[5]	validation_0-logloss:0.37530	validation_1-logloss:0.37497
[6]	validation_0-logloss:0.37370	validation_1-logloss:0.37340
[7]	validation_0-logloss:0.37212	validation_1-logloss:0.37186
[8]	validation_0-logloss:0.37056	validation_1-logloss:0.37034
[9]	validation_0-logloss:0.36903	validation_1-logloss:0.36885
[10]	validation_0-logloss:0.36753	validation_1-logloss:0.36738
[11]	validation_0-logloss:0.36605	validation_1-logloss:0.36594
[12]	validation_0-logloss:0.36460	validation_1-logloss:0.36452
[13]	validation_0-logloss:0.36317	validation_1-logloss:0.36313
[14]	validation_0-logloss:0.36176	validation_1-logloss:0.36175
[15]	validation_0-logloss:0.36038	validation_1-logloss:0.36040
[1

## Save the xgboost model

In [None]:
import joblib

joblib.dump(xgboost_model, 'models/xgboost_model.pkl') 

['models/xgboost_model.pkl']

In [8]:
import joblib

xgboost_model = joblib.load('models/xgboost_model.pkl') 

## Train the Catboost

In [9]:
# from sklearn.model_selection import train_test_split

# # Preprocessing the data
# train_data = Dataset(config.train_path)
# train_df_cate, categorical_columns = train_data.preprocessing_category()

# train_df_cate = train_df_cate.drop('id', axis=1)

# y = train_df_cate['Response']
# X = train_df_cate.drop('Response', axis=1)

# X_train_cate, X_test_cate, y_train_cate, y_test_cate = train_test_split(X, y, test_size=0.2, random_state=41)

Load the data...
Preprocess the data
Finished!


: 

In [10]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# cat_features = [train_df_cate.columns.get_loc(col) for col in categorical_columns]

catboost_model = CatBoostClassifier(iterations=500,
                        #    cat_features=cat_features,
                           depth=16,
                           learning_rate=0.01,
                           loss_function='Logloss',
                           random_state=41, 
                           verbose=True)

catboost_model.fit(X_train, y_train)

# Evaluate the model on the test data (optional)
y_pred = catboost_model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(f"roc auc value: {auc}")

0:	learn: 0.6717141	total: 4.91s	remaining: 40m 49s
1:	learn: 0.6514689	total: 7.1s	remaining: 29m 28s
2:	learn: 0.6320136	total: 11.4s	remaining: 31m 31s
3:	learn: 0.6136876	total: 15.7s	remaining: 32m 24s
4:	learn: 0.5962660	total: 20.4s	remaining: 33m 38s
5:	learn: 0.5797863	total: 22.6s	remaining: 31m 4s
6:	learn: 0.5640910	total: 27.2s	remaining: 31m 57s
7:	learn: 0.5491933	total: 31.4s	remaining: 32m 10s
8:	learn: 0.5350538	total: 35.5s	remaining: 32m 17s
9:	learn: 0.5216996	total: 39.7s	remaining: 32m 25s
10:	learn: 0.5091697	total: 43.9s	remaining: 32m 31s
11:	learn: 0.4973134	total: 48.3s	remaining: 32m 44s


## Save the catboost model

In [None]:
import joblib

joblib.dump(catboost_model, 'models/catboost_model.pkl') 

['models/catboost_model.pkl']

In [None]:
import joblib

catboost_model = joblib.load('models/catboost_model.pkl') 

## Probability

In [None]:
lgbm_proba = pd.Series(lgbm_model.predict_proba(X_test)[:, 1], name='lgbm')
xgboost_proba = pd.Series(xgboost_model.predict_proba(X_test)[:, 1], name='xgboost')
catboost_proba = pd.Series(catboost_model.predict_proba(X_test_cate)[:, 1], name='catboost')

result = pd.concat([lgbm_proba, catboost_proba, xgboost_proba], axis=1)
result.head(5)

Unnamed: 0,lgbm,catboost,xgboost
0,0.172179,0.369445,0.15601
1,0.000149,0.302822,0.001345
2,0.000157,0.302817,0.001345
3,0.326306,0.42629,0.42377
4,0.000166,0.302821,0.001345


## Train the Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler

# Standardizing the features
scaler = RobustScaler() # Premium, Age만 사용 나머지는 Standard Scaler 로 진행하기.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the neural network model
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(3,)),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

# Fit the model
nn_model.fit(result, y_test, epochs=10, batch_size=128, validation_data=(result, y_test))

# Evaluate the model on the test data
y_pred = nn_model.predict(result)

accuracy = roc_auc_score(y_test, y_pred)
print(f"roc auc value: {auc}")

2024-07-26 18:37:29.100051: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10


2024-07-26 18:37:33.264798: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-26 18:37:33.498828: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
roc auc value: 0.5


In [None]:
y_pred.shape

(2300960, 1)

## Load test data

In [None]:
# Preprocessing the data
test_data = Dataset(config.test_path)
test_df = test_data.preprocessing()
test_df = test_df.drop('id', axis=1)

Load the data...
Preprocess the data
Finished!


In [None]:
test_data.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,11504798,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,11504799,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,11504800,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,11504801,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,11504802,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


## Make submission.csv

In [None]:

lgbm_proba = pd.Series(lgbm_model.predict_proba(test_df)[:, 1], name='lgbm')
catboost_proba = pd.Series(catboost_model.predict_proba(test_df)[:, 1], name='catboost')
xgboost_proba = pd.Series(xgboost_model.predict_proba(test_df)[:, 1], name='xgboost')

print('Finished tree model inference!')

result = pd.concat([lgbm_proba, catboost_proba, xgboost_proba], axis=1)
y_pred = nn_model.predict(result)

Finished tree model inference!


In [None]:
id_ = test_data.preprocessing()
submission = pd.DataFrame({'id': id_['id'], 'Response': y_pred[:, 0]})

submission.to_csv('submission.csv', index=False)
print('Predictions saved to submission.csv')

Preprocess the data
Finished!
Predictions saved to submission.csv
