In [1]:
import warnings
warnings.filterwarnings('ignore')

import ast
from concurrent.futures import ThreadPoolExecutor
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
import numpy as np

import pandas as pd
from pprint import pprint

import random

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from scipy import stats

import time
from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)

experiment_name = 'baseline'

In [2]:
train = pd.read_csv(r'data\train.csv')
test = pd.read_csv(r'data\test.csv')
# original = pd.read_csv(r'data\original_train.csv')

train.shape, test.shape

((11504798, 12), (7669866, 11))

In [3]:
# Define the different feature types and target
cat_cols = ['Vehicle_Age']
num_cols = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
ord_cols = []
bin_cols = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage']
drop_cols = ['id']
TARGET = 'Response'

In [4]:
ohe = pd.get_dummies(train, columns= bin_cols+cat_cols, drop_first=True, dtype='int')
ohe_test = pd.get_dummies(test, columns= bin_cols+cat_cols, drop_first=True, dtype='int')

In [5]:
ohe.shape, ohe_test.shape

((11504798, 13), (7669866, 12))

In [21]:
# Get train and test data
sss_data = ohe.copy()

sss_data['combined'] = sss_data['Vehicle_Damage_Yes'] + \
                        sss_data['Policy_Sales_Channel'] + \
                        sss_data[TARGET]

# Initialize ShuffleSplit
sss = StratifiedShuffleSplit(test_size=0.1, random_state=5)

# Get indices for the split
# Stratification is done on target variable
for train_index, test_index in sss.split(sss_data, sss_data['combined']):
    test_data = ohe.iloc[test_index]
    train_data = ohe.iloc[train_index]

train_data.shape, test_data.shape

# 1 minute

((10354318, 13), (1150480, 13))

In [22]:
# Assign X and y
X_complete = ohe.drop(drop_cols + [TARGET], axis=1)
y_complete = ohe[TARGET]

X = train_data.drop(drop_cols + [TARGET], axis=1)
y = train_data[TARGET]

val_X = test_data.drop(drop_cols + [TARGET], axis=1)
val_y = test_data[TARGET]

test_X = ohe_test.drop(drop_cols, axis=1)

X_complete.shape, y_complete.shape, X.shape, y.shape, val_X.shape, val_y.shape, test_X.shape

((11504798, 11),
 (11504798,),
 (10354318, 11),
 (10354318,),
 (1150480, 11),
 (1150480,),
 (7669866, 11))

In [8]:
# # Assign X and y
# X = ohe.drop(drop_cols + [TARGET], axis=1)
# y = ohe[TARGET]

# test_X = ohe_test.drop(drop_cols, axis=1)

# X.shape, y.shape, test_X.shape

In [26]:
smote = SMOTE(sampling_strategy='auto', random_state=5)

X_resampled, y_resampled = smote.fit_resample(val_X, val_y)
pd.Series(y_resampled).value_counts()

# 90 seconds

Response
1    1008995
0    1008995
Name: count, dtype: int64

In [10]:
# smote_comp = SMOTE(sampling_strategy='auto', random_state=5)

# X_resampled_comp, y_resampled_comp = smote_comp.fit_resample(X_complete, y_complete)
# pd.Series(y_resampled_comp).value_counts()

# # 90 seconds

In [11]:
n_splits = 3

sk3 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [12]:
# class_weights = {0: 0.5701236672227101, 1: 4.065130146516859}
class_weights = {0: 0.8770027079136896, 1: 0.12299729208631043}

In [13]:
# Define pipelines

logistic_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression()),
])

# Manually set pipeline names
logistic_pipeline.name = 'Logistic Scale'

In [14]:
# Define classification models
classif_models = [
    # LGBMClassifier(n_jobs=-1, random_state=5, objective='binary', class_weight=class_weights),
    LGBMClassifier(random_state=5, objective='binary'),
    # LogisticRegression(),
    # logistic_pipeline,
]

In [15]:
def classif_evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    Model_compare = pd.DataFrame(columns=['Model Name', 
                                        'Model Parameters', 
                                        'Model Train ROC AUC', 
                                        'Model Test ROC AUC', 
                                        'Model Test ROC AUC Std', 
                                        'Model Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            model_name = alg.name
        else:
            model_name = alg.__class__.__name__
        features = important_features.get(model_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train ROC AUC': 0,
                'Model Test ROC AUC': 0,
                'Model Test ROC AUC Std': 0,
                'Model Time': "0 min 0.00 sec",
            }
        
        cv_results = cross_validate(alg, 
                                    X[features], 
                                    y, cv=cv_split, 
                                    scoring='roc_auc', 
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train ROC AUC': cv_results['train_score'].mean(),
            'Model Test ROC AUC': cv_results['test_score'].mean(),
            'Model Test ROC AUC Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(tqdm(models, desc='Models'))]
        for future in tqdm(futures, total=len(futures), desc='Progress'):
            result = future.result()
            results_list.append(result)

    model_compare = pd.DataFrame(results_list)

    model_compare.sort_values(by=['Model Test ROC AUC'], ascending=False, inplace=True)
    model_compare.to_csv(f'results\{experiment_name}.csv', index=False)

    return model_compare

In [27]:
baseline_features_classif = {}

for model in classif_models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_features_classif[model_name] = list(X_resampled.columns)

In [28]:
%%time

baseline_models_classif = classif_evaluate_models(classif_models, X_resampled, y_resampled, baseline_features_classif, sk3, f'{experiment_name}')
baseline_models_classif

# 7 minutes

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with LGBMClassifier.
CPU times: total: 281 ms
Wall time: 42.7 s


Unnamed: 0,Model Name,Model Parameters,Model Train ROC AUC,Model Test ROC AUC,Model Test ROC AUC Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.955609,0.955435,7.3e-05,0 min 18.01 sec


### LGBM Validation

*Proven to be close to the PL on Kaggle*

In [29]:
model = LGBMClassifier(random_state=5, objective='binary', metric='auc')

model.fit(X_resampled, y_resampled)

In [30]:
val_pred = model.predict_proba(X)[:, 1]
val_score = roc_auc_score(y, val_pred)
val_score

# 0.8748547824131868

0.8509822665146346

### LGBM Submission Prediction

In [None]:
model = LGBMClassifier(random_state=5, objective='binary')

model.fit(X_complete, y_complete)

In [None]:
# pred = model.predict(test_X)
pred = model.predict_proba(test_X)[:, 1]
pred_df = pd.DataFrame(pred, columns=[TARGET])

In [None]:
pred_df.head()

In [None]:
submission = pd.concat([test['id'], pred_df[TARGET]], axis=1)

In [None]:
submission.tail()

In [None]:
# Check winning_route.txt for what the result steps are
submission.to_csv(r'submissions/result_1_b.csv', index=False)