In [13]:
import warnings
warnings.filterwarnings('ignore')

import ast
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMClassifier
import numpy as np

import pandas as pd
from pprint import pprint

import random

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from scipy import stats

import time
from tqdm.notebook import tqdm

# Do not truncate columns when a DataFrame is displayed
pd.options.display.max_columns = None

# Tag for this run; passed to classif_evaluate_models as the results file stem
experiment_name = 'synthetic_original'

In [2]:
# Load the train/test splits and the extra labelled "original" data set.
# Forward slashes are used because they resolve on Windows and POSIX alike,
# whereas a backslash separator is only understood on Windows.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
original = pd.read_csv('data/original_train.csv')

# Displayed as the cell output: (rows, columns) of each frame
train.shape, test.shape, original.shape

((11504798, 12), (7669866, 11), (381109, 12))

In [3]:
# Column groups by feature type, plus the prediction target
TARGET = 'Response'
drop_cols = ['id']

cat_cols = ['Vehicle_Age']
ord_cols = []
bin_cols = [
    'Gender',
    'Driving_License',
    'Previously_Insured',
    'Vehicle_Damage',
]
num_cols = [
    'Age',
    'Region_Code',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
]

In [4]:
# Stack the two labelled frames row-wise and renumber the index from zero
combined_df = pd.concat(
    objs=[train, original],
    ignore_index=True,
)

In [5]:
# Dummy-encode the binary and categorical columns as integers,
# dropping the first level of each encoded column
ohe = pd.get_dummies(
    combined_df,
    columns=[*bin_cols, *cat_cols],
    drop_first=True,
    dtype='int',
)

In [6]:
# Displayed as the cell output: (rows, columns) of the encoded frame
ohe.shape

(11885907, 13)

In [7]:
# Assign X and y
X = ohe.drop(drop_cols + [TARGET], axis=1)
y = ohe[TARGET]

X.shape, y.shape

((11885907, 11), (11885907,))

In [8]:
# Number of cross-validation folds
n_splits = 3

# Shuffled, stratified splitter with a fixed seed
sk3 = StratifiedKFold(
    n_splits,
    shuffle=True,
    random_state=5,
)

In [9]:
# Pipeline: standardise the features, then fit a logistic regression
logistic_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression()),
])

# Attach a display label as an ad-hoc attribute; code that tabulates
# results looks for `name` before falling back to the class name
setattr(logistic_pipeline, 'name', 'Logistic Scale')

In [10]:
# Candidate classifiers to benchmark: gradient boosting and the scaled
# logistic-regression pipeline
classif_models = [
    LGBMClassifier(objective='binary', random_state=5, n_jobs=-1),
    logistic_pipeline,
]

In [11]:
def classif_evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and rank by test ROC AUC.

    Every model is submitted to a thread pool; each task runs ``cross_validate``
    with ``scoring='roc_auc'`` on ``X[features]`` and summarises the fold scores.
    The ranked table is written to ``results/<experiment_name>.csv`` (the
    directory is created when missing) and returned.

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing ``get_params()``. A model is labelled by its ``name``
        attribute when it has one, otherwise by its class name.
    X : pandas.DataFrame
        Feature table; indexed column-wise with each model's feature list.
    y : array-like
        Target, passed straight to ``cross_validate``.
    important_features : dict
        Maps model label -> list of column names. A model whose label is
        missing or maps to an empty list is skipped and reported with zeros.
    cv_split : cross-validation splitter
        Forwarded as ``cv`` to ``cross_validate``.
    experiment_name : str
        Stem of the CSV file written under ``results``.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by 'Model Test ROC AUC' in descending order.
        With no models the frame is empty but still carries the result columns.
    """
    # Local import so this cell stays runnable without touching the imports cell.
    from pathlib import Path

    result_columns = [
        'Model Name',
        'Model Parameters',
        'Model Train ROC AUC',
        'Model Test ROC AUC',
        'Model Test ROC AUC Std',
        'Model Time',
    ]

    def evaluate_model(alg):
        """Return the summary row (a dict keyed by ``result_columns``) for one model."""
        model_name = alg.name if hasattr(alg, 'name') else alg.__class__.__name__
        features = important_features.get(model_name, [])

        # Nothing to fit on: report a zero-valued placeholder row instead.
        if len(features) == 0:
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train ROC AUC': 0,
                'Model Test ROC AUC': 0,
                'Model Test ROC AUC Std': 0,
                'Model Time': "0 min 0.00 sec",
            }

        # NOTE(review): every model is submitted to the pool at once and each
        # cross_validate call also requests n_jobs=-1, so the reported fit
        # times may include contention between models -- confirm if timings matter.
        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='roc_auc',
                                    return_train_score=True,
                                    n_jobs=-1)

        # Mean per-fold fit time, split into whole minutes and leftover seconds.
        minutes, seconds = divmod(cv_results['fit_time'].mean(), 60)

        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train ROC AUC': cv_results['train_score'].mean(),
            'Model Test ROC AUC': cv_results['test_score'].mean(),
            'Model Test ROC AUC Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in tqdm(models, desc='Models')]
        # Collect in submission order; .result() re-raises any worker exception.
        results_list = [future.result()
                        for future in tqdm(futures, total=len(futures), desc='Progress')]

    # Passing the columns explicitly keeps the schema stable even when
    # `models` is empty, so the sort below cannot fail on a missing column.
    model_compare = pd.DataFrame(results_list, columns=result_columns).sort_values(
        by=['Model Test ROC AUC'], ascending=False
    )

    # Build the path from components (no backslash escape inside an f-string)
    # and make sure the target directory exists before writing.
    output_dir = Path('results')
    output_dir.mkdir(parents=True, exist_ok=True)
    model_compare.to_csv(output_dir / f'{experiment_name}.csv', index=False)

    return model_compare

In [12]:
# Baseline: every model is evaluated on the full set of feature columns,
# keyed by its `name` attribute when present, else by its class name
baseline_features_classif = {}

for model in classif_models:
    model_name = getattr(model, 'name', model.__class__.__name__)
    baseline_features_classif[model_name] = X.columns.tolist()

In [14]:
%%time

baseline_models_classif = classif_evaluate_models(classif_models, X, y, baseline_features_classif, sk3, f'{experiment_name}')
baseline_models_classif

Models:   0%|          | 0/2 [00:00<?, ?it/s]

Progress:   0%|          | 0/2 [00:00<?, ?it/s]

Done with Logistic Scale.
Done with LGBMClassifier.
CPU times: total: 4.73 s
Wall time: 5min 11s


Unnamed: 0,Model Name,Model Parameters,Model Train ROC AUC,Model Test ROC AUC,Model Test ROC AUC Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.87505,0.874902,0.000185,3 min 8.20 sec
1,Logistic Scale,"{'memory': None, 'steps': [('scaler', Standard...",0.841155,0.841152,0.00015,1 min 47.60 sec
