In [1]:
import warnings
warnings.filterwarnings('ignore')

import ast
from autogluon.tabular import TabularPredictor
from category_encoders import TargetEncoder
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMClassifier
import numpy as np

import pandas as pd
from pprint import pprint

import random

from sklearn.feature_selection import mutual_info_classif, RFECV
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

from scipy import stats

import time
from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)

experiment_name = 'baseline'

In [2]:
train = pd.read_csv(r'data\train.csv')
test = pd.read_csv(r'data\test.csv')

train.shape, test.shape

((11504798, 12), (7669866, 11))

In [3]:
train.sample(3)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
4428844,4428844,Male,38,1,17.0,0,1-2 Year,Yes,24573.0,124.0,117,1
7344254,7344254,Female,38,1,37.0,0,1-2 Year,Yes,2630.0,157.0,260,0
4271212,4271212,Male,55,1,28.0,1,1-2 Year,No,39711.0,26.0,78,0


In [4]:
# Define the different feature types and target
cat_cols = ['Vehicle_Age']
num_cols = ['Age', 'Annual_Premium', 'Vintage']
ord_cols = []
bin_cols = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Region_Code', 'Policy_Sales_Channel']
needs_dummies = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
drop_cols = ['id']
TARGET = 'Response'

In [5]:
ohe = pd.get_dummies(train, columns=needs_dummies, drop_first=True, dtype='int')
ohe_test = pd.get_dummies(test, columns=needs_dummies, drop_first=True, dtype='int')
ohe.head()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Gender_Male,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes
0,0,21,1,35.0,0,65101.0,124.0,187,0,1,0,0,1
1,1,43,1,28.0,0,58911.0,26.0,288,1,1,0,1,1
2,2,25,1,14.0,1,38043.0,152.0,254,0,0,1,0,0
3,3,35,1,1.0,0,2630.0,156.0,76,0,0,0,0,1
4,4,36,1,15.0,1,31951.0,152.0,294,0,0,0,0,0


In [6]:
# Drop the features not required
ohe_drop = ohe.drop(drop_cols, axis=1)
ohe_drop_test = ohe_test.drop(drop_cols, axis=1)

In [7]:
ohe_drop.shape, ohe_drop_test.shape

((11504798, 12), (7669866, 11))

In [8]:
ohe_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Age                    int64  
 1   Driving_License        int64  
 2   Region_Code            float64
 3   Previously_Insured     int64  
 4   Annual_Premium         float64
 5   Policy_Sales_Channel   float64
 6   Vintage                int64  
 7   Response               int64  
 8   Gender_Male            int32  
 9   Vehicle_Age_< 1 Year   int32  
 10  Vehicle_Age_> 2 Years  int32  
 11  Vehicle_Damage_Yes     int32  
dtypes: float64(3), int32(4), int64(5)
memory usage: 877.7 MB


In [9]:
# Initialize ShuffleSplit
# Get 5% of the data as the training data and rest as test
sss = StratifiedShuffleSplit(test_size=0.05, random_state=5)

# Get indices for the split
# Stratification is done on target variable
for train_index, test_index in sss.split(ohe_drop, ohe_drop[TARGET]):
    train_data = ohe_drop.iloc[test_index]
    test_data = ohe_drop.iloc[train_index]

train_data.shape, test_data.shape

# 1 minute

((575240, 12), (10929558, 12))

In [10]:
# Assign X and y
X = train_data.drop(TARGET, axis=1)
y = train_data[TARGET]

val_X = test_data.drop(TARGET, axis=1)
val_y = test_data[TARGET]

test_X = ohe_drop_test.copy()

X.shape, y.shape, val_X.shape, val_y.shape, test_X.shape

((575240, 11), (575240,), (10929558, 11), (10929558,), (7669866, 11))

In [11]:
n_splits = 10

sk10 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [12]:
# Define classification models
classif_models = [
    LGBMClassifier(n_jobs=-1, random_state=5, objective='binary', metric='auc'),
]

In [13]:
def classif_evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    Model_compare = pd.DataFrame(columns=['Model Name', 
                                        'Model Parameters', 
                                        'Model Train ROC AUC', 
                                        'Model Test ROC AUC', 
                                        'Model Test ROC AUC Std', 
                                        'Model Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            model_name = alg.name
        else:
            model_name = alg.__class__.__name__
        features = important_features.get(model_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train ROC AUC': 0,
                'Model Test ROC AUC': 0,
                'Model Test ROC AUC Std': 0,
                'Model Time': "0 min 0.00 sec",
            }
        
        cv_results = cross_validate(alg, 
                                    X[features], 
                                    y, cv=cv_split, 
                                    scoring='roc_auc', 
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train ROC AUC': cv_results['train_score'].mean(),
            'Model Test ROC AUC': cv_results['test_score'].mean(),
            'Model Test ROC AUC Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(tqdm(models, desc='Models'))]
        for future in tqdm(futures, total=len(futures), desc='Progress'):
            result = future.result()
            results_list.append(result)

    model_compare = pd.DataFrame(results_list)

    model_compare.sort_values(by=['Model Test ROC AUC'], ascending=False, inplace=True)
    model_compare.to_csv(f'results\{experiment_name}.csv', index=False)

    return model_compare

In [14]:
baseline_features_classif = {}

for model in classif_models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_features_classif[model_name] = list(X.columns)

In [15]:
%%time

baseline_models_classif = classif_evaluate_models(classif_models, X, y, baseline_features_classif, sk10, f'{experiment_name}')
baseline_models_classif

# 1 minute

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with LGBMClassifier.
CPU times: total: 281 ms
Wall time: 40.1 s


Unnamed: 0,Model Name,Model Parameters,Model Train ROC AUC,Model Test ROC AUC,Model Test ROC AUC Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.875302,0.872437,0.001456,0 min 13.13 sec


### LGBM Validation

*Proven to be close to the PL on Kaggle*

In [16]:
model = LGBMClassifier(n_jobs=-1, random_state=5, objective='binary', metric='auc')

model.fit(X, y)

In [17]:
val_pred = model.predict_proba(val_X)[:, 1]
val_score = roc_auc_score(val_y, val_pred)
val_score

0.8721405023726595

### LGBM Submission Prediction

In [18]:
model = LGBMClassifier(n_jobs=-1, random_state=5, objective='binary', metric='auc')

model.fit(ohe_drop.drop(TARGET, axis=1), ohe_drop[TARGET])

In [19]:
pred = model.predict_proba(test_X)[:, 1]
pred_df = pd.DataFrame(pred, columns=[TARGET])

In [20]:
pred_df.head()

Unnamed: 0,Response
0,0.01407
1,0.355024
2,0.246344
3,0.00022
4,0.031651


In [21]:
submission = pd.concat([test['id'], pred_df[TARGET]], axis=1)

In [22]:
submission.tail()

Unnamed: 0,id,Response
7669861,19174659,0.201896
7669862,19174660,0.000265
7669863,19174661,0.00045
7669864,19174662,0.549367
7669865,19174663,0.000234


In [23]:
# Check winning_route.txt for what the result steps are
submission.to_csv(r'submissions/experiment_1_lgbm.csv', index=False)

### Autogluon

In [31]:
# Define the different feature types and target
cat_cols = ['Vehicle_Age']
num_cols = ['Age', 'Annual_Premium', 'Vintage']
ord_cols = []
bin_cols = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Region_Code', 'Policy_Sales_Channel']
needs_dummies = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
drop_cols = ['id']
TARGET = 'Response'

In [32]:
# Initialize ShuffleSplit
# Get 5% of the data as the training data and rest as test
sss = StratifiedShuffleSplit(test_size=0.05, random_state=5)

# Get indices for the split
# Stratification is done on target variable
for train_index, test_index in sss.split(train, train[TARGET]):
    autogluon_train_data = train.iloc[test_index]
    # test_data = ohe_drop.iloc[train_index]

train_data.shape
# 1 minute

(575240, 12)

In [37]:
# Drop Unneeded column
autogluon_train_data = autogluon_train_data.drop(drop_cols, axis=1)
autogluon_train_data.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
2698469,Male,53,1,28.0,1,1-2 Year,No,43134.0,26.0,107,0
7618126,Male,52,1,28.0,0,1-2 Year,Yes,44346.0,26.0,258,0
10678,Male,22,1,28.0,0,< 1 Year,Yes,27173.0,152.0,271,0
10177923,Female,42,1,28.0,0,1-2 Year,Yes,52740.0,26.0,98,0
5137129,Male,27,1,6.0,1,< 1 Year,No,26127.0,152.0,254,0


In [38]:
# Force convert to categorical
for col in cat_cols + bin_cols:
    autogluon_train_data[col] = autogluon_train_data[col].astype('category')

In [39]:
autogluon_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 575240 entries, 2698469 to 4633953
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Gender                575240 non-null  category
 1   Age                   575240 non-null  int64   
 2   Driving_License       575240 non-null  category
 3   Region_Code           575240 non-null  category
 4   Previously_Insured    575240 non-null  category
 5   Vehicle_Age           575240 non-null  category
 6   Vehicle_Damage        575240 non-null  category
 7   Annual_Premium        575240 non-null  float64 
 8   Policy_Sales_Channel  575240 non-null  category
 9   Vintage               575240 non-null  int64   
 10  Response              575240 non-null  int64   
dtypes: category(7), float64(1), int64(3)
memory usage: 26.3 MB


In [40]:
TIME_LIMIT = 60

def delete_autogluon_file():
    directory = 'AutogluonModels'
    filelist = [f for f in os.listdir(directory)]
    for file in filelist:
        file_path = os.path.join(directory, file)
        # Check if the file is a regular file
        if os.path.isfile(file_path):
            # Delete the file
            os.remove(file_path)
        # Check if the file is a directory
        elif os.path.isdir(file_path):
            # Delete the directory and its contents recursively
            shutil.rmtree(file_path)

In [25]:
missing_columns = set(X.columns) - set(test_X.columns)
missing_columns

set()

In [27]:
ohe_drop.shape, ohe_drop_test.shape, train_data.shape, test_data.shape

((11504798, 12), (7669866, 11), (575240, 12), (10929558, 12))

In [41]:
autogluon_roc_scores = []

for fold, (train_index, test_index) in enumerate(sk10.split(X, y)):
    # Split the dataset into train and test sets
    auto_train_data = autogluon_train_data.iloc[train_index]
    auto_test_data = autogluon_train_data.iloc[test_index]

    # Print the shapes of train and test data for debugging
    print(f'Fold {fold + 1} - Train data shape: {auto_train_data.shape}, Test data shape: {auto_test_data.shape}')

    predictor = TabularPredictor(problem_type='binary', 
                                 label=TARGET, 
                                 eval_metric='roc_auc', 
                                 verbosity=1)

    predictor.fit(train_data=auto_train_data,
                  presets='medium_quality',
                  time_limit=TIME_LIMIT,
                  # num_bag_folds=5, 
                  # num_bag_sets=1, 
                  # num_stack_levels=3,
                  feature_prune_kwargs={'force_prune': True}
    )

    # Get the prediction
    performance = predictor.predict_proba(auto_train_data.drop(TARGET, axis=1))[:, 1]

    # Calculate roc
    roc = roc_auc_score(auto_test_data[TARGET], performance)

    print(f'Autogluon Fold {fold + 1} - ROC: {roc}')
    print()

    autogluon_roc_scores.append(roc)

    # Delete the models because of memory
    delete_autogluon_file()

# Print the ROC AUC scores for each fold
print('Autogluon ROC Mean:', np.mean(autogluon_roc_scores))
print('Autogluon ROC STD:', np.std(autogluon_roc_scores))

# Autogluon Fold 1 - roc: 
# Autogluon Fold 2 - roc: 
# Autogluon Fold 3 - roc: 

No path specified. Models will be saved in: "AutogluonModels\ag-20240716_172437\"


Fold 1 - Train data shape: (517716, 11), Test data shape: (57524, 11)


Insufficient time to train even a single feature pruning model (remaining: 0, needed: 4.104083776473999). Skipping feature pruning.


InvalidIndexError: (slice(None, None, None), 1)