In [68]:
import warnings
warnings.filterwarnings('ignore')

import ast
from autogluon.tabular import TabularPredictor
from category_encoders import TargetEncoder
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMClassifier
import numpy as np

import os
import pandas as pd
from pprint import pprint

import random

from sklearn.feature_selection import mutual_info_classif, RFECV
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline

from scipy import stats

import time
from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)

experiment_name = 'baseline'

In [2]:
train = pd.read_csv(r'data\train.csv')
test = pd.read_csv(r'data\test.csv')

train.shape, test.shape

((11504798, 12), (7669866, 11))

In [3]:
train.sample(3)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
4428844,4428844,Male,38,1,17.0,0,1-2 Year,Yes,24573.0,124.0,117,1
7344254,7344254,Female,38,1,37.0,0,1-2 Year,Yes,2630.0,157.0,260,0
4271212,4271212,Male,55,1,28.0,1,1-2 Year,No,39711.0,26.0,78,0


In [4]:
# Define the different feature types and target
cat_cols = ['Vehicle_Age']
num_cols = ['Age', 'Annual_Premium', 'Vintage']
ord_cols = []
bin_cols = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Region_Code', 'Policy_Sales_Channel']
needs_dummies = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
drop_cols = ['id']
TARGET = 'Response'

In [5]:
ohe = pd.get_dummies(train, columns=needs_dummies, drop_first=True, dtype='int')
ohe_test = pd.get_dummies(test, columns=needs_dummies, drop_first=True, dtype='int')
ohe.head()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Gender_Male,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes
0,0,21,1,35.0,0,65101.0,124.0,187,0,1,0,0,1
1,1,43,1,28.0,0,58911.0,26.0,288,1,1,0,1,1
2,2,25,1,14.0,1,38043.0,152.0,254,0,0,1,0,0
3,3,35,1,1.0,0,2630.0,156.0,76,0,0,0,0,1
4,4,36,1,15.0,1,31951.0,152.0,294,0,0,0,0,0


In [6]:
# Drop the features not required
ohe_drop = ohe.drop(drop_cols, axis=1)
ohe_drop_test = ohe_test.drop(drop_cols, axis=1)

In [7]:
ohe_drop.shape, ohe_drop_test.shape

((11504798, 12), (7669866, 11))

In [8]:
ohe_drop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Age                    int64  
 1   Driving_License        int64  
 2   Region_Code            float64
 3   Previously_Insured     int64  
 4   Annual_Premium         float64
 5   Policy_Sales_Channel   float64
 6   Vintage                int64  
 7   Response               int64  
 8   Gender_Male            int32  
 9   Vehicle_Age_< 1 Year   int32  
 10  Vehicle_Age_> 2 Years  int32  
 11  Vehicle_Damage_Yes     int32  
dtypes: float64(3), int32(4), int64(5)
memory usage: 877.7 MB


In [9]:
# Initialize ShuffleSplit
# Get 5% of the data as the training data and rest as test
sss = StratifiedShuffleSplit(test_size=0.05, random_state=5)

# Get indices for the split
# Stratification is done on target variable
for train_index, test_index in sss.split(ohe_drop, ohe_drop[TARGET]):
    train_data = ohe_drop.iloc[test_index]
    test_data = ohe_drop.iloc[train_index]

train_data.shape, test_data.shape

# 1 minute

((575240, 12), (10929558, 12))

In [10]:
# Assign X and y
X = train_data.drop(TARGET, axis=1)
y = train_data[TARGET]

val_X = test_data.drop(TARGET, axis=1)
val_y = test_data[TARGET]

test_X = ohe_drop_test.copy()

X.shape, y.shape, val_X.shape, val_y.shape, test_X.shape

((575240, 11), (575240,), (10929558, 11), (10929558,), (7669866, 11))

In [11]:
n_splits = 10

sk10 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [12]:
# Define classification models
classif_models = [
    LGBMClassifier(n_jobs=-1, random_state=5, objective='binary', metric='auc'),
]

In [13]:
def classif_evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    Model_compare = pd.DataFrame(columns=['Model Name', 
                                        'Model Parameters', 
                                        'Model Train ROC AUC', 
                                        'Model Test ROC AUC', 
                                        'Model Test ROC AUC Std', 
                                        'Model Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            model_name = alg.name
        else:
            model_name = alg.__class__.__name__
        features = important_features.get(model_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {model_name} due to no important features.')
            return {
                'Model Name': model_name,
                'Model Parameters': str(alg.get_params()),
                'Model Train ROC AUC': 0,
                'Model Test ROC AUC': 0,
                'Model Test ROC AUC Std': 0,
                'Model Time': "0 min 0.00 sec",
            }
        
        cv_results = cross_validate(alg, 
                                    X[features], 
                                    y, cv=cv_split, 
                                    scoring='roc_auc', 
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'Model Name': model_name,
            'Model Parameters': str(alg.get_params()),
            'Model Train ROC AUC': cv_results['train_score'].mean(),
            'Model Test ROC AUC': cv_results['test_score'].mean(),
            'Model Test ROC AUC Std': cv_results['test_score'].std(),
            'Model Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {model_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(tqdm(models, desc='Models'))]
        for future in tqdm(futures, total=len(futures), desc='Progress'):
            result = future.result()
            results_list.append(result)

    model_compare = pd.DataFrame(results_list)

    model_compare.sort_values(by=['Model Test ROC AUC'], ascending=False, inplace=True)
    model_compare.to_csv(f'results\{experiment_name}.csv', index=False)

    return model_compare

In [14]:
baseline_features_classif = {}

for model in classif_models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_features_classif[model_name] = list(X.columns)

In [15]:
%%time

baseline_models_classif = classif_evaluate_models(classif_models, X, y, baseline_features_classif, sk10, f'{experiment_name}')
baseline_models_classif

# 1 minute

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with LGBMClassifier.
CPU times: total: 281 ms
Wall time: 40.1 s


Unnamed: 0,Model Name,Model Parameters,Model Train ROC AUC,Model Test ROC AUC,Model Test ROC AUC Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.875302,0.872437,0.001456,0 min 13.13 sec


### Mutual Information

In [42]:
# To ensure the same randomness everytime
np.random.seed(5)

X_mi = X.copy()

# Add random features
X_mi['random_feature_continous'] = np.round(np.random.uniform(-2, 2, X.shape[0]), 6)
X_mi['random_feature_categorical'] = np.random.randint(1, 8, X.shape[0])
X_mi.shape

(575240, 13)

In [43]:
# Initialize parameters
random_states = [5, 42, 100, 500]
n_neighbors_list = [3, 5, 7, 10, 20]
results = defaultdict(list)

In [48]:
X_and_y = pd.concat([X_mi, y], axis=1)
X_and_y.shape

(575240, 14)

In [49]:
# Calculate MI for each combination of random_state and n_neighbors
for random_state in random_states:
    for n_neighbors in n_neighbors_list:        
        # Calculate MI
        mi = mutual_info_classif(X_and_y, y, n_neighbors=n_neighbors, random_state=random_state)
        
        # Store results if the target has the highest MI score
        mi_dict = dict(zip(X_and_y.columns, mi))
        if mi_dict[TARGET] == max(mi_dict.values()):
            for feature, score in mi_dict.items():
                results[feature].append(score)

        print(f'Done with Random State - {random_state} and N Neighbors - {n_neighbors}')

Done with Random State - 5 and N Neighbors - 3
Done with Random State - 5 and N Neighbors - 5
Done with Random State - 5 and N Neighbors - 7
Done with Random State - 5 and N Neighbors - 10
Done with Random State - 5 and N Neighbors - 20
Done with Random State - 42 and N Neighbors - 3
Done with Random State - 42 and N Neighbors - 5
Done with Random State - 42 and N Neighbors - 7
Done with Random State - 42 and N Neighbors - 10
Done with Random State - 42 and N Neighbors - 20
Done with Random State - 100 and N Neighbors - 3
Done with Random State - 100 and N Neighbors - 5
Done with Random State - 100 and N Neighbors - 7
Done with Random State - 100 and N Neighbors - 10
Done with Random State - 100 and N Neighbors - 20
Done with Random State - 500 and N Neighbors - 3
Done with Random State - 500 and N Neighbors - 5
Done with Random State - 500 and N Neighbors - 7
Done with Random State - 500 and N Neighbors - 10
Done with Random State - 500 and N Neighbors - 20


In [50]:
# Average MI scores across valid combinations
average_mi = {feature: np.mean(scores) for feature, scores in results.items() if scores}
average_mi

{'Age': 0.034032485249745406,
 'Driving_License': 0.04775086363187272,
 'Region_Code': 0.015439309607458545,
 'Previously_Insured': 0.09483027811047715,
 'Annual_Premium': 0.026824669302801373,
 'Policy_Sales_Channel': 0.05523408957309643,
 'Vintage': 0.013713151325547462,
 'Response': 0.3737521932232421,
 'Gender_Male': 0.015030648357397609,
 'Vehicle_Age_< 1 Year': 0.03723742899271379,
 'Vehicle_Age_> 2 Years': 0.004823074203718708,
 'Vehicle_Damage_Yes': 0.09153486381832095,
 'random_feature_continous': 1.1862574026544337e-05,
 'random_feature_categorical': 0.005912755934634073}

In [51]:
# Display results
sorted_mi = sorted(average_mi.items(), key=lambda x: x[1], reverse=True)
print("Average MI scores:", sorted_mi)

Average MI scores: [('Response', 0.3737521932232421), ('Previously_Insured', 0.09483027811047715), ('Vehicle_Damage_Yes', 0.09153486381832095), ('Policy_Sales_Channel', 0.05523408957309643), ('Driving_License', 0.04775086363187272), ('Vehicle_Age_< 1 Year', 0.03723742899271379), ('Age', 0.034032485249745406), ('Annual_Premium', 0.026824669302801373), ('Region_Code', 0.015439309607458545), ('Gender_Male', 0.015030648357397609), ('Vintage', 0.013713151325547462), ('random_feature_categorical', 0.005912755934634073), ('Vehicle_Age_> 2 Years', 0.004823074203718708), ('random_feature_continous', 1.1862574026544337e-05)]


In [52]:
with open('mutual_info_scores_classif.txt', mode='w') as f:
    pprint(sorted_mi, stream=f)

In [53]:
# Determine higher MI between 0 and random_feature
higher_threshold = max(0, average_mi.get('random_feature_categorical', 0), average_mi.get('random_feature_continous', 0))
higher_threshold

0.005912755934634073

In [54]:
# List features with MI higher than the threshold, excluding the target
mi_features_list = [feature for feature, score in sorted_mi if feature != TARGET and score > higher_threshold]
mi_features_list

['Previously_Insured',
 'Vehicle_Damage_Yes',
 'Policy_Sales_Channel',
 'Driving_License',
 'Vehicle_Age_< 1 Year',
 'Age',
 'Annual_Premium',
 'Region_Code',
 'Gender_Male',
 'Vintage']

In [55]:
mi_features_reg = {}

for model in classif_models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    mi_features_reg[model_name] = mi_features_list

In [56]:
%%time

mi_models_classif = classif_evaluate_models(classif_models, X, y, mi_features_reg, sk10, f'{experiment_name}')
mi_models_classif

# 1 minute

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with LGBMClassifier.
CPU times: total: 156 ms
Wall time: 34.2 s


Unnamed: 0,Model Name,Model Parameters,Model Train ROC AUC,Model Test ROC AUC,Model Test ROC AUC Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.873752,0.870728,0.001579,0 min 10.29 sec


### Permutation Importance

In [65]:
# To ensure the same randomness everytime
np.random.seed(5)

X_pi = X.copy()

# Add random features
X_pi['random_feature_continous'] = np.round(np.random.uniform(-2, 2, X.shape[0]), 6)
X_pi['random_feature_categorical'] = np.random.randint(1, 8, X.shape[0])
X_pi.shape

(575240, 13)

In [66]:
%%time

perm_importances = {model.name if hasattr(model, 'name') else model.__class__.__name__: [] for model in classif_models}

for i, (train_idx, test_idx) in enumerate(sk10.split(X_pi, y)):
    X_train, X_test = X_pi.iloc[train_idx], X_pi.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    for model in classif_models:
        if hasattr(model, 'name'):
            model_name = model.name
        else:
            model_name = model.__class__.__name__

        model.fit(X_train, y_train)
        # Calculate permutation importance
        result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=5, scoring='roc_auc')
        perm_importances[model_name].append(result.importances_mean)
        print(f'Done with {model_name}.')
    
    print(f'Done with Fold {i+1}', end='\n\n')

Done with LGBMClassifier.
Done with Fold 1

Done with LGBMClassifier.
Done with Fold 2

Done with LGBMClassifier.
Done with Fold 3

Done with LGBMClassifier.
Done with Fold 4

Done with LGBMClassifier.
Done with Fold 5

Done with LGBMClassifier.
Done with Fold 6

Done with LGBMClassifier.
Done with Fold 7

Done with LGBMClassifier.
Done with Fold 8

Done with LGBMClassifier.
Done with Fold 9

Done with LGBMClassifier.
Done with Fold 10

CPU times: total: 6min 29s
Wall time: 3min 25s


In [67]:
%%time

# Average importances across folds and export to CSV
for model_name, importances in perm_importances.items():
    avg_importance = np.mean(importances, axis=0)
    importance_df = pd.DataFrame({'Feature': X_pi.columns, 'Importance': avg_importance})
    importance_df.sort_values(by='Importance', ascending=False, inplace=True)
    
    # Export to CSV
    importance_df.to_csv(f'.\permutation_importances\{model_name}_permutation_importance.csv', index=False)

print('Done with Permuation Importances', end='\n\n')

Done with Permuation Importances

CPU times: total: 0 ns
Wall time: 16 ms


In [69]:
directory = 'permutation_importances'

# Initialize a dictionary for the features
perm_important_features = {}

for model in classif_models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__
    print(f'Model: {model_name}')

    csv_path = os.path.join(directory, f'{model_name}_permutation_importance.csv')
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)

        # Initialize importance variables
        random_feature_importance_cont = 0
        random_feature_importance_cat = 0
        
        # Check for 'random_feature_continous' and its importance
        if 'random_feature_continous' in df['Feature'].values:
            random_feature_importance_cont = df.loc[df['Feature'] == 'random_feature_continous', 'Importance'].iloc[0]
            print(random_feature_importance_cont)
        if 'random_feature_categorical' in df['Feature'].values:
            random_feature_importance_cat = df.loc[df['Feature'] == 'random_feature_categorical', 'Importance'].iloc[0]
            print(random_feature_importance_cat)
        else:
            random_feature_importance = 0

        # Determine the threshold
        threshold = max(0, random_feature_importance_cont, random_feature_importance_cat)
        print(f'Threshold: {threshold}')

        # Filter features where importance is greater than 0
        important_feats_filtered = df[df['Importance'] > threshold]['Feature'].tolist()

        # # Reorder important_feats based on the predefined features_list
        # important_feats_ordered = [feat for feat in features_list if feat in important_feats_filtered]

        # Add to importance dictionary
        perm_important_features[model_name] = important_feats_filtered

    else:
        print(f'CSV file for {model_name} not found.')

print('Done getting important features dictionary')

Model: LGBMClassifier
-3.4111162701040424e-05
-2.8302888270164144e-05
Threshold: 0
Done getting important features dictionary


In [70]:
with open('perm_important_features.txt', mode='w') as f:
    pprint(perm_important_features, stream=f)

In [71]:
%%time

pi_models_classif = classif_evaluate_models(classif_models, X, y, perm_important_features, sk10, f'{experiment_name}_pi')
pi_models_classif

# 1 minute

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with LGBMClassifier.
CPU times: total: 234 ms
Wall time: 32 s


Unnamed: 0,Model Name,Model Parameters,Model Train ROC AUC,Model Test ROC AUC,Model Test ROC AUC Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.875291,0.872451,0.001403,0 min 9.99 sec


### RFECV

In [75]:
%%time

# Initialize empty dictionary for RFECV features
rfecv_features = {}

for model in classif_models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__
		
    features = baseline_features_classif[model_name]

    # incase there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue
	
    X_rfecv = X[features]

    try:
        print(f'Starting with {model_name}')
        # Create the RFECV object and rank each feature
        selector = RFECV(model, cv=sk10, step=1, scoring='roc_auc', verbose=2)
        selector = selector.fit(X_rfecv, y)

        selected_features = list(X_rfecv.columns[selector.support_])

        # # Reorder selected_features based on the predefined features_list
        # selected_features_ordered = [feat for feat in features_list if feat in selected_features]

        rfecv_features[model_name] = selected_features

        print(f'Done with {model_name}', end='\n\n')
    
    except ValueError:
        # In case of an error, keep the original order but filtered by features_list
        features_filtered = [feat for feat in features_list if feat in features]
        rfecv_features[model_name] = features_filtered
        print(f'{model_name} does not have coef_ or feature_importances_', end='\n\n')

# 5 minutes

Starting with LGBMClassifier
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator wi

In [76]:
with open('rfecv_features.txt', mode='w') as f:
    pprint(rfecv_features, stream=f)

In [77]:
%%time

rfecv_models_classif = classif_evaluate_models(classif_models, X, y, rfecv_features, sk10, f'{experiment_name}_rfecv')
rfecv_models_classif

# 1 minute

Models:   0%|          | 0/1 [00:00<?, ?it/s]

Progress:   0%|          | 0/1 [00:00<?, ?it/s]

Done with LGBMClassifier.
CPU times: total: 266 ms
Wall time: 35.8 s


Unnamed: 0,Model Name,Model Parameters,Model Train ROC AUC,Model Test ROC AUC,Model Test ROC AUC Std,Model Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.875302,0.872437,0.001456,0 min 11.10 sec


### LGBM Validation

*Proven to be close to the PL on Kaggle*

In [78]:
model = LGBMClassifier(n_jobs=-1, random_state=5, objective='binary', metric='auc')

model.fit(X[rfecv_features['LGBMClassifier']], y)

In [79]:
val_pred = model.predict_proba(val_X[rfecv_features['LGBMClassifier']])[:, 1]
val_score = roc_auc_score(val_y, val_pred)
val_score

0.8721405023726595

### LGBM Submission Prediction

In [59]:
model = LGBMClassifier(n_jobs=-1, random_state=5, objective='binary', metric='auc')

model.fit(ohe_drop.drop(TARGET, axis=1)[perm_important_features['LGBMClassifier']], ohe_drop[TARGET])

In [60]:
pred = model.predict_proba(test_X[perm_important_features['LGBMClassifier']])[:, 1]
pred_df = pd.DataFrame(pred, columns=[TARGET])

In [61]:
pred_df.head()

Unnamed: 0,Response
0,0.014201
1,0.364148
2,0.261249
3,0.000212
4,0.03533


In [62]:
submission = pd.concat([test['id'], pred_df[TARGET]], axis=1)

In [63]:
submission.tail()

Unnamed: 0,id,Response
7669861,19174659,0.203239
7669862,19174660,0.000288
7669863,19174661,0.000448
7669864,19174662,0.545714
7669865,19174663,0.000227


In [64]:
# Check winning_route.txt for what the result steps are
submission.to_csv(r'submissions/experiment_2_lgbm.csv', index=False)

### Autogluon

In [31]:
# Define the different feature types and target
cat_cols = ['Vehicle_Age']
num_cols = ['Age', 'Annual_Premium', 'Vintage']
ord_cols = []
bin_cols = ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage', 'Region_Code', 'Policy_Sales_Channel']
needs_dummies = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
drop_cols = ['id']
TARGET = 'Response'

In [32]:
# Initialize ShuffleSplit
# Get 5% of the data as the training data and rest as test
sss = StratifiedShuffleSplit(test_size=0.05, random_state=5)

# Get indices for the split
# Stratification is done on target variable
for train_index, test_index in sss.split(train, train[TARGET]):
    autogluon_train_data = train.iloc[test_index]
    # test_data = ohe_drop.iloc[train_index]

train_data.shape
# 1 minute

(575240, 12)

In [37]:
# Drop Unneeded column
autogluon_train_data = autogluon_train_data.drop(drop_cols, axis=1)
autogluon_train_data.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
2698469,Male,53,1,28.0,1,1-2 Year,No,43134.0,26.0,107,0
7618126,Male,52,1,28.0,0,1-2 Year,Yes,44346.0,26.0,258,0
10678,Male,22,1,28.0,0,< 1 Year,Yes,27173.0,152.0,271,0
10177923,Female,42,1,28.0,0,1-2 Year,Yes,52740.0,26.0,98,0
5137129,Male,27,1,6.0,1,< 1 Year,No,26127.0,152.0,254,0


In [38]:
# Force convert to categorical
for col in cat_cols + bin_cols:
    autogluon_train_data[col] = autogluon_train_data[col].astype('category')

In [39]:
autogluon_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 575240 entries, 2698469 to 4633953
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   Gender                575240 non-null  category
 1   Age                   575240 non-null  int64   
 2   Driving_License       575240 non-null  category
 3   Region_Code           575240 non-null  category
 4   Previously_Insured    575240 non-null  category
 5   Vehicle_Age           575240 non-null  category
 6   Vehicle_Damage        575240 non-null  category
 7   Annual_Premium        575240 non-null  float64 
 8   Policy_Sales_Channel  575240 non-null  category
 9   Vintage               575240 non-null  int64   
 10  Response              575240 non-null  int64   
dtypes: category(7), float64(1), int64(3)
memory usage: 26.3 MB


In [40]:
TIME_LIMIT = 60

def delete_autogluon_file():
    directory = 'AutogluonModels'
    filelist = [f for f in os.listdir(directory)]
    for file in filelist:
        file_path = os.path.join(directory, file)
        # Check if the file is a regular file
        if os.path.isfile(file_path):
            # Delete the file
            os.remove(file_path)
        # Check if the file is a directory
        elif os.path.isdir(file_path):
            # Delete the directory and its contents recursively
            shutil.rmtree(file_path)

In [25]:
missing_columns = set(X.columns) - set(test_X.columns)
missing_columns

set()

In [27]:
ohe_drop.shape, ohe_drop_test.shape, train_data.shape, test_data.shape

((11504798, 12), (7669866, 11), (575240, 12), (10929558, 12))

In [41]:
autogluon_roc_scores = []

for fold, (train_index, test_index) in enumerate(sk10.split(X, y)):
    # Split the dataset into train and test sets
    auto_train_data = autogluon_train_data.iloc[train_index]
    auto_test_data = autogluon_train_data.iloc[test_index]

    # Print the shapes of train and test data for debugging
    print(f'Fold {fold + 1} - Train data shape: {auto_train_data.shape}, Test data shape: {auto_test_data.shape}')

    predictor = TabularPredictor(problem_type='binary', 
                                 label=TARGET, 
                                 eval_metric='roc_auc', 
                                 verbosity=1)

    predictor.fit(train_data=auto_train_data,
                  presets='medium_quality',
                  time_limit=TIME_LIMIT,
                  # num_bag_folds=5, 
                  # num_bag_sets=1, 
                  # num_stack_levels=3,
                  feature_prune_kwargs={'force_prune': True}
    )

    # Get the prediction
    performance = predictor.predict_proba(auto_train_data.drop(TARGET, axis=1))[:, 1]

    # Calculate roc
    roc = roc_auc_score(auto_test_data[TARGET], performance)

    print(f'Autogluon Fold {fold + 1} - ROC: {roc}')
    print()

    autogluon_roc_scores.append(roc)

    # Delete the models because of memory
    delete_autogluon_file()

# Print the ROC AUC scores for each fold
print('Autogluon ROC Mean:', np.mean(autogluon_roc_scores))
print('Autogluon ROC STD:', np.std(autogluon_roc_scores))

# Autogluon Fold 1 - roc: 
# Autogluon Fold 2 - roc: 
# Autogluon Fold 3 - roc: 

No path specified. Models will be saved in: "AutogluonModels\ag-20240716_172437\"


Fold 1 - Train data shape: (517716, 11), Test data shape: (57524, 11)


Insufficient time to train even a single feature pruning model (remaining: 0, needed: 4.104083776473999). Skipping feature pruning.


InvalidIndexError: (slice(None, None, None), 1)