## Importing packages and functions

In [None]:
#installing the pycaret module and its sub-modules (only run in GPU mode)
!pip install pycaret[full]

In [None]:
# The magic four
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

# Train Test Split 
from sklearn.model_selection import train_test_split

#Scaler 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Metrics 
from sklearn import metrics
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             f1_score, log_loss, mean_squared_error, precision_score,
                             recall_score, roc_auc_score, roc_curve)

# StatsModels & SkLearn
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# garbage collection (clear up some RAM)
import gc

# imputer
from sklearn.impute import SimpleImputer

# Random Forest
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# XGBoost
from xgboost import XGBClassifier

# pycaret (only run in GPU mode)
from pycaret.classification import *

# cuML (only run in GPU mode)
import cudf
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.model_selection import GridSearchCV as cuGridSearchCV
from cuml.model_selection import RandomizedSearchCV as cuRandomizedSearchCV

%matplotlib inline

In [None]:
def aprF1auc(y_real, y_pred):
    '''Print accuracy, precision, recall, F1 and ROC AUC for a set of predictions.
    
    Input:
    y_real: Series or DataFrame column with the real labels
    y_pred: Series or DataFrame column with the predicted labels
    
    Output:
    nothing is returned; the five scores are printed, one per line
    '''
    # every score is computed before anything is printed, so a failing metric
    # leaves no partial output behind
    accuracy, precision, recall, f1, auc = (
        accuracy_score(y_real, y_pred),
        precision_score(y_real, y_pred),
        recall_score(y_real, y_pred),
        f1_score(y_real, y_pred),
        roc_auc_score(y_real, y_pred),
    )
    
    report_lines = (
        f"Accuracy:{accuracy}",
        f"Precision:{precision}",
        f"Recall:{recall}",
        f"F1:{f1}",
        f"AUC:{auc}",
    )
    for report_line in report_lines:
        print(report_line)

In [None]:
## confusion matrix

def produce_confusion(positive_label, negative_label, cut_off, df, y_pred_name, y_real_name):
    '''Plot a confusion matrix for binary predictions and report the metric scores.
    
    Input:
    positive_label: string, tick label used for the second row/column of the matrix
    negative_label: string, tick label used for the first row/column of the matrix
    cut_off: float threshold applied to df[y_pred_name] (values strictly above it
             become 1, everything else 0), or the string 'binary' when that column
             already holds the final predictions and must be used as-is
    df: DataFrame containing both the prediction and the real-label column;
        it is only read, never modified
    y_pred_name: string, name of the prediction column
    y_real_name: string, name of the real-label column
    
    Output:
    a confusion matrix drawn as a heatmap on a new figure, and the result of
    aprF1auc(real labels, binarised predictions) as the return value
    '''
    # Binarise into a local variable instead of writing a 'pred_binary' column
    # into the caller's DataFrame.
    if cut_off != 'binary':
        pred_binary = np.where(df[y_pred_name] > cut_off, 1, 0)
    else:
        pred_binary = df[y_pred_name]
    
    #Build the CM
    cm = confusion_matrix(df[y_real_name], pred_binary)
    
    # A fresh figure/axes pair so the heatmap never lands on axes left over
    # from an earlier plot.
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g')

    # labels, title, ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('Real labels')
    ax.set_title('Confusion Matrix')
    # assumes the matrix rows/columns are ordered negative (0) then positive (1)
    ax.xaxis.set_ticklabels([negative_label, positive_label])
    ax.yaxis.set_ticklabels([negative_label, positive_label])
    
    return aprF1auc(df[y_real_name], pred_binary)

In [None]:
#this is an aesthetic choice and just removes the many warnings that some functions and comands produce
#it helps significantly declutter the workbook
import warnings
warnings.filterwarnings('ignore')

# Importing data

In [None]:
#importing data and setting index column
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv', index_col='id')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv', index_col='id')

# Exploratory Data Analysis

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.shape

In [None]:
train.dtypes

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
test.shape

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
'''
#correlation heatmap
plt.figure(figsize = (30,30))
corrplot = sns.heatmap(train.corr(), square = True)

corrplot.figure.savefig('corrplot.png')
'''

# Data Cleaning

In [None]:
#defining the train and test datasets
X_train = train.copy()
y_train = X_train.pop('claim')
X_test = test.copy()

#saving the index of the test dataset for later use
idx = X_test.index

## Filling in null values by column mean

In [None]:
#saving a copy of column headings
train_cols = X_train.columns
test_cols = X_test.columns

In [None]:
#fills null values in each column with the column mean (the means are learned from train)
SI = SimpleImputer(strategy = 'mean')
#keep the original 'id' index: later cells assign Series from X_train and join y_train by index,
#which would otherwise only line up if the ids happen to be 0..n-1
X_train_fill = pd.DataFrame(SI.fit_transform(X_train), columns = train_cols, index = X_train.index)

In [None]:
#fill null values in test with the column means learned from train
#(transform only -- refitting on test would impute train and test with different means,
# unlike the scaler below which is also fitted on train only)
X_test_fill = pd.DataFrame(SI.transform(X_test), columns = test_cols, index = idx)

In [None]:
gc.collect()

In [None]:
#adding additional row-level features to both train and test
def add_row_stats(filled, raw):
    '''Add per-row summary features to `filled` in place.

    Input:
    filled: imputed DataFrame that receives the new columns
    raw: the matching un-imputed DataFrame; it supplies the per-row count of
         missing values and the list of original feature columns

    Output:
    None; `filled` gains the columns n_missing, std, avg, max and min
    '''
    # Take the feature list from the raw frame rather than from train_cols/test_cols:
    # those names are rebound to the extended column list further down, so using them
    # here would change the statistics if this cell were run a second time.
    feature_cols = list(raw.columns)
    original_values = filled[feature_cols]

    filled['n_missing'] = raw.isnull().sum(axis=1).astype(int)
    filled['std'] = original_values.std(axis=1)
    filled['avg'] = original_values.mean(axis=1)
    filled['max'] = original_values.max(axis=1)
    filled['min'] = original_values.min(axis=1)

add_row_stats(X_train_fill, X_train)
add_row_stats(X_test_fill, X_test)

In [None]:
#updated list of column headings
train_cols = X_train_fill.columns
test_cols = X_test_fill.columns

# Scaler

In [None]:
#scaling train (the scaler is fitted here and only applied to test afterwards)
scaler = RobustScaler()

#carry the index over so index-aligned assignments and joins with y_train keep matching rows
scaled_X_train = pd.DataFrame(scaler.fit_transform(X_train_fill),
                              columns = train_cols,
                              index = X_train_fill.index)

In [None]:
#scaling test
scaled_X_test = scaler.transform(X_test_fill)
scaled_X_test = pd.DataFrame(scaled_X_test, columns = test_cols)
scaled_X_test.set_index(idx, inplace = True)

In [None]:
#to reduce datset size in RAM
scaled_X_train = scaled_X_train.astype(np.float32)
scaled_X_test = scaled_X_test.astype(np.float32)

In [None]:
#adding a column to say if the row contains nulls

scaled_X_train['any_missing'] = X_train_fill['n_missing'] > 0
scaled_X_test['any_missing'] = X_test_fill['n_missing'] > 0

scaled_X_train['any_missing'] = scaled_X_train['any_missing'].astype(np.int8)
scaled_X_test['any_missing'] = scaled_X_test['any_missing'].astype(np.int8)

gc.collect()

In [None]:
#eliminate unnecessary objects to reduce RAM usage
#each name is removed on its own: a single `del a, b, c` stops at the first missing name
#and leaves everything after it alive, which a bare except then hides
for _name in ('test', 'train', 'scaler', 'SI', 'X_train', 'X_test', 'X_train_fill', 'X_test_fill'):
    if _name in globals():
        del globals()[_name]
    else:
        print(f'{_name} already dropped!')
gc.collect();

# Logistic regression

In [None]:
Logit_X_train = scaled_X_train.copy()
gc.collect()

In [None]:
#helper that builds and fits the logistic regression
def run_model(dfx, dfy):
    '''Fit a statsmodels Logit with dfy as the response and dfx as the design matrix.

    Returns the object produced by calling .fit() on the model.
    '''
    logit_model = sm.Logit(dfy, dfx)
    return logit_model.fit()

In [None]:
#feature engineering 

def feature_eng(df):
    '''Prepare a design matrix for the logistic regression.

    Works on a copy of df: a constant column is added via sm.add_constant, the
    manually chosen columns listed below are dropped, and the remaining columns
    are returned in sorted name order.
    '''
    #columns manually excluded from the logistic regression
    excluded = {'f12','f17','f18','f19','f22','f26','f29','f33','f37','f40','f41','f42','f43','f49',
                'f51','f55','f56','f58','f59','f63','f64','f66','f67','f72','f74','f75','f76',
                'f80','f84','f85','f88','f94','f101','f103','f115','avg','std'}
    
    with_const = sm.add_constant(df.copy())
    
    #set difference, then sort, so train and test end up with the same column order
    kept_cols = sorted(set(with_const.columns).difference(excluded))
    
    return with_const[kept_cols]

In [None]:
Logit_X_train = feature_eng(Logit_X_train)
Logit_X_test = feature_eng(scaled_X_test)

In [None]:
#running model
test_0 = run_model(Logit_X_train,y_train)
test_0.summary()

In [None]:
#saving regression output as an image
'''
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 20))
#plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}) old approach
plt.text(0.01, 0.05, str(test_0.summary()), {'fontsize': 10}, fontproperties = 'monospace') # approach improved by OP -> monospace!
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
'''

In [None]:
#probability values
#predict only on the columns the model was fitted on: this cell adds prediction columns to
#Logit_X_train, so predicting on the whole frame would fail if the cell is run a second time
Logit_X_train['train_pred'] = test_0.predict(Logit_X_train[test_0.params.index])

#binary predictions
Logit_X_train['train_pred_bin'] = np.where(Logit_X_train.train_pred > 0.5, 1, 0)

#metrics
aprF1auc(y_train, Logit_X_train.train_pred_bin)

In [None]:
#calculating Log Loss Score
#log loss is defined on predicted probabilities; feeding it the hard 0/1 labels only
#measures the clipping constant times the error rate
print("Log Loss Score: " + str(log_loss(y_train, Logit_X_train.train_pred)))

In [None]:
gc.collect()

In [None]:
#preparing DataFrame for the metrics function
lr_metric = Logit_X_train.copy()
lr_metric = lr_metric.join(y_train)

In [None]:
#running the metrics function
produce_confusion('Claim','No claim','binary',lr_metric,'train_pred_bin','claim')

# Predict

In [None]:
#probability values
test_pred = test_0.predict(Logit_X_test)

#binary predictions
test_pred_bin = np.where(test_pred > 0.5, 1, 0)
y_test = pd.Series(test_pred_bin)

In [None]:
gc.collect()

# Logistic Regression CV

In [None]:
LRCV_X_train = scaled_X_train.copy()
gc.collect()

In [None]:
LRCV_X_train.dtypes
LRCV_X_train.columns

In [None]:
#crating empty model
simple_regressor = LogisticRegressionCV(cv = 10, n_jobs=-1)

In [None]:
LRCV_cols = [i for i in LRCV_X_train.columns if 'f' in i or 'missing' in i]
LRCV_X_train = LRCV_X_train[LRCV_cols]

In [None]:
#fitting model to train data
simple_regressor.fit(LRCV_X_train,y_train)
gc.collect()

In [None]:
#probability values
#restrict to LRCV_cols so the cell still works after 'predicted_proba' has been added to the frame
LRCV_X_train['predicted_proba'] = simple_regressor.predict_proba(LRCV_X_train[LRCV_cols])[:,1]

#binary prediction
LRCV_predicted_values = simple_regressor.predict(LRCV_X_train[LRCV_cols])

In [None]:
#calculating Log Loss Score (on the predicted probabilities, not on the hard 0/1 labels)
print("Log Loss Score: " + str(log_loss(y_train, LRCV_X_train['predicted_proba'])))

In [None]:
gc.collect()

In [None]:
aprF1auc(y_train, LRCV_predicted_values)

In [None]:
# preparing for the beautiful metrics function
lrCV_metric = LRCV_X_train.copy()
lrCV_metric = lrCV_metric.join(y_train)

In [None]:
#running the beautiful metrics function
#'predicted_proba' holds probabilities, so it has to be thresholded (0.5) rather than
#passed through with 'binary', which would hand continuous values to the confusion matrix
produce_confusion('Claim','No claim',0.5,lrCV_metric,'predicted_proba','claim')

In [None]:
#the model was fitted on LRCV_cols only, so test has to be restricted to the same columns
#probability values
predicted_proba_test = simple_regressor.predict_proba(scaled_X_test[LRCV_cols])[:,1]

#binary prediction
predicted_values_test = simple_regressor.predict(scaled_X_test[LRCV_cols])

# Random Forest

## Random Forest Classifier

In [None]:
#create a copy of the dataset to avoid overwriting issues
try:
    RF_X_train = scaled_X_train.copy()
except NameError:
    #scaled_X_train is deleted in the PyCaret section further down; only that case is
    #tolerated here, any other failure (e.g. running out of memory) should surface
    print('already done!')
finally:
    gc.collect()

In [None]:
#creating the (unfitted) classifier; fixed seed so the forest and its scores are reproducible
rf = RandomForestClassifier(n_estimators=10, max_depth = 5, n_jobs = -1, random_state = 42)

In [None]:
#fitting the RF model on train
rf.fit(RF_X_train, y_train)

In [None]:
# RF classfier score
rf.score(RF_X_train, y_train)

In [None]:
#classfication scores
rf_score = cross_val_score(rf, RF_X_train, y_train, cv=5, n_jobs = -1).mean()
print(f'Random scored {rf_score}')

In [None]:
## Predict on Train
#drop a prediction column left over from a previous run so the model only sees its training features
RF_X_train['rf_pred'] = rf.predict(RF_X_train.drop(columns = 'rf_pred', errors = 'ignore'))

## Check Accuracy, Precision, Recall & F1
#aprF1auc expects (real, predicted); with the arguments swapped precision and recall trade places
aprF1auc(y_train, RF_X_train['rf_pred'])

In [None]:
#calculating Log Loss Score (needs class probabilities, not the hard 0/1 predictions)
rf_train_proba = rf.predict_proba(RF_X_train.drop(columns = 'rf_pred', errors = 'ignore'))[:, 1]
print("Log Loss Score: " + str(log_loss(y_train, rf_train_proba)))

In [None]:
# preparing for the beautiful metrics function
rf_metric = RF_X_train.copy()
rf_metric = rf_metric.join(y_train)

In [None]:
#running the beautiful metrics function
produce_confusion('Claim','No claim','binary',rf_metric,'rf_pred','claim')

In [None]:
gc.collect()

## Extra Trees Classifier

In [None]:
#create a copy to avoid overwriting issues
try:
    ET_X_train = scaled_X_train.copy()
except NameError:
    #scaled_X_train is deleted in the PyCaret section further down; only that case is
    #tolerated here, any other failure should surface
    print('already done!')
finally:
    gc.collect()

In [None]:
#fixed seed so the extra-trees model and its scores are reproducible
et = ExtraTreesClassifier(n_estimators=10, max_depth = 5, n_jobs = -1, random_state = 42)

In [None]:
#fitting the ET model on train
et.fit(ET_X_train, y_train)

In [None]:
# ET classfier score
et.score(ET_X_train, y_train)

In [None]:
et_score = cross_val_score(et, ET_X_train, y_train, cv=5, n_jobs = -1).mean()
print(f'Extra Random scored {et_score}')

In [None]:
## Predict on Train
#drop a prediction column left over from a previous run so the model only sees its training features
ET_X_train['et_pred'] = et.predict(ET_X_train.drop(columns = 'et_pred', errors = 'ignore'))

## Check Accuracy, Precision, Recall & F1
#aprF1auc expects (real, predicted); with the arguments swapped precision and recall trade places
aprF1auc(y_train, ET_X_train['et_pred'])

In [None]:
#calculating Log Loss Score (needs class probabilities, not the hard 0/1 predictions)
et_train_proba = et.predict_proba(ET_X_train.drop(columns = 'et_pred', errors = 'ignore'))[:, 1]
print("Log Loss Score: " + str(log_loss(y_train, et_train_proba)))

In [None]:
# preparing for the beautiful metrics function
et_metric = ET_X_train.copy()
et_metric = et_metric.join(y_train)

In [None]:
#running the beautiful metrics function
produce_confusion('Claim','No claim','binary',et_metric,'et_pred','claim')

In [None]:
gc.collect()

## Test

In [None]:
scaled_X_test

In [None]:
## Predict on Test

rf_test_pred = rf.predict(scaled_X_test)
rf_test_pred

In [None]:
## Predict on Test

et_test_pred = et.predict(scaled_X_test)
pd.Series(et_test_pred)

# GridSearch

In [None]:
#create a copy to avoid overwriting issues
GS_X_train = scaled_X_train.copy()
#del scaled_X_train
gc.collect()

In [None]:
## running GridSearch and RandomizedSearch using ExtraTrees (cause it's much quicker)
et_params = {
    'n_estimators': [5, 10, 20, 50],
    'max_depth': [2, 3, 4, 5, 6],
    'min_samples_split' : [10, 100, 1000, 10000, 100000],
    'min_samples_leaf' : [1, 10, 100, 1000, 10000, 100000]
}
#own name for the search's base estimator so the fitted `et` model from the
#Extra Trees section is not replaced by an unfitted one
et_base = ExtraTreesClassifier(random_state = 42)
#gs = GridSearchCV(et_base, param_grid=et_params, cv=5, verbose = 1, n_jobs = -1, scoring = 'roc_auc')
#random_state fixes which 100 parameter combinations are sampled
rs = RandomizedSearchCV(et_base, param_distributions=et_params, 
                        n_iter = 100, cv=5, refit = 'roc_auc',
                        verbose = 1, n_jobs = -1,
                        random_state = 42,
                        scoring = ['roc_auc','accuracy'])

In [None]:
#fitting and identification of best classifier

#GridSearch
#gs.fit(GS_X_train, y_train)
#print(gs.best_score_)
#gs.best_params_

#RandomizedSearch
rs.fit(GS_X_train, y_train)
print(rs.best_score_)
rs.best_params_

In [None]:
gc.collect()

In [None]:
#classfier score
rs.score(GS_X_train, y_train)

In [None]:
## Predict on Train
#drop a prediction column left over from a previous run so the model only sees its training features
GS_X_train['gs_pred'] = rs.predict(GS_X_train.drop(columns = 'gs_pred', errors = 'ignore'))

## Check Accuracy, Precision, Recall & F1
#aprF1auc expects (real, predicted); with the arguments swapped precision and recall trade places
aprF1auc(y_train, GS_X_train['gs_pred'])

In [None]:
#square root taken explicitly: the `squared = False` switch of mean_squared_error is
#deprecated in recent scikit-learn releases
print("Root Mean Squared Error: " + str(np.sqrt(mean_squared_error(y_train, GS_X_train['gs_pred']))))

In [None]:
# preparing for the beautiful metrics function
gs_metric = GS_X_train.copy()
gs_metric = gs_metric.join(y_train)

In [None]:
#running the beautiful metrics function
produce_confusion('Claim', 'No claim', 'binary', gs_metric, 'gs_pred', 'claim')

## cuML Random Forest (GPU acceleration)

In [None]:
#create a copy to avoid overwriting issues
cuRF_X_train = scaled_X_train.copy()
#del scaled_X_train
gc.collect()

In [None]:
## creating empty GridSearch using cuML Random Forest Classifier (GPU accelerated)
cu_RF_params = {
    'n_estimators': [5, 10, 20],
    'max_depth': [2, 3, 4, 5, 6],
    'min_samples_split' : [10, 100, 1000, 10000],
    'min_samples_leaf' : [1, 10, 100, 1000, 10000]
}
cu_RF = cuRF()
gs = cuGridSearchCV(cu_RF, param_grid=cu_RF_params,
                    scoring = ['roc_auc','accuracy'],
                    refit = 'roc_auc',cv=5, 
                    verbose = 1, 
                    n_jobs = -1)  

In [None]:
#fitting and identification of best classifier
gs.fit(cuRF_X_train, y_train)

print(gs.best_score_)

gs.best_params_

# SVM (needs GPU)

In [None]:
#give it a shot even though it needs GPU acceleration

In [None]:
from cuml.svm import SVC
SVM_X_train = scaled_X_train.copy()
SVM_X_train.drop(columns = 'any_missing', inplace = True)
gc.collect()

In [None]:
#named svc_model rather than `train`, which earlier in the notebook is the raw training DataFrame
svc_model = SVC(gamma='auto'
                , kernel='rbf'
                , C=10
                , probability = True)

svc_model.fit(SVM_X_train, y_train)

## GridSearch

In [None]:
# choosing list of parameter values
GS_tuned_parameters = [{'kernel': ['rbf'], 'gamma': ['auto'],'C': [0.1, 1, 10, 100]}]

In [None]:
#creating GridSearch
GS_train = cuGridSearchCV(SVC(), GS_tuned_parameters, scoring='roc_auc', verbose = 1)

In [None]:
gc.collect()

In [None]:
#fitting all combinations of parameters
GS_train.fit(scaled_X_train, y_train)

In [None]:
#best model parameters
GS_train.best_params_

In [None]:
#train scores
print(classification_report(y_train, GS_train.predict(scaled_X_train)))

# XGBoost (needs GPU)

In [None]:
XGB_X_train = scaled_X_train.copy()
gc.collect()

In [None]:
##this needs GPU acceleration

xgb = XGBClassifier(tree_method='gpu_hist', n_jobs = -1)
#xgb = XGBClassifier(n_jobs = -1)

In [None]:
gc.collect()
xgb.fit(XGB_X_train, y_train)

In [None]:
#classfier score
gc.collect()
xgb.score(XGB_X_train, y_train)

In [None]:
## Predict on Train
XGB_X_train['xgb_pred'] = xgb.predict(XGB_X_train)

In [None]:
#square root taken explicitly: the `squared = False` switch of mean_squared_error is
#deprecated in recent scikit-learn releases
print("Root Mean Squared Error: " + str(np.sqrt(mean_squared_error(y_train, XGB_X_train['xgb_pred']))))

In [None]:
# preparing for the beautiful metrics function
xgb_metric = XGB_X_train.copy()
xgb_metric = xgb_metric.join(y_train)

In [None]:
#running the beautiful metrics function
produce_confusion('Claim', 'No claim', 'binary', xgb_metric, 'xgb_pred', 'claim')

In [None]:
test_predict = xgb.predict(scaled_X_test)

# PyCaret Classification

In [None]:
# pycaret wants a single dataframe which includes the target column
try:
    # join already returns a new frame, so no extra copy of the features is needed
    clf_data = scaled_X_train.join(y_train)
    del scaled_X_train
except NameError:
    # scaled_X_train was already released by an earlier run of this cell;
    # the clf_data built back then is kept. Any other error is allowed to surface.
    pass
finally:
    gc.collect()

## Setup

In [None]:
#setting up the pipeline
#session_id seeds the train/hold-out split and the models so results are repeatable between runs
clf = setup(data = clf_data, 
            target = 'claim',
            data_split_stratify = True,
            use_gpu = True, 
            n_jobs = -1, 
            silent = True,
            session_id = 42
           )

In [None]:
gc.collect()
lr = create_model('lr')

In [None]:
predict_model(lr);

In [None]:
final_lr = finalize_model(lr)
predict_model(final_lr);

In [None]:
#this lists all models that can be run
models()

## Model comparison

In [None]:
#comparing all models
#excluded some models that did not benefit from GPU acceleration and took too long to run (ada, gbc)
#excluded SVM since it does not support AUC score
#excluded some models that were run independently before (dt, et)
compare_models(exclude = ['dt','ada','gbc','et','svm'], sort = 'AUC')

## CatBoost

### Model creation

In [None]:
#creating a model
catboost = create_model('catboost')
gc.collect()

### Hyperparameter tuning

In [None]:
gc.collect()

In [None]:
#hyperparameter tuning
gc.collect()
tuned_catboost = tune_model(catboost, choose_better = True, optimize = 'AUC')

### Model plots

In [None]:
#model plotting
plot_model(tuned_catboost, plot = 'auc')

In [None]:
plot_model(tuned_catboost, plot = 'pr')

In [None]:
plot_model(tuned_catboost, plot='feature')

In [None]:
plot_model(tuned_catboost, plot = 'confusion_matrix')

In [None]:
gc.collect()

### Prediction on internal validation set

In [None]:
#predict on validation set
predict_model(tuned_catboost);

### Finalise model

In [None]:
#finalise model
#fits the model onto the complete dataset including the test/hold-out sample
final_catboost = finalize_model(tuned_catboost)

In [None]:
#predict using entire dataset
predict_model(final_catboost);

### Predict on unseen data (X_test)

In [None]:
#prediction on unseen data
unseen_predictions_cat = predict_model(final_catboost, data=scaled_X_test, raw_score = True)
unseen_predictions_cat.head()

### Save model

In [None]:
save_model(final_catboost,'Final CatBoost Model 21Sep2021')

## LightGBM

### Model creation

In [None]:
#creating a model
lightgbm = create_model('lightgbm')
gc.collect()

### Hyperparameter tuning

In [None]:
#hyperparameter tuning
gc.collect()
tuned_lightgbm = tune_model(lightgbm, search_library = 'scikit-optimize', early_stopping = 'asha')

### Model plots

In [None]:
#model plotting
plot_model(tuned_lightgbm, plot = 'auc')

In [None]:
plot_model(tuned_lightgbm, plot = 'pr')

In [None]:
plot_model(tuned_lightgbm, plot='feature')

In [None]:
plot_model(tuned_lightgbm, plot = 'confusion_matrix')

In [None]:
gc.collect()

### Prediction on internal validation set

In [None]:
#predict on validation set
predict_model(tuned_lightgbm);

### Finalise model

In [None]:
#finalise model
#fits the model onto the complete dataset including the test/hold-out sample
final_lightgbm = finalize_model(tuned_lightgbm)

In [None]:
#predict using entire dataset
predict_model(final_lightgbm);

### Predict on unseen data (X_test)

In [None]:
#performance on unseen data
unseen_predictions_lgbm = predict_model(final_lightgbm, data=scaled_X_test)
unseen_predictions_lgbm.head()

### Save model

In [None]:
save_model(final_lightgbm,'Final LightGBM Model 17Sep2021')

## SVM

In [None]:
gc.collect()

In [None]:
svm = create_model('svm')
#predict_model(svm);

final_svm = finalize_model(svm)
predict_model(final_svm);

## Ensemble model

In [None]:
meow = create_model('catboost')

In [None]:
tuned_meow = tune_model(meow, choose_better = True, optimize = 'AUC')

In [None]:
predict_model(meow);

In [None]:
cats = ensemble_model(meow, choose_better = True, optimize = 'AUC')

In [None]:
predict_model(cats);

In [None]:
final_model = finalize_model(cats)
predict_model(final_model);

In [None]:
#performance on unseen data
unseen_predictions_ensemble = predict_model(final_model, data = scaled_X_test, raw_score = True)
unseen_predictions_ensemble.head()

## AutoML

In [None]:
top3 = compare_models(n_select = 3, exclude = ['dt','ada','gbc','et','svm'], sort = 'AUC')

In [None]:
tuned_top3 = [tune_model(i, choose_better = True, optimize = 'AUC') for i in top3]

In [None]:
#voting classifier
tuned_blend = blend_models(tuned_top3)

In [None]:
basic_blend = blend_models(top3)

In [None]:
best_auc_model = automl(optimize = 'AUC')

In [None]:
predict_model(basic_blend);

In [None]:
final_model = finalize_model(basic_blend)
predict_model(final_model);

In [None]:
#performance on unseen data
unseen_predictions_best = predict_model(final_model, data = scaled_X_test, raw_score = True)
unseen_predictions_best.head()

In [None]:
top3

In [None]:
#model plotting
plot_model(final_model, plot = 'auc')

In [None]:
plot_model(final_model, plot = 'pr')

In [None]:
plot_model(final_model, plot='feature')

In [None]:
plot_model(final_model, plot = 'confusion_matrix')

In [None]:
save_model(final_model, '3top23Sept2021')

In [None]:
predict_model(basic_blend);

In [None]:
basic_blend_final = finalize_model(basic_blend)
predict_model(basic_blend_final);

In [None]:
unseen_predictions_best = predict_model(basic_blend_final, data = scaled_X_test, raw_score = True)
unseen_predictions_best.head()

In [None]:
plot_model(basic_blend_final, plot = 'auc')

In [None]:
plot_model(basic_blend_final, plot = 'confusion_matrix')

In [None]:
plot_model(basic_blend_final, plot = 'feature')

In [None]:
plot_model(basic_blend_final, plot = 'pr')

# Submission

In [None]:
assert len(idx) == len(unseen_predictions_best), "number of predictions does not match the number of test ids"

#build the frame column-wise instead of zipping every row into a Python tuple;
#to_numpy() pairs the scores with the ids by position, exactly as zip did
sub = pd.DataFrame({'id': idx, 'claim': unseen_predictions_best['Score_1'].to_numpy()})

sub.to_csv('submission.csv', index = False)

sub.head()