In [150]:
import numpy as np
import pandas as pd

import statsmodels.api as sm
import sklearn 
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold,cross_validate, cross_val_predict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay, confusion_matrix, classification_report, \
plot_confusion_matrix, roc_curve, auc, accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from matplotlib import pyplot as plt
import seaborn as sns
from collections import Counter
import datetime as dt

In [2]:
# Load both cleaned variants of the training data from the local Data folder.
df = pd.read_csv(filepath_or_buffer='./Data/TrainCleaned1.csv')
df2 = pd.read_csv(filepath_or_buffer='./Data/TrainCleaned2.csv')

In [3]:
# Split each dataset variant into the target ('status_group') and the predictors.
y = df['status_group']
x = df.drop(columns=['status_group'])

y2 = df2['status_group']
x2 = df2.drop(columns=['status_group'])

In [4]:
class ModelsList():
    """Registry that evaluates estimators on one fixed train/test split.

    Every call to ``update`` appends one row to ``self.df`` (model name, train
    score, test score, cross-validated log loss, params) and stores the matching
    held-out classification report and cross-validated score (mean, std).
    ``self.df``, ``self.classification_reports`` and ``self.cv`` are kept
    aligned: entry ``i`` of each describes the same model.
    """

    def __init__(self, x, y):
        """Split ``x``/``y`` once (random_state=42) and create empty result stores."""
        self.xtrain, self.xtest, self.ytrain, self.ytest = train_test_split(x, y, random_state=42)
        # Number of models currently recorded.
        self.length = 0
        # Text classification reports on the held-out split, one per model.
        self.classification_reports = []
        # (mean, std) of the cross-validated score on the training split, one per model.
        self.cv = []
        self.df = pd.DataFrame({'Model': pd.Series(dtype='str'),
                                'train_score': pd.Series(dtype='float64'),
                                'test_score': pd.Series(dtype='float64'),
                                'log_loss': pd.Series(dtype='float64'),
                                'params': pd.Series(dtype='O')})

    def update(self, estimator, name, fit=True, params='None'):
        """Evaluate ``estimator`` and record its results.

        Parameters
        ----------
        estimator : estimator exposing ``fit``, ``predict`` and ``score``
        name : str
            Label stored in the ``Model`` column.
        fit : bool, default True
            Fit the estimator on the training split first; pass False for an
            estimator that is already fitted.
        params : object, default 'None'
            Stored as-is in the ``params`` column (e.g. a best-params dict).

        Returns
        -------
        pandas.DataFrame
            The summary table, including the new row.
        """
        if fit:
            estimator.fit(self.xtrain, self.ytrain)

        # Compute everything before touching the stores, so a failure in any
        # step cannot leave the table, the report list and the counter out of step.
        ypred = estimator.predict(self.xtest)
        log_loss = -cross_val_score(estimator, self.xtrain, self.ytrain,
                                    scoring='neg_log_loss', n_jobs=-1).mean()
        report = classification_report(self.ytest, ypred)
        trainscore = estimator.score(self.xtrain, self.ytrain)
        testscore = estimator.score(self.xtest, self.ytest)
        cv = cross_val_score(estimator, self.xtrain, self.ytrain)

        self.df.loc[len(self.df.index)] = [name, trainscore, testscore, log_loss, params]
        self.classification_reports.append(report)
        self.cv.append((cv.mean(), cv.std()))
        self.length += 1
        return self.df

    def class_report(self):
        """Print the stored held-out classification report of every recorded model."""
        for model_name, report in zip(self.df['Model'], self.classification_reports):
            print(f"Classification Report for '{model_name}':\n" + " " * 16 + "\n" + " " * 12)
            print(report)

    def cv_summary(self):
        """Print mean ± std of the cross-validated score of every recorded model."""
        for model_name, (mean, std) in zip(self.df['Model'], self.cv):
            # The header used to read "Classification Report", a copy-paste mislabel.
            print(
                f"Cross-validation summary for '{model_name}':\n"
                + " " * 16 + f"{mean:.5f} ± {std:.5f} accuracy\n"
                + " " * 12
            )

    def delete_last(self):
        """Remove the most recently recorded model; do nothing when empty."""
        if self.length <= 0:
            return
        self.length -= 1
        self.df.drop(self.df.tail(1).index, inplace=True)
        # Keep the parallel lists aligned with the table rows.
        if self.classification_reports:
            self.classification_reports.pop()
        if self.cv:
            self.cv.pop()

In [5]:
# One results registry per dataset variant; each holds its own train/test split.
model1 = ModelsList(x=x, y=y)
model2 = ModelsList(x=x2, y=y2)

In [6]:
# Numeric branch: impute missing values (adding missing-indicator columns), then standardise.
subpipenum = Pipeline(steps=[
    ('num_impute', SimpleImputer(add_indicator=True)),
    ('ss', StandardScaler()),
])

# Categorical branch: impute with the most frequent value (adding missing-indicator
# columns), then one-hot encode; categories unseen during fit are ignored at transform time.
subpipecat = Pipeline(steps=[
    ('cat_impute', SimpleImputer(add_indicator=True, strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=True)),
])

In [7]:
# Route columns by dtype: numeric -> subpipenum, object -> subpipecat;
# any other column is passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('subpipe_num', subpipenum, selector(dtype_include=np.number)),
        ('subpipe_cat', subpipecat, selector(dtype_include=object)),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [8]:
# Baseline: same preprocessing, followed by a DummyClassifier.
dummy = Pipeline(steps=[
    ('ct', ct),
    ('dummy', DummyClassifier()),
])

In [9]:
%%time
# Baseline on dataset 1 (recorded wall time: about 30 s)
model1.update(estimator=dummy, name='Dummy')



Wall time: 29.3 s




Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.447071,0.448283,19.1634,


In [10]:
%%time
# Baseline on dataset 2 (recorded wall time: about 20 s)
model2.update(estimator=dummy, name='Dummy')



Wall time: 19 s




Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.448238,0.452795,19.103703,


In [11]:
# Logistic-regression pipeline for dataset 1.
# clone(ct) gives this pipeline its own unfitted copy of the preprocessing, so
# fitting another pipeline built from `ct` cannot silently refit this one's transformer.
lgr1 = Pipeline([
    ('ct', clone(ct)),
    ('lg', LogisticRegression(random_state=42, n_jobs=-1))
])

In [12]:
# Logistic-regression pipeline for dataset 2.
# clone(ct) gives this pipeline its own unfitted copy of the preprocessing, so
# fitting another pipeline built from `ct` cannot silently refit this one's transformer.
lgr2 = Pipeline([
    ('ct', clone(ct)),
    ('lg', LogisticRegression(random_state=42, n_jobs=-1))
])

In [13]:
%%time
# Default logistic regression on dataset 1 (recorded wall time: 44.1 s)
model1.update(estimator=lgr1, name='LogReg')

Wall time: 44.1 s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.447071,0.448283,19.1634,
1,LogReg,0.857508,0.77697,0.569989,


In [14]:
%%time
# Default logistic regression on dataset 2 (recorded wall time: 34.1 s)
model2.update(estimator=lgr2, name='LogReg')

Wall time: 34.1 s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.448238,0.452795,19.103703,
1,LogReg,0.809046,0.739327,0.626736,


In [15]:
# Show the names of all parameters exposed by the lgr1 pipeline
# (parameters of nested steps appear in the `step__param` form).
pd.DataFrame.from_dict(lgr1.get_params(), orient='index').index

Index(['memory', 'steps', 'verbose', 'ct', 'lg', 'ct__n_jobs', 'ct__remainder',
       'ct__sparse_threshold', 'ct__transformer_weights', 'ct__transformers',
       'ct__verbose', 'ct__subpipe_num', 'ct__subpipe_cat',
       'ct__subpipe_num__memory', 'ct__subpipe_num__steps',
       'ct__subpipe_num__verbose', 'ct__subpipe_num__num_impute',
       'ct__subpipe_num__ss', 'ct__subpipe_num__num_impute__add_indicator',
       'ct__subpipe_num__num_impute__copy',
       'ct__subpipe_num__num_impute__fill_value',
       'ct__subpipe_num__num_impute__missing_values',
       'ct__subpipe_num__num_impute__strategy',
       'ct__subpipe_num__num_impute__verbose', 'ct__subpipe_num__ss__copy',
       'ct__subpipe_num__ss__with_mean', 'ct__subpipe_num__ss__with_std',
       'ct__subpipe_cat__memory', 'ct__subpipe_cat__steps',
       'ct__subpipe_cat__verbose', 'ct__subpipe_cat__cat_impute',
       'ct__subpipe_cat__ohe', 'ct__subpipe_cat__cat_impute__add_indicator',
       'ct__subpipe_cat__cat_im

In [16]:
# Search grid for the 'lg' (LogisticRegression) step of the lgr pipelines.
# The class_weight options vary only the weight of 'functional needs repair'.
params = {
    'lg__solver' : ['lbfgs','newton-cg', 'saga'],
    'lg__max_iter': [750, 1000, 2000],
    'lg__C' : [0.1, 0.5, 1, 5, 10],
    'lg__tol' : [0.05, 0.01, 0.1],
    'lg__class_weight' : [{'functional': 1, 'non functional': 1, 'functional needs repair': 5},
                          {'functional': 1, 'non functional': 1, 'functional needs repair': 2},
                          {'functional': 1, 'non functional': 1, 'functional needs repair': 1},
                          {'functional': 1, 'non functional': 1, 'functional needs repair': .8}]
}
# 3 solvers x 3 max_iter x 5 C x 3 tol x 4 class_weight = 540 candidates x 5 folds = 2700 fits
# NOTE(review): the recorded grid-search output reports 324 candidates (1620 fits),
# so it was produced with a different version of this grid; re-run to refresh it.

In [22]:
# 5-fold exhaustive search over `params` for the dataset-1 pipeline, using all cores.
gs1 = GridSearchCV(lgr1, params, cv=5, n_jobs=-1, verbose=2)

In [23]:
# 5-fold exhaustive search over `params` for the dataset-2 pipeline, using all cores.
gs2 = GridSearchCV(lgr2, params, cv=5, n_jobs=-1, verbose=2)

In [24]:
%%time
# Long-running: the recorded run took 54min 52s wall time with 12 workers.
gs1.fit(X=model1.xtrain, y=model1.ytrain)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 47.1min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed: 54.5min finished


Wall time: 54min 52s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(add_indicator=True)),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001E7D7A237C0>),
                                                                        ('subpipe_cat',
       

In [25]:
%%time
# Long-running: the recorded run took 29min 44s wall time with 12 workers.
gs2.fit(X=model2.xtrain, y=model2.ytrain)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 25.6min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed: 29.5min finished


Wall time: 29min 44s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(add_indicator=True)),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001E7D7A237C0>),
                                                                        ('subpipe_cat',
       

In [67]:
# Winning hyper-parameters and the corresponding estimator from the dataset-1 search.
gs1bestparam = gs1.best_params_
gs1best = gs1.best_estimator_
gs1bestparam

{'lg__C': 0.5,
 'lg__class_weight': {'functional': 1,
  'non functional': 1,
  'functional needs repair': 1},
 'lg__max_iter': 750,
 'lg__solver': 'lbfgs',
 'lg__tol': 0.05}

In [66]:
# Winning hyper-parameters and the corresponding estimator from the dataset-2 search.
gs2bestparam = gs2.best_params_
gs2best = gs2.best_estimator_
gs2bestparam

{'lg__C': 0.5,
 'lg__class_weight': {'functional': 1,
  'non functional': 1,
  'functional needs repair': 0.8},
 'lg__max_iter': 750,
 'lg__solver': 'lbfgs',
 'lg__tol': 0.05}

In [27]:
%%time
# Record the tuned estimator without refitting it (recorded wall time: 1min 39s)
model1.update(estimator=gs1best, name='Best LogReg', fit=False, params=gs1bestparam)

Wall time: 1min 39s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.447071,0.448283,19.1634,
1,LogReg,0.857508,0.77697,0.569989,
2,Best LogReg,0.880022,0.782828,0.547607,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."


In [28]:
%%time
# Record the tuned estimator without refitting it (recorded wall time: 59.6 s)
model2.update(estimator=gs2best, name='Best LogReg', fit=False, params=gs2bestparam)

Wall time: 59.6 s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.448238,0.452795,19.103703,
1,LogReg,0.809046,0.739327,0.626736,
2,Best LogReg,0.815309,0.746599,0.614127,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."


In [29]:
# SMOTE + logistic regression for dataset 1, reusing the hyper-parameters found by gs1.
# clone(ct) gives this pipeline its own unfitted copy of the preprocessing, so
# fitting another pipeline built from `ct` cannot silently refit this one's transformer.
imbpipe1 = ImPipeline([
    ('ct', clone(ct)),
    ('sm', SMOTE(random_state = 42, n_jobs =-1)),
    ('gs_best', LogisticRegression(random_state = 42, 
                                   C = gs1bestparam['lg__C'], 
                                   max_iter = gs1bestparam['lg__max_iter'], 
                                   solver = gs1bestparam['lg__solver'],
                                   tol = gs1bestparam['lg__tol'],
                                   class_weight = gs1bestparam['lg__class_weight'],
                                   n_jobs=-1))
])

In [30]:
# SMOTE + logistic regression for dataset 2, reusing the hyper-parameters found by gs2.
# clone(ct) gives this pipeline its own unfitted copy of the preprocessing, so
# fitting another pipeline built from `ct` cannot silently refit this one's transformer.
imbpipe2 = ImPipeline([
    ('ct', clone(ct)),
    ('sm', SMOTE(random_state = 42, n_jobs =-1)),
    ('gs_best', LogisticRegression(random_state = 42, 
                                   C = gs2bestparam['lg__C'], 
                                   max_iter = gs2bestparam['lg__max_iter'], 
                                   solver = gs2bestparam['lg__solver'],
                                   tol = gs2bestparam['lg__tol'],
                                   class_weight = gs2bestparam['lg__class_weight'],
                                   n_jobs=-1))
])

In [31]:
%%time
# Fit and record SMOTE + tuned LogReg on dataset 1 (recorded wall time: 4min 5s)
model1.update(estimator=imbpipe1, name='Smote with Best LogReg', fit=True, params=gs1bestparam)

Wall time: 4min 5s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.447071,0.448283,19.1634,
1,LogReg,0.857508,0.77697,0.569989,
2,Best LogReg,0.880022,0.782828,0.547607,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
3,Smote with Best LogReg,0.89064,0.754411,0.596532,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."


In [32]:
%%time
# Fit and record SMOTE + tuned LogReg on dataset 2 (recorded wall time: 2min 55s)
model2.update(estimator=imbpipe2, name='Smote with Best LogReg', fit=True, params=gs2bestparam)

Wall time: 2min 55s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.448238,0.452795,19.103703,
1,LogReg,0.809046,0.739327,0.626736,
2,Best LogReg,0.815309,0.746599,0.614127,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
3,Smote with Best LogReg,0.80963,0.71064,0.675548,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."


In [33]:
# Show the names of all parameters exposed by the imbpipe1 pipeline
# (parameters of nested steps appear in the `step__param` form).
pd.DataFrame.from_dict(imbpipe1.get_params(), orient='index').index

Index(['memory', 'steps', 'verbose', 'ct', 'sm', 'gs_best', 'ct__n_jobs',
       'ct__remainder', 'ct__sparse_threshold', 'ct__transformer_weights',
       'ct__transformers', 'ct__verbose', 'ct__subpipe_num', 'ct__subpipe_cat',
       'ct__subpipe_num__memory', 'ct__subpipe_num__steps',
       'ct__subpipe_num__verbose', 'ct__subpipe_num__num_impute',
       'ct__subpipe_num__ss', 'ct__subpipe_num__num_impute__add_indicator',
       'ct__subpipe_num__num_impute__copy',
       'ct__subpipe_num__num_impute__fill_value',
       'ct__subpipe_num__num_impute__missing_values',
       'ct__subpipe_num__num_impute__strategy',
       'ct__subpipe_num__num_impute__verbose', 'ct__subpipe_num__ss__copy',
       'ct__subpipe_num__ss__with_mean', 'ct__subpipe_num__ss__with_std',
       'ct__subpipe_cat__memory', 'ct__subpipe_cat__steps',
       'ct__subpipe_cat__verbose', 'ct__subpipe_cat__cat_impute',
       'ct__subpipe_cat__ohe', 'ct__subpipe_cat__cat_impute__add_indicator',
       'ct__subpipe_

In [34]:
# Search grid for the 'sm' (SMOTE) step of the imbalanced pipelines.
# NOTE: this rebinds `params`, replacing the logistic-regression grid defined earlier;
# re-run that cell before rebuilding gs1/gs2.
params = {
    'sm__sampling_strategy' : ['minority', 'not majority', 'all'],
    'sm__k_neighbors': [5,10,15]
}
# 3 sampling strategies x 3 k_neighbors = 9 candidates x 5 folds = 45 fits
# NOTE(review): the recorded search output reports 6 candidates (30 fits),
# so it was produced with a different version of this grid; re-run to refresh it.

In [35]:
# 5-fold search over the SMOTE settings for the dataset-1 pipeline, using all cores.
smote_gs1 = GridSearchCV(imbpipe1, params, cv=5, n_jobs=-1, verbose=2)

In [36]:
# 5-fold search over the SMOTE settings for the dataset-2 pipeline, using all cores.
smote_gs2 = GridSearchCV(imbpipe2, params, cv=5, n_jobs=-1, verbose=2)

In [37]:
%%time
# The recorded run took 4min 4s wall time with 12 workers.
smote_gs1.fit(X=model1.xtrain, y=model1.ytrain)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  3.0min remaining:   55.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.5min finished


Wall time: 4min 4s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(add_indicator=True)),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001E7D7A237C0>),
                                                                        ('subpipe_cat',
       

In [38]:
%%time
# The recorded run took 2min 27s wall time with 12 workers.
smote_gs2.fit(X=model2.xtrain, y=model2.ytrain)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:  1.8min remaining:   33.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.1min finished


Wall time: 2min 27s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(n_jobs=-1,
                                                          remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(add_indicator=True)),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001E7D7A237C0>),
                                                                        ('subpipe_cat',
       

In [39]:
# Best SMOTE pipeline and its parameters for dataset 1.
smote_gs1best = smote_gs1.best_estimator_
smote_gs1bestparam = smote_gs1.best_params_

# Best SMOTE pipeline and its parameters for dataset 2.
smote_gs2best = smote_gs2.best_estimator_
smote_gs2bestparam = smote_gs2.best_params_

In [40]:
%%time
# Record the tuned SMOTE pipeline without refitting it (recorded wall time: 2min 29s)
model1.update(estimator=smote_gs1best, name='Best Smote + LogReg', fit=False, params=smote_gs1bestparam)

Wall time: 2min 29s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.447071,0.448283,19.1634,
1,LogReg,0.857508,0.77697,0.569989,
2,Best LogReg,0.880022,0.782828,0.547607,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
3,Smote with Best LogReg,0.89064,0.754411,0.596532,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
4,Best Smote + LogReg,0.880337,0.753535,0.597584,"{'sm__k_neighbors': 5, 'sm__sampling_strategy'..."


In [41]:
%%time
# Record the tuned SMOTE pipeline without refitting it (recorded wall time: 1min 34s)
model2.update(estimator=smote_gs2best, name='Best Smote + LogReg', fit=False, params=smote_gs2bestparam)

Wall time: 1min 34s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.448238,0.452795,19.103703,
1,LogReg,0.809046,0.739327,0.626736,
2,Best LogReg,0.815309,0.746599,0.614127,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
3,Smote with Best LogReg,0.80963,0.71064,0.675548,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
4,Best Smote + LogReg,0.804422,0.714613,0.675617,"{'sm__k_neighbors': 10, 'sm__sampling_strategy..."


In [42]:
# Print the held-out classification report of every model recorded for dataset 1.
model1.class_report()

Classification Report for 'Dummy':
                
            
                         precision    recall  f1-score   support

             functional       0.55      0.55      0.55      8098
functional needs repair       0.07      0.07      0.07      1074
         non functional       0.38      0.38      0.38      5678

               accuracy                           0.45     14850
              macro avg       0.33      0.33      0.33     14850
           weighted avg       0.45      0.45      0.45     14850

Classification Report for 'LogReg':
                
            
                         precision    recall  f1-score   support

             functional       0.78      0.87      0.82      8098
functional needs repair       0.53      0.28      0.37      1074
         non functional       0.79      0.74      0.77      5678

               accuracy                           0.78     14850
              macro avg       0.70      0.63      0.65     14850
           weighted

In [43]:
# Print mean ± std of the cross-validated score of every model recorded for dataset 1.
model1.cv_summary()

Classification Report for 'Dummy':
                0.44873 ± 0.00399 accuracy
            
Classification Report for 'LogReg':
                0.77241 ± 0.00375 accuracy
            
Classification Report for 'Best LogReg':
                0.77829 ± 0.00321 accuracy
            
Classification Report for 'Smote with Best LogReg':
                0.75010 ± 0.00413 accuracy
            
Classification Report for 'Best Smote + LogReg':
                0.75154 ± 0.00472 accuracy
            


In [98]:
# Summary table (train/test score, CV log loss, params) for dataset 1.
model1.df

Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.447071,0.448283,19.1634,
1,LogReg,0.857508,0.77697,0.569989,
2,Best LogReg,0.880022,0.782828,0.547607,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
3,Smote with Best LogReg,0.89064,0.754411,0.596532,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
4,Best Smote + LogReg,0.880337,0.753535,0.597584,"{'sm__k_neighbors': 5, 'sm__sampling_strategy'..."


In [44]:
# Print the held-out classification report of every model recorded for dataset 2.
model2.class_report()

Classification Report for 'Dummy':
                
            
                         precision    recall  f1-score   support

             functional       0.54      0.54      0.54      8098
functional needs repair       0.06      0.07      0.07      1074
         non functional       0.38      0.38      0.38      5678

               accuracy                           0.45     14850
              macro avg       0.33      0.33      0.33     14850
           weighted avg       0.45      0.45      0.45     14850

Classification Report for 'LogReg':
                
            
                         precision    recall  f1-score   support

             functional       0.75      0.84      0.79      8098
functional needs repair       0.51      0.25      0.34      1074
         non functional       0.74      0.69      0.72      5678

               accuracy                           0.74     14850
              macro avg       0.67      0.59      0.61     14850
           weighted

In [45]:
# Print mean ± std of the cross-validated score of every model recorded for dataset 2.
model2.cv_summary()

Classification Report for 'Dummy':
                0.45138 ± 0.00307 accuracy
            
Classification Report for 'LogReg':
                0.73701 ± 0.00461 accuracy
            
Classification Report for 'Best LogReg':
                0.74018 ± 0.00319 accuracy
            
Classification Report for 'Smote with Best LogReg':
                0.70617 ± 0.00427 accuracy
            
Classification Report for 'Best Smote + LogReg':
                0.70700 ± 0.00468 accuracy
            


In [97]:
# Summary table (train/test score, CV log loss, params) for dataset 2.
model2.df

Unnamed: 0,Model,train_score,test_score,log_loss,params
0,Dummy,0.448238,0.452795,19.103703,
1,LogReg,0.809046,0.739327,0.626736,
2,Best LogReg,0.815309,0.746599,0.614127,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
3,Smote with Best LogReg,0.80963,0.71064,0.675548,"{'lg__C': 0.5, 'lg__class_weight': {'functiona..."
4,Best Smote + LogReg,0.804422,0.714613,0.675617,"{'sm__k_neighbors': 10, 'sm__sampling_strategy..."


In [141]:
# Alphabetical list of the scorer names usable as a `scoring=` string.
sorted(sklearn.metrics.SCORERS)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [168]:
def roc_curve_ovr(estimator, xtrain, ytrain):
    """Plot one-vs-rest ROC curves from cross-validated predicted probabilities.

    Parameters
    ----------
    estimator : classifier exposing ``predict_proba``
    xtrain : features passed to ``cross_val_predict``
    ytrain : class labels aligned with ``xtrain``

    One curve is drawn per distinct label in ``ytrain``, with its AUC in the
    legend. The figure is shown with ``plt.show()``; nothing is returned.
    """
    y_pred_prob = cross_val_predict(estimator, xtrain, ytrain, method='predict_proba')

    # Binarise the labels that were actually passed in. The previous version used
    # the global model1.ytrain, which was wrong for any other dataset.
    y_true = np.asarray(ytrain)

    # np.unique returns the labels sorted; this assumes the predict_proba columns
    # follow the same sorted label order (verify for non-sklearn estimators).
    for idx, label in enumerate(np.unique(y_true)):

        y_binary = (y_true == label).astype(int)
        fpr, tpr, _ = roc_curve(y_binary, y_pred_prob[:, idx])

        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Diagonal line for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('ROC Curves for Each Class')
    plt.legend(loc='lower right')
    plt.show()

In [171]:
# Persist both summary tables (without the index column) next to the input data.
model1.df.to_csv(path_or_buf='./Data/Model1.csv', index=False)
model2.df.to_csv(path_or_buf='./Data/Model2.csv', index=False)