In [34]:
import numpy as np
import pandas as pd

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay, confusion_matrix, recall_score, \
    accuracy_score, precision_score, f1_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

import json

In [2]:
df = pd.read_csv('./Data/TrainCleaned.csv')

In [3]:
x = df.drop('status_group', axis =1)
y = df.status_group
xtrain, xtest, ytrain, ytest = train_test_split(x,y)

In [4]:
class ModelsList():
    def __init__(self, x,y):
        xtrain, xtest, ytrain, ytest = train_test_split(x,y)
        self.xtrain = xtrain
        self.xtest = xtest
        self.ytrain = ytrain
        self.ytest = ytest
        self.df = pd.DataFrame({'Model' : pd.Series(dtype='str'), 
                    'train_score' : pd.Series(dtype='float64'), 
                    'test_score': pd.Series(dtype='float64'),
                    'log_loss': pd.Series(dtype='float64'),
                    'params':pd.Series(dtype='O')})
    
    def update(self, estimator, name, fit = True, params = 'None'):
        if fit:
            estimator.fit(self.xtrain, self.ytrain)
        
        log_loss = -cross_val_score(estimator, self.xtrain, self.ytrain, scoring= 'neg_log_loss' ).mean()                        
        trainscore = estimator.score(self.xtrain, self.ytrain)
        testscore = estimator.score(self.xtest, self.ytest)
        model_to_add = [name, trainscore, testscore, log_loss, params]
        self.df.loc[len(self.df.index)] = model_to_add
        return self.df
    
    def delete_last(self):
        self.df.drop(self.df.tail(1).index, inplace=True)

In [5]:
models_list = ModelsList(x,y)

In [6]:
subpipenum = Pipeline([
    ('num_impute',SimpleImputer(add_indicator=True)),
    ('ss', StandardScaler())
])

subpipecat = Pipeline([
    ('cat_impute', SimpleImputer(strategy='most_frequent', add_indicator=True)),
    ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
])

In [7]:
ct = ColumnTransformer(transformers = [
    ('subpipe_num',subpipenum, selector(dtype_include=np.number)),
    ('subpipe_cat', subpipecat, selector(dtype_include=object))
], remainder='passthrough',n_jobs = 12)

In [8]:
model = Pipeline([
    ('ct', ct),
    ('fsm', LogisticRegression(random_state=327, n_jobs=12))
])

In [9]:
%%time
#roughly 1m
models_list.update(model, 'baseline')

Wall time: 46.2 s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,baseline,0.825567,0.776229,0.564681,


In [10]:
pd.DataFrame.from_dict(model.get_params(), orient='index').index

Index(['memory', 'steps', 'verbose', 'ct', 'fsm', 'ct__n_jobs',
       'ct__remainder', 'ct__sparse_threshold', 'ct__transformer_weights',
       'ct__transformers', 'ct__verbose', 'ct__subpipe_num', 'ct__subpipe_cat',
       'ct__subpipe_num__memory', 'ct__subpipe_num__steps',
       'ct__subpipe_num__verbose', 'ct__subpipe_num__num_impute',
       'ct__subpipe_num__ss', 'ct__subpipe_num__num_impute__add_indicator',
       'ct__subpipe_num__num_impute__copy',
       'ct__subpipe_num__num_impute__fill_value',
       'ct__subpipe_num__num_impute__missing_values',
       'ct__subpipe_num__num_impute__strategy',
       'ct__subpipe_num__num_impute__verbose', 'ct__subpipe_num__ss__copy',
       'ct__subpipe_num__ss__with_mean', 'ct__subpipe_num__ss__with_std',
       'ct__subpipe_cat__memory', 'ct__subpipe_cat__steps',
       'ct__subpipe_cat__verbose', 'ct__subpipe_cat__cat_impute',
       'ct__subpipe_cat__ohe', 'ct__subpipe_cat__cat_impute__add_indicator',
       'ct__subpipe_cat__cat_i

In [11]:
params = {
    'ct__subpipe_num__num_impute__strategy' : ['mean','median'],
    'fsm__solver' : ['liblinear','lbfgs','newton-cg', 'saga'],
    'fsm__max_iter': [100,1000,10000],
    'fsm__C' : [0.0001, 0.001, 0.01, 0.1, 1],
    'fsm__tol' : [0.0001, 0.001, 0.01]
}

In [12]:
gs = GridSearchCV(
    estimator= model,
    param_grid= params,
    cv =5,
    verbose = 2,
    n_jobs = 12
)

In [13]:
%%time
# ROUGHLY 33 MINS
gs.fit(xtrain,ytrain)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:   14.1s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:  1.6min
[Parallel(n_jobs=12)]: Done 341 tasks      | elapsed:  4.0min
[Parallel(n_jobs=12)]: Done 624 tasks      | elapsed:  7.6min
[Parallel(n_jobs=12)]: Done 989 tasks      | elapsed: 15.7min
[Parallel(n_jobs=12)]: Done 1434 tasks      | elapsed: 21.2min
[Parallel(n_jobs=12)]: Done 1800 out of 1800 | elapsed: 30.0min finished


Wall time: 30min 3s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(n_jobs=12,
                                                          remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(add_indicator=True)),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x00000210066EC700>),
                                                                        ('subpipe_cat',
       

In [14]:
gsbest = gs.best_estimator_

In [15]:
%%time
#roughly 30s
models_list.update(gsbest, 'Baseline Best Grid Search', False, gs.best_params_)



Wall time: 23.2 s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,baseline,0.825567,0.776229,0.564681,
1,Baseline Best Grid Search,0.828732,0.824512,0.547725,{'ct__subpipe_num__num_impute__strategy': 'mea...


In [16]:
# models_list.df

In [17]:
# categorical_indices = [1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22]
# sampling_strategy_dict = {'non functional':18200,
#                           'functional needs repair':18200,
#                           'functional': 24169
#                          }
# sm = SMOTENC(sampling_strategy = sampling_strategy_dict, categorical_features=categorical_indices, random_state = 327)

In [18]:
imbpipe = ImPipeline([
    ('ct', ct),
    ('sm', SMOTE(random_state = 327)),
    ('gs_best', LogisticRegression(random_state = 327, C=1,max_iter=1000, solver='liblinear',tol=.0001))
])

In [19]:
%%time
#roughly 1m
models_list.update(imbpipe, 'Baseline Best Grid Search + Smote')

Wall time: 1min 26s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,baseline,0.825567,0.776229,0.564681,
1,Baseline Best Grid Search,0.828732,0.824512,0.547725,{'ct__subpipe_num__num_impute__strategy': 'mea...
2,Baseline Best Grid Search + Smote,0.834074,0.734141,0.621575,


In [20]:
imbpipe2 = ImPipeline([
    ('ct', ct),
    ('sm', SMOTE(random_state = 327)),
    ('baseline', LogisticRegression(random_state = 327))
])

In [21]:
%%time
#roughly 20s
imbpipe2.fit(xtrain,ytrain)

Wall time: 17 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('ct',
                 ColumnTransformer(n_jobs=12, remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer(add_indicator=True)),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000210066EC700>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='most_frequent')),


In [22]:
params = {
    'ct__subpipe_num__num_impute__strategy' : ['mean','median'],
    'baseline__solver' : ['liblinear','lbfgs','newton-cg', 'saga'],
    'baseline__max_iter': [100,1000,10000],
    'baseline__C' : [0.0001, 0.001, 0.01, 0.1, 1],
    'baseline__tol' : [0.0001, 0.001, 0.01]
}
gs2 = GridSearchCV(
    estimator= imbpipe2,
    param_grid= params,
    cv =5,
    verbose = 2,
    n_jobs = 12
)

In [23]:
%%time
#roughly 30m
gs2.fit(xtrain,ytrain)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:   45.9s
[Parallel(n_jobs=12)]: Done 138 tasks      | elapsed:  4.3min
[Parallel(n_jobs=12)]: Done 341 tasks      | elapsed: 10.9min
[Parallel(n_jobs=12)]: Done 624 tasks      | elapsed: 19.5min
[Parallel(n_jobs=12)]: Done 989 tasks      | elapsed: 31.7min
[Parallel(n_jobs=12)]: Done 1434 tasks      | elapsed: 52.2min
[Parallel(n_jobs=12)]: Done 1800 out of 1800 | elapsed: 79.0min finished


Wall time: 1h 19min 49s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(n_jobs=12,
                                                          remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer(add_indicator=True)),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x00000210066EC700>),
                                                                        ('subpipe_cat',
       

In [24]:
gs2best = gs2.best_estimator_

In [25]:
models_list.update(gs2best, 'Smote Grid Search', False, gs2.best_params_)

Unnamed: 0,Model,train_score,test_score,log_loss,params
0,baseline,0.825567,0.776229,0.564681,
1,Baseline Best Grid Search,0.828732,0.824512,0.547725,{'ct__subpipe_num__num_impute__strategy': 'mea...
2,Baseline Best Grid Search + Smote,0.834074,0.734141,0.621575,
3,Smote Grid Search,0.826397,0.819461,0.625927,"{'baseline__C': 1, 'baseline__max_iter': 100, ..."


In [27]:
%%time
models_list.update(gs2best, 'Smote Grid Search', False, gs2.best_params_)

Wall time: 2min 19s


Unnamed: 0,Model,train_score,test_score,log_loss,params
0,baseline,0.825567,0.776229,0.564681,
1,Baseline Best Grid Search,0.828732,0.824512,0.547725,{'ct__subpipe_num__num_impute__strategy': 'mea...
2,Baseline Best Grid Search + Smote,0.834074,0.734141,0.621575,
3,Smote Grid Search,0.826397,0.819461,0.625927,"{'baseline__C': 1, 'baseline__max_iter': 100, ..."


In [37]:
pca = Pipeline([
    ('ct', ct),
    ('pca', PCA(random_state = 327)),
    ('classifier', RandomForestClassifier(random_state=327))
])

In [38]:
pca.fit(xtrain,ytrain)

TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.