# Model Pipelines
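- In this notebook, we create a custom class (`LogRegModel`) that runs grid searches over a logistic-regression pipeline and collects the results of every search in one shared table.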

In [4]:
# import dependencies

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load in data:

In [5]:
# load the cleaned feature table produced by cleaning.ipynb, then discard the 'id' column
df = pd.read_csv('data/features_cleaned.csv', index_col='Unnamed: 0')
df = df.drop(columns='id')

In [6]:
# separate the predictors from the target column
X = df.drop(columns='status_group')
y = df['status_group']

# integer-encode the target; the notebook treats this as a 3-label multiclass problem (labels 0, 1, 2)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [8]:
# hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=0,
)

# Create sub-pipelines:

In [11]:
# partition the feature columns by dtype: numeric columns vs. everything else
num_cols = [col for col in X.columns if np.issubdtype(X[col].dtype, np.number)]
cat_cols = [col for col in X.columns if not np.issubdtype(X[col].dtype, np.number)]

In [12]:
# numeric columns: mean-impute (adding missing-value indicator columns), then standardise
num_pipe = Pipeline([
    ('num_impute', SimpleImputer(add_indicator=True, strategy='mean')),
    ('ss', StandardScaler()),
])

# categorical columns: most-frequent-impute (adding missing-value indicator columns), then one-hot encode;
# categories not seen during fit are ignored at transform time
cat_pipe = Pipeline([
    ('cat_impute', SimpleImputer(add_indicator=True, strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=True, handle_unknown='ignore')),
])

In [13]:
# route each column group through its sub-pipeline; columns in neither list are passed through unchanged
ct = ColumnTransformer(
    [
        ('nums', num_pipe, num_cols),
        ('cats', cat_pipe, cat_cols),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

# Custom pipeline class:

In [9]:
class LogRegModel():

    '''
    Wraps an estimator (an imblearn.Pipeline in this notebook) together with its training
    data, and records the results of every grid search in a table shared by all instances.

    Attributes:
    - model: the pipeline model from input. Subpipes and ColumnTransformer are pre-specified.
    - name: name of the model; written to the 'name' column of the stored results
    - x: training features handed to GridSearchCV.fit
    - y: training targets handed to GridSearchCV.fit
    '''

    # master_df that holds results of all models. a class variable shared by all instances
    master_df = pd.DataFrame()

    # cv_results_ columns that are never kept in the stored results
    _DROPPED_COLS = ('mean_fit_time', 'std_fit_time', 'mean_score_time',
                     'std_score_time', 'params', 'rank_test_score')

    def __init__(self, model, name, x_train, y_train):
        self.model = model
        self.name = name
        self.x = x_train
        self.y = y_train

    def grid_search(self, params):

        '''
        Performs a grid search given the parameters and appends the per-candidate
        results (tagged with this model's name) to LogRegModel.master_df.

        Parameters:
        - params: parameters to be searched in GridSearchCV

        Returns:
        - gs: the fitted GridSearchCV object
        '''

        # gs holds the GridSearchCV object
        gs = GridSearchCV(estimator=self.model, param_grid=params, verbose=2, n_jobs=-1)

        # perform grid search
        gs.fit(self.x, self.y)

        # put results and params into a result_df
        result_df = pd.DataFrame.from_dict(gs.cv_results_)

        # drop timing/bookkeeping columns plus every per-fold test score.
        # the fold columns are matched by pattern rather than listed as split0..split4,
        # so the method does not raise KeyError when the search uses a fold count other than 5
        unwanted = [col for col in result_df.columns
                    if col in self._DROPPED_COLS
                    or (col.startswith('split') and col.endswith('_test_score'))]
        result_df = result_df.drop(columns=unwanted)

        # add model name
        result_df['name'] = self.name

        # add result df to master_df (row indices restart for each search and are kept as-is)
        LogRegModel.master_df = pd.concat([LogRegModel.master_df, result_df])

        return gs

    # this method operates at the class level rather than the instance level
    @classmethod
    def get_master_df(cls):
        '''Returns the results table shared by all instances.'''
        return cls.master_df

# Grid search 1:

In [10]:
# baseline: the shared ColumnTransformer feeding a LogisticRegression (only n_jobs is set)
logreg_pipe = Pipeline(steps=[
    ('ct', ct),
    ('logreg', LogisticRegression(n_jobs=-1)),
])

logreg_pipe.fit(X_train, y_train)

In [28]:
# this grid search took several hours to run
# it looks at many hyperparameters using the 21-feature dataset with ternary classification
# massive grid search (no SMOTE)

# params = {
#     'logreg__class_weight': [None, 'balanced', {0: 1, 1: 2, 2: 1}, {0: 1, 1: 3, 2: 1}, {0: 1, 1: 4, 2: 1}, {0: 1, 1: 5, 2: 1}, 
#                              {0: 1, 1: 10, 2: 1}], # trying many different class_weight values for class 1
#     'logreg__penalty': [None, 'l2', 'l1', 'elasticnet'],
#     'logreg__solver' : ['liblinear','lbfgs', 'newton', 'newton-cholesky', 'sag', 'saga'],
#     'logreg__max_iter': [100, 1000, 10000],
#     'logreg__C' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
#     'logreg__tol' : [0.0001, 0.001, 0.01, 0.1, 1]
# }

# 22680 different models
# model_1 = LogRegModel(logreg_pipe, 'no_smote_logreg', X_train, y_train)
# gs_1 = model_1.grid_search(params=params)

Fitting 5 folds for each of 22680 candidates, totalling 113400 fits


66083 fits failed out of a total of 113400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4725 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\heefj\anaconda3\envs\learn-env\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\heefj\anaconda3\envs\learn-env\lib\site-packages\imblearn\pipeline.py", line 297, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "C:\Users\heefj\anaconda3\envs\learn-env\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\heefj\anaconda3\envs\learn-env\lib\site-packages\sk



### Best params from grid search 1:
- 'logreg__C': 1,
- 'logreg__class_weight': None
- 'logreg__max_iter': 100
- 'logreg__penalty': 'l2'
- 'logreg__solver': 'lbfgs'
- 'logreg__tol': 0.0001

In [12]:
# rebuild the best configuration from grid search 1 and compare training vs. test accuracy
gs1_best_model = Pipeline(steps=[
    ('ct', ct),
    ('logreg', LogisticRegression(
        penalty='l2',
        C=1,
        class_weight=None,
        solver='lbfgs',
        max_iter=100,
        tol=0.0001,
    )),
])

gs1_best_model.fit(X_train, y_train)

# (train accuracy, test accuracy)
tuple(
    gs1_best_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.7480429292929293, 0.7493265993265993)

- Best accuracy is 74.9%. This is worse than the vanilla Logistic Regression

In [101]:
# store class df in models_df
models_df = LogRegModel.get_master_df()

# drop nulls as some of the params are incompatible.
# .copy() makes the filtered frame independent of master_df, so assigning columns
# to models_df in later cells does not raise SettingWithCopyWarning
models_df = models_df[models_df['mean_test_score'].notna()].copy()

In [55]:
# five candidates with the highest mean cross-validated score
models_df.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,param_logreg__C,param_logreg__class_weight,param_logreg__max_iter,param_logreg__penalty,param_logreg__solver,param_logreg__tol,mean_test_score,std_test_score,name
10118,1,,100,l2,lbfgs,0.1,0.744655,0.005603,no_smote_logreg
10119,1,,100,l2,lbfgs,1.0,0.744655,0.005603,no_smote_logreg
10117,1,,100,l2,lbfgs,0.01,0.744655,0.005603,no_smote_logreg
10116,1,,100,l2,lbfgs,0.001,0.744655,0.005603,no_smote_logreg
10115,1,,100,l2,lbfgs,0.0001,0.744655,0.005603,no_smote_logreg


In [33]:
# Export to save
# models_df.to_csv('models.csv')

## Analyze parameters from grid search 1:
- 21 features
- Ternary classification

### C:

In [73]:
# mean (_x columns) and median (_y columns) of the test-score columns for each C value.
# the score columns are selected explicitly: aggregating the whole frame fails on the
# non-numeric columns (e.g. 'name') in pandas >= 2.0
c_scores = models_df.groupby('param_logreg__C')[['mean_test_score', 'std_test_score']]
c_scores.mean().merge(c_scores.median(), on='param_logreg__C')

Unnamed: 0_level_0,mean_test_score_x,std_test_score_x,mean_test_score_y,std_test_score_y
param_logreg__C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0001,0.616442,0.005616,0.672496,0.003137
0.001,0.670633,0.003925,0.691561,0.003594
0.01,0.686274,0.004025,0.707786,0.003783
0.1,0.689897,0.004474,0.709207,0.004249
1.0,0.689662,0.004621,0.706618,0.004468
10.0,0.689541,0.004598,0.706292,0.004443
100.0,0.689564,0.004636,0.706218,0.004425
1000.0,0.689586,0.004731,0.706176,0.004496
10000.0,0.689511,0.004704,0.706166,0.00447


- For both mean and median, higher C values (corresponding to less regularization) performed better. This makes sense, as we have yet to overfit a model. Once we get to C=0.1, the results begin to plateau, and even slightly drop.
- Going to focus on values around 0.1 in future grid searches.

### Class weight:

In [88]:
# convert class weights from dict -> str (dict values are unhashable and cannot serve as group keys).
# assign() returns a new frame instead of writing into a filtered slice,
# which avoids SettingWithCopyWarning
models_df = models_df.assign(
    param_logreg__class_weight=models_df['param_logreg__class_weight'].astype(str)
)

# mean (_x columns) and median (_y columns) of the test-score columns for each class_weight value.
# the score columns are selected explicitly: aggregating the whole frame fails on the
# non-numeric columns (e.g. 'name') in pandas >= 2.0
class_weight_scores = models_df.groupby('param_logreg__class_weight')[['mean_test_score', 'std_test_score']]
class_weight_scores.mean().merge(class_weight_scores.median(), on='param_logreg__class_weight')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  models_df['param_logreg__class_weight'] = models_df['param_logreg__class_weight'].astype(str)


Unnamed: 0_level_0,mean_test_score_x,std_test_score_x,mean_test_score_y,std_test_score_y
param_logreg__class_weight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
balanced,0.636152,0.00725,0.632744,0.004907
,0.727151,0.0038,0.737963,0.004348
"{0: 1, 1: 10, 2: 1}",0.569922,0.005067,0.577715,0.003642
"{0: 1, 1: 2, 2: 1}",0.724178,0.003833,0.735532,0.004206
"{0: 1, 1: 3, 2: 1}",0.714857,0.003906,0.722706,0.004216
"{0: 1, 1: 4, 2: 1}",0.700438,0.003985,0.703199,0.003844
"{0: 1, 1: 5, 2: 1}",0.680306,0.004306,0.678577,0.00401


- Above, I modified the class weight of the minority class (class 1).
- No class weight (nan), and class 1 weights of '2' and '3' gave the best results.
- As the class weight of class 1 increased beyond '3', performance started to drop. 'balanced' weights also performed poorly.

### Max iterations:

In [75]:
# mean (_x columns) and median (_y columns) of the test-score columns for each max_iter value.
# the score columns are selected explicitly: aggregating the whole frame fails on the
# non-numeric columns (e.g. 'name') in pandas >= 2.0
max_iter_scores = models_df.groupby('param_logreg__max_iter')[['mean_test_score', 'std_test_score']]
max_iter_scores.mean().merge(max_iter_scores.median(), on='param_logreg__max_iter')

Unnamed: 0_level_0,mean_test_score_x,std_test_score_x,mean_test_score_y,std_test_score_y
param_logreg__max_iter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,0.6791,0.004536,0.703283,0.004069
1000,0.678927,0.004621,0.703114,0.004151
10000,0.67899,0.004621,0.702357,0.004079


- Results were pretty comparable for all values of max_iter. It seems that as long as we hit 100 iterations, our model reaps the majority of accuracy. This will be useful knowledge for the next grid search.

### Penalty (regularization):

In [76]:
# mean (_x columns) and median (_y columns) of the test-score columns for each penalty value.
# the key is cast to str first: groupby drops None/NaN keys by default, which would
# silently remove every penalty=None candidate from the table.
# the score columns are selected explicitly: aggregating the whole frame fails on the
# non-numeric columns (e.g. 'name') in pandas >= 2.0
penalty_key = models_df['param_logreg__penalty'].astype(str)
penalty_scores = models_df.groupby(penalty_key)[['mean_test_score', 'std_test_score']]
penalty_scores.mean().merge(penalty_scores.median(), on='param_logreg__penalty')

Unnamed: 0_level_0,mean_test_score_x,std_test_score_x,mean_test_score_y,std_test_score_y
param_logreg__penalty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
l1,0.670043,0.005265,0.704019,0.004198
l2,0.682197,0.004206,0.703367,0.003909


- Only l1 and l2 were used, as 'elasticnet' wasn't passed an l1_ratio argument.
- Results were pretty comparable for both values of penalty. Plan on incorporating both in future grid searches.
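- Rows with penalty==None do not appear in the table above. This is most likely not because their test scores were null (many of the solvers are compatible with no penalty), but because `groupby` drops `None`/NaN keys by default (`dropna=True`); grouping with `dropna=False`, or on the penalty converted to a string, would include them.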

### Solver (optimization algorithm):

In [83]:
# mean (_x columns) and median (_y columns) of the test-score columns for each solver.
# the score columns are selected explicitly: aggregating the whole frame fails on the
# non-numeric columns (e.g. 'name') in pandas >= 2.0
solver_scores = models_df.groupby('param_logreg__solver')[['mean_test_score', 'std_test_score']]
solver_scores.mean().merge(solver_scores.median(), on='param_logreg__solver')

Unnamed: 0_level_0,mean_test_score_x,std_test_score_x,mean_test_score_y,std_test_score_y
param_logreg__solver,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
lbfgs,0.680324,0.004279,0.702041,0.004458
liblinear,0.706486,0.003961,0.721338,0.004414
newton-cholesky,0.679508,0.003656,0.701578,0.003641
sag,0.673725,0.005359,0.700979,0.004293
saga,0.66316,0.005024,0.690993,0.003896


- '__liblinear__' had the best mean and median score by a noticeable margin. I did forget to specify 'multinomial' in the LogisticRegression instantiation. Liblinear uses a binary approach for ternary problems, so the results may not actually be the best.
- In the next series of grid searches, I will specify multi_class='multinomial'.
- The only solver I did not try was 'newton-cg' (the grid listed 'newton', which is not a valid solver name, so it produced no scores). I plan on using it in the next grid search.

### Tolerance:

In [84]:
# mean (_x columns) and median (_y columns) of the test-score columns for each tol value.
# the score columns are selected explicitly: aggregating the whole frame fails on the
# non-numeric columns (e.g. 'name') in pandas >= 2.0
tol_scores = models_df.groupby('param_logreg__tol')[['mean_test_score', 'std_test_score']]
tol_scores.mean().merge(tol_scores.median(), on='param_logreg__tol')

Unnamed: 0_level_0,mean_test_score_x,std_test_score_x,mean_test_score_y,std_test_score_y
param_logreg__tol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0001,0.683159,0.00438,0.703335,0.004333
0.001,0.683246,0.004438,0.704356,0.004364
0.01,0.682155,0.004262,0.704377,0.004067
0.1,0.678672,0.004094,0.701989,0.003835
1.0,0.667795,0.005788,0.69396,0.003883


- Lower values of tolerance seemed to perform better, with 0.001 and 0.01 doing the best.

# Grid search 2:

In [90]:
# LogReg model 2
logreg_pipe2 = Pipeline([
    ('ct', ct),
    ('logreg2', LogisticRegression(n_jobs=-1, multi_class='multinomial'))
])

logreg_pipe2.fit(X_train, y_train)

In [91]:
# massive grid search (no SMOTE)
# trying many different class_weight values for class 1

params = {
    'logreg__class_weight': [None, {0: 1, 1: 1.5, 2: 1}, {0: 1, 1: 2, 2: 1}],
    'logreg__penalty': [None, 'l2'],
    'logreg__solver' : ['lbfgs', 'newton-cg', 'sag', 'saga'], # multi-class solvers
    'logreg__max_iter': [100],
    'logreg__C' : [0.05, 0.1, 0.5],
    'logreg__tol' : [0.001, 0.01]
}

In [92]:
model_2 = LogRegModel(logreg_pipe, 'no_smote_logreg2', X_train, y_train)

In [93]:
# run the search over the 144 parameter combinations
gs_2 = model_2.grid_search(params)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [94]:
# parameter combination selected by grid search 2
gs_2.best_params_

{'logreg__C': 0.5,
 'logreg__class_weight': {0: 1, 1: 1.5, 2: 1},
 'logreg__max_iter': 100,
 'logreg__penalty': 'l2',
 'logreg__solver': 'newton-cg',
 'logreg__tol': 0.01}

### Best params from grid search 2:
- 'logreg__C': 0.5,
- 'logreg__class_weight': {0: 1, 1: 1.5, 2: 1}
- 'logreg__max_iter': 100
- 'logreg__penalty': 'l2'
- 'logreg__solver': 'newton-cg'
- 'logreg__tol': 0.01

In [95]:
# (train accuracy, test accuracy) of the best estimator found by grid search 2
tuple(
    gs_2.best_estimator_.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.748169191919192, 0.747979797979798)

- Best accuracy is right under 75%. These results are actually slightly worse than grid search 1.

In [97]:
# pull the shared results table from the class
models_df = LogRegModel.get_master_df()

# discard candidates without a score (some parameter combinations are incompatible), then report the size
models_df = models_df.dropna(subset=['mean_test_score'])
models_df.shape

(9595, 9)

In [99]:
# write the results table to disk so it can be reloaded later
models_df.to_csv(path_or_buf='models.csv')

### ternary classification:
- "functional" is class 0
- "functional needs repair" is class 1
- "non functional" is class 2

### binary classification:
- "functional" is class 0
- "functional needs repair" and "non functional" are class 1

In [None]:
# the 3-label targets are the label-encoded classes; the name was referenced below
# without ever being defined, which raised NameError
y_3_labels = y_encoded

# combining 'functional needs repair' (1) and 'non functional' (2) to get a 2-label binary target;
# this works on a copy so the 3-label array is left untouched
y_2_labels = y_encoded.copy()
y_2_labels[y_2_labels == 2] = 1

# 3-label targets, 2-label targets
set(y_3_labels), set(y_2_labels)