# Model selection: <font color='#0041C2'>Elastic Net Logistic Regression</font>
---

- 1. Model 1 - No SMOTE + no dropping of columns
- 2. Model 2 - SMOTE + no dropping of columns
- 3. Model 3 - SMOTE + dropped columns

# Setting up the notebook

In [1]:
import pandas as pd
import numpy as np

from category_encoders import TargetEncoder
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score, roc_auc_score, make_scorer

from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer 

In [2]:
df_train = pd.read_csv("../dataset/train.csv")

y_train = df_train[["risk_flag"]]
x_train = df_train.drop("risk_flag", axis=1)

In [3]:
def run_pipeline(pipeline, x_train, y_train):
    stratified_kfold = StratifiedKFold(shuffle=True, n_splits=3, random_state=2021)

    scoring = {"accuracy": "accuracy",
               "recall": 'recall',
               "precision": "precision",
               "fbeta_2": make_scorer(fbeta_score, beta=2),
               "roc_auc": make_scorer(roc_auc_score),
              }

    scores = cross_validate(pipeline, x_train, y_train.values.ravel(), cv=stratified_kfold,
                           scoring = scoring)

    accuracy = [ val for val in scores['test_accuracy'] ]
    recall = [ val for val in scores['test_recall'] ]
    precision = [ val for val in scores['test_precision'] ]
    fbeta_2 = [ val for val in scores['test_fbeta_2'] ]
    auc = [ val for val in scores['test_roc_auc'] ]
    
    accuracy.append( sum(accuracy) / len(accuracy) )
    recall.append( sum(recall) / len(recall) )
    precision.append( sum(precision) / len(precision) )
    fbeta_2.append( sum(fbeta_2) / len(fbeta_2) )
    auc.append( sum(auc) / len(auc) )

    score_df = pd.DataFrame(data=[accuracy, recall, precision, fbeta_2, auc], columns=['Fold 1','Fold 2','Fold 3', 'Average'],
                            index=['Accuracy', 'Recall', 'Precision', 'Fbeta2', 'AUC'])
    display(score_df)

# Model 1 - No SMOTE + no dropping of columns

In [4]:
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)], remainder='passthrough')

pipeline1 = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['classifier', LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5)]]
                   )

run_pipeline(pipeline1, x_train, y_train)

  elif pd.api.types.is_categorical(cols):
  _warn_prf(average, modifier, msg_start, len(result))
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Accuracy,0.877009,0.876935,0.876771,0.876905
Recall,0.0,0.002178,0.003145,0.001774
Precision,0.0,0.45,0.38806,0.279353
Fbeta2,0.0,0.002719,0.003924,0.002214
AUC,0.5,0.500902,0.501225,0.500709


# Model 2 - SMOTE + no dropping of columns

In [5]:
te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)], remainder='passthrough')

pipeline2 = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5)]]
                   )

run_pipeline(pipeline2, x_train, y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Accuracy,0.615729,0.617946,0.618705,0.61746
Recall,0.544707,0.547907,0.53956,0.544058
Precision,0.16949,0.171125,0.16973,0.170115
Fbeta2,0.377545,0.380396,0.375794,0.377912
AUC,0.585198,0.587839,0.584683,0.585906


# Model 3 - SMOTE +  dropped columns

In [6]:
x_train.drop(['current_house_years', 'current_job_years', 'norent_noown', 'owned', 'marital_status'], axis=1, inplace=True)

te_features = ['profession', 'city','state']
scale_features = ['income','age','experience']

preprocessor = ColumnTransformer(transformers=[('te_features', TargetEncoder(), te_features),
                                               ('scale_features', MinMaxScaler(), scale_features)], remainder='passthrough')

pipeline3 = Pipeline(steps = [['preprocessor', preprocessor ],
                             ['smote', SMOTE(random_state=2021)],
                             ['classifier', LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5)]]
                   )

run_pipeline(pipeline3, x_train, y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Unnamed: 0,Fold 1,Fold 2,Fold 3,Average
Accuracy,0.612753,0.615327,0.618021,0.615367
Recall,0.548336,0.544641,0.542826,0.545268
Precision,0.168966,0.169325,0.17011,0.169467
Fbeta2,0.378411,0.377355,0.377433,0.377733
AUC,0.585062,0.584941,0.585697,0.585233


#### All results
|           | Model 1  | Model 2  | Model 3  |
|-----------|----------|----------|----------|
| Accuracy  | 0.876905 | 0.617460 | 0.615367 |
| Recall    | 0.001774 | 0.544058 | 0.545268 |
| Precision | 0.279353 | 0.170115 | 0.169467 |
| Fbeta2    | 0.002214 | 0.377912 | 0.377733 |
| AUC       | 0.500709 | 0.585906 | 0.585233 |