In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir("drive/My Drive/Capstone Data")

In [3]:
import pandas as pd
df=pd.read_csv("MSME Data Modified_v1.10_clean_withoutNA_ImpFeatures10.csv")

In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [None]:
# do not show any warnings
import warnings
warnings.filterwarnings('ignore')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32191 entries, 0 to 32190
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        32191 non-null  int64  
 1   branch_name       32191 non-null  float64
 2   ltv               32191 non-null  float64
 3   disbursed_amount  32191 non-null  int64  
 4   asset_cost        32191 non-null  int64  
 5   district_name     32191 non-null  float64
 6   region_name       32191 non-null  float64
 7   industry_name     32191 non-null  float64
 8   bsr_activity_cd   32191 non-null  int64  
 9   state_name        32191 non-null  float64
 10  bsr_org_cd        32191 non-null  float64
 11  loan_default      32191 non-null  int64  
dtypes: float64(7), int64(5)
memory usage: 2.9 MB


In [6]:
#c = 11
X=df.iloc[:,:11]
#X=df.drop(df.columns[], axis=1)
y=df['loan_default'].astype(int)

In [7]:

# Choose between the two scalers:
# scaler = RobustScaler()
scaler = StandardScaler()  

C_values = [0.001, 0.01, 0.05, 0.1, 1., 100.]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

###########################################
# Version A: Proper scaling with pipeline #
###########################################

param_grid = {'logisticregression__C': C_values}

logReg = LogisticRegression(fit_intercept=True, 
                            penalty='l1', 
                            solver='liblinear', 
                            tol=0.0001, 
                            max_iter=1000, 
                            random_state=0)

# Create a pipeline that scales, then runs logistic regression
pipeline = make_pipeline(scaler, logReg)

vA = GridSearchCV(pipeline, param_grid=param_grid,
                     scoring='roc_auc', cv=10, refit=True)
vA.fit(X_train, y_train)

# Get coefficients
coefA = vA.best_estimator_.named_steps['logisticregression'].coef_

###############################
# Version B: Improper scaling #     
###############################

param_grid = {'C': C_values}

X_train_scaled = scaler.fit_transform(X_train)

vB = GridSearchCV(logReg, param_grid=param_grid,
                     scoring='roc_auc', cv=10, refit=True)
vB.fit(X_train_scaled, y_train)

# Get coefficients
coefB = vB.best_estimator_.coef_


# Compare coefficients
# (Assertion will pass for StandardScaler, but 
# fail for RobustScaler)
assert np.array_equal(coefA, coefB)



In [9]:
# Saving model to disk
import pickle
pickle.dump(vA, open('model_log_reg_hyper_1.pkl','wb'))
pickle.dump(vA, open('model_log_reg_hyper_scaled_2.pkl','wb'))

In [12]:
from sklearn.metrics import classification_report, confusion_matrix

Using model A

In [16]:
pred = vA.predict(X_train)
confusion_matrix(y_train, vA.predict(X_train))

array([[18836,     1],
       [ 5246,    60]])

In [18]:
print(classification_report(y_train, pred))

              precision    recall  f1-score   support

           0       0.78      1.00      0.88     18837
           1       0.98      0.01      0.02      5306

    accuracy                           0.78     24143
   macro avg       0.88      0.51      0.45     24143
weighted avg       0.83      0.78      0.69     24143



In [14]:
pred = vA.predict(X_test)
confusion_matrix(y_test, vA.predict(X_test))

array([[6314,    0],
       [1704,   30]])

In [15]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.88      6314
           1       1.00      0.02      0.03      1734

    accuracy                           0.79      8048
   macro avg       0.89      0.51      0.46      8048
weighted avg       0.83      0.79      0.70      8048



Using Model B

In [20]:
pred = vB.predict(X_train)
confusion_matrix(y_train, vB.predict(X_train))

array([[    0, 18837],
       [    0,  5306]])

In [21]:
print(classification_report(y_train, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     18837
           1       0.22      1.00      0.36      5306

    accuracy                           0.22     24143
   macro avg       0.11      0.50      0.18     24143
weighted avg       0.05      0.22      0.08     24143



  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
pred = vB.predict(X_test)
confusion_matrix(y_test, vB.predict(X_test))

array([[   0, 6314],
       [   0, 1734]])

In [23]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      6314
           1       0.22      1.00      0.35      1734

    accuracy                           0.22      8048
   macro avg       0.11      0.50      0.18      8048
weighted avg       0.05      0.22      0.08      8048



  _warn_prf(average, modifier, msg_start, len(result))
