# Imports and loading in cleaned data

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier,XGBRegressor
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

C:\Users\Kev\anaconda3\envs\learn-env\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\Kev\anaconda3\envs\learn-env\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  from pandas import MultiIndex, Int64Index


In [2]:
# Load the cleaned dataset from a path relative to the notebook's working directory.
# The file carries a saved index column, which pandas surfaces as 'Unnamed: 0'.
df = pd.read_csv('./data/cleaned_data.csv')

In [3]:
# Summarise column dtypes and non-null counts to spot columns that still need imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45113 entries, 0 to 45112
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                45113 non-null  int64  
 1   Month                     45113 non-null  object 
 2   Age                       45113 non-null  float64
 3   Annual_Income             45113 non-null  float64
 4   Monthly_Inhand_Salary     45113 non-null  float64
 5   Num_Bank_Accounts         45113 non-null  int64  
 6   Num_Credit_Card           45113 non-null  int64  
 7   Interest_Rate             45113 non-null  int64  
 8   Num_of_Loan               45113 non-null  float64
 9   Delay_from_due_date       45113 non-null  int64  
 10  Num_of_Delayed_Payment    45113 non-null  float64
 11  Num_Credit_Inquiries      44239 non-null  float64
 12  Credit_Mix                45113 non-null  object 
 13  Outstanding_Debt          45113 non-null  float64
 14  Credit

In [4]:
# Class distribution of the target; the three classes are not balanced.
df['Credit_Score'].value_counts()

Standard    24916
Poor        13407
Good         6790
Name: Credit_Score, dtype: int64

# Train-Test Split

In [5]:
# Columns removed from the feature matrix: the target itself, the leftover
# CSV index column, and the features excluded from this model.
dropped_columns = [
    'Credit_Score',
    'Unnamed: 0',
    'Month',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Monthly_Balance',
    'Interest_Rate',
]
X = df.drop(columns=dropped_columns)
y = df['Credit_Score']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

In [6]:
# Confirm which feature columns remain and which of them still contain nulls.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45113 entries, 0 to 45112
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       45113 non-null  float64
 1   Annual_Income             45113 non-null  float64
 2   Monthly_Inhand_Salary     45113 non-null  float64
 3   Num_Bank_Accounts         45113 non-null  int64  
 4   Num_Credit_Card           45113 non-null  int64  
 5   Num_of_Loan               45113 non-null  float64
 6   Num_Credit_Inquiries      44239 non-null  float64
 7   Credit_Mix                45113 non-null  object 
 8   Outstanding_Debt          45113 non-null  float64
 9   Credit_Utilization_Ratio  45113 non-null  float64
 10  Credit_History_Age        40948 non-null  float64
 11  Payment_of_Min_Amount     45113 non-null  object 
 12  Amount_invested_monthly   45113 non-null  float64
dtypes: float64(9), int64(2), object(2)
memory usage: 4.5+ MB


In [7]:
# Sanity-check the row/column counts of the train and test feature frames.
print(X_train.shape, X_test.shape)

(36090, 13) (9023, 13)


In [8]:
# Preview the first feature rows.
X.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Num_of_Loan,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly
0,23.0,19114.12,1824.843333,3,4,4.0,4.0,other,809.98,26.82262,22.0,No,80.415295
1,23.0,19114.12,1824.843333,3,4,4.0,4.0,Good,809.98,22.537593,22.0,No,178.344067
2,23.0,19114.12,1824.843333,3,4,4.0,4.0,Good,809.98,23.933795,,No,24.785217
3,28.0,34847.84,3037.986667,2,4,1.0,2.0,Good,605.03,24.464031,26.0,No,104.291825
4,28.0,34847.84,3037.986667,2,4,1.0,2.0,Good,605.03,38.550848,26.0,No,40.391238


# Preprocessing Pipeline

In [9]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('num_impute', SimpleImputer(strategy='mean')),
    ('ss', StandardScaler()),
]
subpipe_num = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (dense output; categories unseen at fit time are ignored).
categorical_steps = [
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
]
subpipe_cat = Pipeline(steps=categorical_steps)

In [10]:
# Route columns to the matching sub-pipeline by dtype; any column matched by
# neither selector is passed through unchanged.
numeric_columns = selector(dtype_include=np.number)
categorical_columns = selector(dtype_include=object)

CT = ColumnTransformer(
    transformers=[
        ('subpipe_num', subpipe_num, numeric_columns),
        ('subpipe_cat', subpipe_cat, categorical_columns),
    ],
    remainder='passthrough',
)

# Baseline Dummy Model

In [11]:
# Baseline: preprocess with CT, then always predict the most frequent class.
baseline_classifier = DummyClassifier(strategy='most_frequent')
dummy_model_pipe = Pipeline(steps=[('ct', CT), ('dum', baseline_classifier)])

In [12]:
# Fit the baseline and report its accuracy on the training split
# (fit returns the pipeline itself, so the calls can be chained).
dummy_model_pipe.fit(X_train, y_train).score(X_train, y_train)

0.5523690773067331

# Final Chosen Model: XGBoost

In [11]:
# Hyperparameters of the final XGBoost classifier, kept together for readability.
xgb_params = {
    'max_depth': 6,
    'min_child_weight': .1,
    'reg_alpha': 1,
    'n_estimators': 300,
    'learning_rate': .095,
    'max_delta_step': .065,
}

# Full pipeline: shared preprocessing followed by the tuned classifier.
xgb_final_model = Pipeline(steps=[
    ('CT', CT),
    ('xgb', XGBClassifier(**xgb_params)),
])

In [12]:
# Fit preprocessing and classifier together on the training split.
xgb_final_model.fit(X_train,y_train)

  mode = stats.mode(array)
  mode = stats.mode(array)




Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001B8C447A7F0>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                        

In [13]:
# Accuracy on the training split.
xgb_final_model.score(X_train,y_train)

0.7507896924355777

In [14]:
# Accuracy on the held-out test split.
xgb_final_model.score(X_test,y_test)

0.73290479884739

In [35]:
# Class predictions for the test split, reused by the metric cells that follow.
y_pred = xgb_final_model.predict(X_test)

In [36]:
# Overall accuracy first, then the support-weighted precision / recall / F1.
print("Accuracy Score : ", accuracy_score(y_test, y_pred))

weighted_metrics = [
    ("Precision Score : ", precision_score),
    ("Recall Score : ", recall_score),
    ("f1 Score : ", f1_score),
]
for label, metric in weighted_metrics:
    print(label, metric(y_test, y_pred, average='weighted'))

Accuracy Score :  0.73290479884739
Precision Score :  0.734212561278144
Recall Score :  0.73290479884739
f1 Score :  0.7331002644866886


In [37]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        Good       0.58      0.62      0.60      1323
        Poor       0.74      0.69      0.71      2719
    Standard       0.77      0.79      0.78      4981

    accuracy                           0.73      9023
   macro avg       0.70      0.70      0.70      9023
weighted avg       0.73      0.73      0.73      9023



In [38]:
import pickle

# Persist the fitted pipeline to disk (presumably for a Streamlit app, per the
# file name). The context manager flushes and closes the file even if pickling
# raises; a bare open() would leave the handle to the garbage collector, so the
# file could be incomplete when the next cell reads it back.
with open("streamlit_model.sav", 'wb') as model_file:
    pickle.dump(xgb_final_model, model_file)

In [39]:
# Reload the pickled pipeline to verify the save/load round trip.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open("streamlit_model.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [40]:
# Test accuracy of the reloaded model; it should match the in-memory pipeline's test score.
loaded_model.score(X_test, y_test)

0.73290479884739

In [41]:
# Preview the first test rows to pick a sample for a single-row prediction.
X_test.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Num_of_Loan,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly
40190,53.0,26420.21,2462.684167,8,3,1.0,1.0,other,289.24,30.621614,33.0,No,32.294931
5245,33.0,18827.29,1484.940833,9,6,3.0,10.0,Bad,1370.06,27.681395,,Yes,41.890857
38919,27.0,41848.26,3578.355,8,7,1.0,2.0,Standard,616.98,23.211877,32.0,No,154.123782
3401,36.0,58537.38,4688.115,7,5,4.0,,other,259.63,24.096692,23.0,No,93.471162
20178,37.0,59665.2,4870.1,6,6,4.0,10.0,Bad,2288.25,31.330589,9.0,Yes,58.20179


In [42]:
# Second test row by position, kept as a one-row DataFrame.
X_test.iloc[1:2]

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Num_of_Loan,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Amount_invested_monthly
5245,33.0,18827.29,1484.940833,9,6,3.0,10.0,Bad,1370.06,27.681395,,Yes,41.890857


In [43]:
# Feature names, in the order they appear in the test frame.
X_test.columns

Index(['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Num_of_Loan', 'Num_Credit_Inquiries', 'Credit_Mix',
       'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Amount_invested_monthly'],
      dtype='object')

In [44]:
# True label of the second test row (same position as the feature row inspected earlier).
y_test.iloc[1:2]

5245    Poor
Name: Credit_Score, dtype: object

In [45]:
# Predict the second test row with the reloaded model, to compare against its true label.
loaded_model.predict(X_test.iloc[1:2])

array(['Poor'], dtype=object)

In [46]:
# Feature names, in the same order as the test frame's columns.
used_columns = [
    'Age',
    'Annual_Income',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Num_of_Loan',
    'Num_Credit_Inquiries',
    'Credit_Mix',
    'Outstanding_Debt',
    'Credit_Utilization_Ratio',
    'Credit_History_Age',
    'Payment_of_Min_Amount',
    'Amount_invested_monthly',
]

In [47]:
# Hand-written sample; values are positional and line up with used_columns.
example_row = [
    33,          # Age
    40000,       # Annual_Income
    2000,        # Monthly_Inhand_Salary
    6,           # Num_Bank_Accounts
    8,           # Num_Credit_Card
    4,           # Num_of_Loan
    9,           # Num_Credit_Inquiries
    "Standard",  # Credit_Mix
    3500,        # Outstanding_Debt
    29,          # Credit_Utilization_Ratio
    7,           # Credit_History_Age
    "Yes",       # Payment_of_Min_Amount
    100,         # Amount_invested_monthly
]

In [48]:
# Pair each feature name with its sample value, then wrap the mapping in a
# one-row DataFrame (index 0) so it has the column layout the pipeline expects.
sample_by_column = dict(zip(used_columns, example_row))
new_test_sample = pd.DataFrame(sample_by_column, index=[0])

In [49]:
# Predicted class label for the hand-built sample.
loaded_model.predict(new_test_sample)

array(['Poor'], dtype=object)

In [50]:
# Class probabilities for the same sample.
# NOTE(review): column order presumably follows the fitted classifier's classes_ — confirm.
loaded_model.predict_proba(new_test_sample)

array([[0.03505686, 0.72929126, 0.23565188]], dtype=float32)