In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier,XGBRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the cleaned dataset from the project-relative data directory.
df = pd.read_csv('./data/cleaned_data.csv')

In [3]:
# Inspect column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39898 entries, 0 to 39897
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                39898 non-null  int64  
 1   Age                       39898 non-null  float64
 2   Occupation                39898 non-null  object 
 3   Annual_Income             39898 non-null  float64
 4   Monthly_Inhand_Salary     39898 non-null  float64
 5   Num_Bank_Accounts         39898 non-null  int64  
 6   Num_Credit_Card           39898 non-null  int64  
 7   Interest_Rate             39898 non-null  int64  
 8   Num_of_Loan               39898 non-null  float64
 9   Delay_from_due_date       39898 non-null  int64  
 10  Num_of_Delayed_Payment    39898 non-null  float64
 11  Changed_Credit_Limit      39898 non-null  float64
 12  Num_Credit_Inquiries      39898 non-null  float64
 13  Credit_Mix                39898 non-null  object 
 14  Outsta

In [4]:
# Target is the credit-score label; every other column except 'Unnamed: 0'
# is used as a predictor ('Unnamed: 0' looks like a leftover CSV index
# column -- TODO confirm).
y = df['Credit_Score']
X = df.drop(columns=['Credit_Score', 'Unnamed: 0'])

# Hold out 20% of the rows for testing with a fixed seed for repeatability.
# NOTE(review): the split is not stratified although the classes are
# imbalanced -- consider stratify=y if the reported numbers may change.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
# Confirm the label and the 'Unnamed: 0' column are gone from the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39898 entries, 0 to 39897
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       39898 non-null  float64
 1   Occupation                39898 non-null  object 
 2   Annual_Income             39898 non-null  float64
 3   Monthly_Inhand_Salary     39898 non-null  float64
 4   Num_Bank_Accounts         39898 non-null  int64  
 5   Num_Credit_Card           39898 non-null  int64  
 6   Interest_Rate             39898 non-null  int64  
 7   Num_of_Loan               39898 non-null  float64
 8   Delay_from_due_date       39898 non-null  int64  
 9   Num_of_Delayed_Payment    39898 non-null  float64
 10  Changed_Credit_Limit      39898 non-null  float64
 11  Num_Credit_Inquiries      39898 non-null  float64
 12  Credit_Mix                39898 non-null  object 
 13  Outstanding_Debt          39898 non-null  float64
 14  Credit

In [6]:
# Sanity-check the sizes of the train and test splits.
print(X_train.shape, X_test.shape)

(31918, 20) (7980, 20)


In [7]:
# Class distribution of the target (used to judge the majority-class baseline).
y.value_counts()

2    20650
1    13281
3     5967
Name: Credit_Score, dtype: int64

In [8]:
# Numeric branch: standardise every numeric feature.
subpipe_num = Pipeline([('ss', StandardScaler())])

# Categorical branch: dense one-hot encoding; categories that were not seen
# during fit are ignored at transform time instead of raising.
subpipe_cat = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

In [9]:
# Route each column to the matching sub-pipeline by dtype: numeric columns are
# scaled, object columns are one-hot encoded, anything else passes through.
CT = ColumnTransformer(
    transformers=[
        ('subpipe_num', subpipe_num, selector(dtype_include=np.number)),
        ('subpipe_cat', subpipe_cat, selector(dtype_include=object)),
    ],
    remainder='passthrough',
)

In [10]:
#ModelWithCV class: keeps a model together with its cross-validation scores
class ModelWithCV():
    '''Structure to save the model and more easily see its crossvalidation.

    Attributes set by `cross_validate` (all None until it has run):
      cv_results: array of per-fold scores returned by `cross_val_score`
      cv_mean, cv_median, cv_std: summary statistics of `cv_results`
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        '''
        Args:
          model: Estimator (or pipeline) handed to `cross_val_score`.
          model_name: Label used in the printed summary and the plot title.
          X: Default features to cross-validate on.
          y: Default targets to cross-validate on.
          cv_now: When True (default) run `cross_validate` immediately.
        '''
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation, store the results and return them.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)

        Returns:
          The array of per-fold scores (also stored in `self.cv_results`).
        '''
        # Compare against None explicitly: the truth value of a DataFrame,
        # Series or ndarray is ambiguous and `X if X else ...` raises ValueError.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)
        return self.cv_results

    def print_cv_summary(self):
        '''Print mean ± standard deviation of the stored CV scores.

        `cross_validate` must have run first; otherwise the statistics are
        still None and formatting them fails.
        '''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting.

        Args:
          ax: Matplotlib axis to draw on.

        Returns:
          The same axis, for chaining.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the distribution.
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

# Baseline Dummy Model

In [11]:
# Baseline pipeline: shared preprocessing followed by a classifier that
# always predicts the most frequent class.
dummy_model_pipe = Pipeline([
    ('ct', CT),
    ('dum', DummyClassifier(strategy='most_frequent')),
])

In [12]:
# Fit the baseline, then report its accuracy on the training split only
# (no held-out evaluation in this cell).
dummy_model_pipe.fit(X_train, y_train)
dummy_model_pipe.score(X_train,y_train)

0.5171376652672474

# Simple Untuned Models 

In [77]:
# Candidate classifiers compared in the loop below; all but XGBoost use
# library defaults apart from the seed (KNN uses 3 neighbours).
classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    LogisticRegression(random_state=42),
    # NOTE(review): this entry is not "untuned" -- it carries hand-set
    # hyper-parameters, and the recorded output of the loop shows an
    # XGBClassifier with default settings, so that output looks stale.
    # NOTE(review): num_class=9 although Credit_Score has three classes, and
    # both random_state=42 and seed=27 are given -- confirm which is intended.
    # NOTE(review): use_label_encoder=False with labels 1..3 may be rejected by
    # some xgboost versions that expect labels 0..n-1 -- verify.
    XGBClassifier(random_state=42,use_label_encoder=False,learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    seed=27)
    ]

In [79]:
# Fit every candidate behind the shared preprocessing step and report accuracy
# on both splits. The two numbers are labelled explicitly so that train and
# test accuracy cannot be confused when reading the output.
for classifier in classifiers:
    pipeline = Pipeline(steps=[
        ('ct', CT),
        ('clf', classifier),
    ])
    pipeline.fit(X_train, y_train)
    print(classifier)
    print("train score: %.3f" % pipeline.score(X_train, y_train))
    print("test score: %.3f" % pipeline.score(X_test, y_test))

KNeighborsClassifier(n_neighbors=3)
model score: 0.808
model score: 0.622
DecisionTreeClassifier(random_state=42)
model score: 1.000
model score: 0.681
RandomForestClassifier(random_state=42)
model score: 1.000
model score: 0.775
AdaBoostClassifier(random_state=42)
model score: 0.657
model score: 0.656
GradientBoostingClassifier(random_state=42)
model score: 0.725
model score: 0.715
LogisticRegression(random_state=42)
model score: 0.619
model score: 0.624




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)
model score: 0.852
model score: 0.750


# GridSearch LogReg Model

In [116]:
# List the tunable parameter names of LogisticRegression for building the grid.
LogisticRegression().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [117]:
# Logistic-regression pipeline used as the estimator for the grid search.
log_model = Pipeline(steps=[
    ('CT', CT),
    ('log', LogisticRegression(random_state=42)),
])

In [119]:
# Search space for the 'log' step: regularisation strength, solver and
# multi-class scheme, with a raised iteration cap.
parameters = {
    "log__C": [0.1, 0.01, 0.001, 0.0001],
    "log__solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    "log__multi_class": ['auto', 'ovr'],
    "log__max_iter": [1000],
}

In [120]:
# 5-fold grid search over the logistic-regression grid; no `scoring` is given,
# so the estimator's default score is used.
gs_log = GridSearchCV(log_model,parameters, cv=5)

In [121]:
# Run the grid search on the training split.
gs_log.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CB6D0>),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                

In [122]:
# Best parameter combination found by the logistic-regression grid search.
gs_log.best_params_

{'log__C': 0.1,
 'log__max_iter': 1000,
 'log__multi_class': 'auto',
 'log__solver': 'liblinear'}

In [123]:
# Rebuild the logistic-regression pipeline with the values reported by
# gs_log.best_params_ (copied by hand).
gs_log_model = Pipeline(steps=[
    ('CT', CT),
    ('log', LogisticRegression(
        random_state=42,
        C=0.1,
        multi_class='auto',
        solver='liblinear',
        max_iter=1000,
    )),
])

In [124]:
# Fit the tuned logistic-regression pipeline on the training split.
gs_log_model.fit(X_train,y_train)

Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CB6D0>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CBA30>)])),
                ('log',
                 LogisticRegression(C=0.

In [125]:
# Accuracy on the training split (not a held-out estimate).
gs_log_model.score(X_train,y_train)

0.6207155836831881

# GridSearch XGB 

In [126]:
# List the tunable parameter names of XGBClassifier for building the grids.
XGBClassifier().get_params().keys()

dict_keys(['objective', 'use_label_encoder', 'base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'enable_categorical', 'gamma', 'gpu_id', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'monotone_constraints', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'predictor', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])

In [138]:
# Starting-point XGBoost pipeline used as the estimator for the first grid search.
# NOTE(review): num_class=9 although the target shows three classes -- confirm.
xgb_model = Pipeline(steps=[
    ('CT', CT),
    ('xgb', XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softmax',
        nthread=4,
        num_class=9,
        seed=27,
    )),
])

In [145]:
# First XGBoost grid: tree depth in {3, 5, 7, 9} and min_child_weight in
# {1, 3, 5}, i.e. 12 candidate combinations.
param_test1 = {
 'xgb__max_depth':range(3,10,2),
 'xgb__min_child_weight':range(1,6,2)
}

In [146]:
# 5-fold grid search over depth / min_child_weight, scored with micro-averaged
# F1 and run on all available cores with verbose progress logging.
gs_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_test1,
    scoring='f1_micro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)

gs_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done  52 out of  60 | elapsed: 21.1min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 24.7min finished




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CB6D0>),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                

In [147]:
# Best depth / min_child_weight combination from the first XGBoost search.
gs_xgb.best_params_

{'xgb__max_depth': 7, 'xgb__min_child_weight': 1}

In [148]:
# XGBoost pipeline with max_depth raised to 7, as reported by gs_xgb.best_params_
# (min_child_weight stays at 1).
# NOTE(review): num_class=9 although the target shows three classes -- confirm.
xgb_first_model = Pipeline(steps=[
    ('CT', CT),
    ('xgb', XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=7,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softmax',
        nthread=4,
        num_class=9,
        seed=27,
    )),
])

In [149]:
# Fit the first tuned XGBoost pipeline on the training split.
xgb_first_model.fit(X_train,y_train)





Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CB6D0>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  <sklearn.compose._column_transformer...
                               interaction_constraints='', learning_rate=0.1,
                               max_delt

In [150]:
# Accuracy on the training split (not a held-out estimate).
xgb_first_model.score(X_train,y_train)

0.9991227520521336

In [151]:
# Cross-validate the first tuned XGBoost pipeline on the training split; the
# ModelWithCV constructor runs cross_validate() immediately because cv_now
# defaults to True (10 folds by default), then the summary is printed.
xgb_first_model_val = ModelWithCV(xgb_first_model, 'xgb_first_model', X_train, y_train)
xgb_first_model_val.print_cv_summary()







































CV Results for `xgb_first_model` model:
            0.77408 ± 0.00533 accuracy
        


In [154]:
# Second XGBoost search: L1 regularisation strength only, 3-fold CV,
# scored with micro-averaged F1.
param_test2 = {
    'xgb__reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
gs_xgb2 = GridSearchCV(
    estimator=xgb_first_model,
    param_grid=param_test2,
    scoring='f1_micro',
    cv=3,
    n_jobs=-1,
    verbose=10,
)
gs_xgb2.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed:  3.4min remaining: 22.0min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  3.4min remaining:  9.3min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed:  3.4min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  3.5min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed:  4.1min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:  5.3min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.4min finished




GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CB6D0>),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                

In [155]:
# Best reg_alpha value from the second XGBoost search.
gs_xgb2.best_params_

{'xgb__reg_alpha': 0.01}

In [157]:
# Same configuration as xgb_first_model plus reg_alpha=0.01, as reported by
# gs_xgb2.best_params_.
# NOTE(review): num_class=9 although the target shows three classes -- confirm.
xgb_second_model = Pipeline(steps=[
    ('CT', CT),
    ('xgb', XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=7,
        min_child_weight=1,
        gamma=0,
        reg_alpha=0.01,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='multi:softmax',
        nthread=4,
        num_class=9,
        seed=27,
    )),
])

In [158]:
# Fit the second tuned XGBoost pipeline on the training split.
xgb_second_model.fit(X_train,y_train)





Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CB6D0>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  <sklearn.compose._column_transformer...
                               interaction_constraints='', learning_rate=0.1,
                               max_delt

In [159]:
# Accuracy on the training split (not a held-out estimate).
xgb_second_model.score(X_train,y_train)

0.999154082335986

In [160]:
# Cross-validate the second tuned XGBoost pipeline on the training split; the
# ModelWithCV constructor runs cross_validate() immediately because cv_now
# defaults to True (10 folds by default), then the summary is printed.
xgb_second_model_val = ModelWithCV(xgb_second_model, 'xgb_second_model', X_train, y_train)
xgb_second_model_val.print_cv_summary()







































CV Results for `xgb_second_model` model:
            0.77499 ± 0.00556 accuracy
        


In [17]:
# Hand-set, more heavily regularised XGBoost pipeline: shallower trees, strong
# row/column subsampling and explicit L1/L2 penalties. num_class matches the
# three target classes here.
xgb_third_model = Pipeline(steps=[
    ('CT', CT),
    ('xgb', XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=0.1,
        max_delta_step=0.5,
        gamma=0.5,
        reg_alpha=1,
        reg_lambda=0.2,
        subsample=0.3,
        colsample_bytree=0.3,
        objective='multi:softmax',
        num_class=3,
        seed=27,
    )),
])

In [18]:
# Fit the regularised XGBoost pipeline on the training split.
xgb_third_model.fit(X_train,y_train)




Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000021165AC4850>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  <sklearn.compose._column_transformer...
                               interaction_constraints='', learning_rate=0.1,
                               max_delt

In [19]:
# Accuracy on the training split.
xgb_third_model.score(X_train,y_train)

0.8705745974058525

In [20]:
# Accuracy on the held-out test split.
xgb_third_model.score(X_test,y_test)

0.7543859649122807

# GridSearch RFC

In [337]:
# Random-forest pipeline used as the estimator for the grid search.
rfc_model = Pipeline(steps=[
    ('CT', CT),
    ('rfc', RandomForestClassifier(random_state=42)),
])

In [341]:
# Search space for the 'rfc' step: forest size and maximum tree depth.
params = {
    "rfc__n_estimators": [5, 10, 50, 100, 250, 500],
    "rfc__max_depth": [2, 4, 8, 16, 32],
}

In [342]:
# 5-fold grid search over the random-forest grid, fitted on the training split.
# Note that `rfc` is the GridSearchCV object, not a bare classifier.
rfc = GridSearchCV(rfc_model,params,cv=5)
rfc.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001EB4B2CB6D0>),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                

In [343]:
# Best forest size / depth combination found by the random-forest grid search.
rfc.best_params_

{'rfc__max_depth': 32, 'rfc__n_estimators': 500}

In [13]:
# Final random-forest pipeline with hand-picked hyper-parameters; this rebinds
# the `rfc_model` name that was used for the grid-search estimator.
# NOTE(review): n_estimators=1000 / max_depth=12 are not values from the grid
# searched earlier (whose best was 500 / 32) -- confirm this is intentional.
rfc_model = Pipeline(steps=[
    ('CT', CT),
    ('rfc', RandomForestClassifier(
        random_state=42,
        n_estimators=1000,
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated
        # in scikit-learn 1.1 and later removed, so spell it out explicitly.
        max_features='sqrt',
        max_depth=12,
        criterion='gini',
    )),
])

In [14]:
# Fit the final random-forest pipeline on the training split.
rfc_model.fit(X_train,y_train)

Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000021165AC4850>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000021165AC46A0>)])),
                ('rfc',
                 RandomForestClassifier(

In [15]:
# Accuracy on the training split.
rfc_model.score(X_train,y_train)

0.8012093489567016

In [16]:
# Accuracy on the held-out test split.
rfc_model.score(X_test,y_test)

0.7288220551378446