In [14]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer,  make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    accuracy_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier,XGBRegressor
from sklearn.decomposition import PCA

In [32]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/cleaned_data3.csv')

In [33]:
# List column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59835 entries, 0 to 59834
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                59835 non-null  int64  
 1   Age                       59835 non-null  float64
 2   Annual_Income             59835 non-null  float64
 3   Monthly_Inhand_Salary     50899 non-null  float64
 4   Num_Bank_Accounts         59835 non-null  int64  
 5   Num_Credit_Card           59835 non-null  int64  
 6   Interest_Rate             59835 non-null  int64  
 7   Num_of_Loan               59835 non-null  float64
 8   Delay_from_due_date       59835 non-null  int64  
 9   Num_of_Delayed_Payment    59835 non-null  float64
 10  Num_Credit_Inquiries      58688 non-null  float64
 11  Credit_Mix                59835 non-null  object 
 12  Outstanding_Debt          59835 non-null  float64
 13  Credit_Utilization_Ratio  59835 non-null  float64
 14  Credit

In [17]:
# Predictors: every column except the target and the 'Unnamed: 0' column.
X = df.drop(columns=['Credit_Score', 'Unnamed: 0'])
# Target: the credit score label.
y = df['Credit_Score']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
)

In [18]:
# Confirm the target and 'Unnamed: 0' columns are gone from the predictors.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59835 entries, 0 to 59834
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       59835 non-null  float64
 1   Annual_Income             59835 non-null  float64
 2   Monthly_Inhand_Salary     50899 non-null  float64
 3   Num_Bank_Accounts         59835 non-null  int64  
 4   Num_Credit_Card           59835 non-null  int64  
 5   Interest_Rate             59835 non-null  int64  
 6   Num_of_Loan               59835 non-null  float64
 7   Delay_from_due_date       59835 non-null  int64  
 8   Num_of_Delayed_Payment    59835 non-null  float64
 9   Num_Credit_Inquiries      58688 non-null  float64
 10  Credit_Mix                59835 non-null  object 
 11  Outstanding_Debt          59835 non-null  float64
 12  Credit_Utilization_Ratio  59835 non-null  float64
 13  Credit_History_Age        54453 non-null  float64
 14  Paymen

In [19]:
# Show the (rows, columns) shape of the train and test predictor frames.
print(X_train.shape, X_test.shape)

(47868, 18) (11967, 18)


In [20]:
# Count rows per target class to check how balanced the classes are.
y.value_counts()

2    31169
1    20435
3     8231
Name: Credit_Score, dtype: int64

In [21]:
# Preprocessing sub-pipelines, one per column type.

# Numeric columns: fill missing values with the column mean, then standardize.
subpipe_num = Pipeline(
    steps=[
        ('num_impute', SimpleImputer(strategy='mean')),
        ('ss', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories unseen during fit are
# ignored at transform time instead of raising.
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2;
# update this argument if the environment is upgraded.
subpipe_cat = Pipeline(
    steps=[
        ('cat_impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ]
)

In [22]:
# Route each column to its sub-pipeline by dtype: numeric columns go to
# subpipe_num, object columns go to subpipe_cat, and any column matched by
# neither selector is passed through unchanged.
CT = ColumnTransformer(
    transformers=[
        ('subpipe_num', subpipe_num, selector(dtype_include=np.number)),
        ('subpipe_cat', subpipe_cat, selector(dtype_include=object)),
    ],
    remainder='passthrough',
)

In [23]:
# ModelWithCV wraps an estimator and keeps its cross-validation scores
class ModelWithCV():
    '''
    Structure to save the model and more easily see its crossvalidation.

    Args:
      model:
        Estimator handed to `cross_val_score`.
      model_name:
        Label used in the printed summary and in the plot title.
      X:
        Default feature data used for cross-validation.
      y:
        Default target data used for cross-validation.
      cv_now:
        Optional; if True (default) run `cross_validate()` on construction.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True):
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        # For CV results (populated by cross_validate)
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None
        #
        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object
        (`cv_results`, `cv_mean`, `cv_median`, `cv_std`).

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''

        # Compare against None explicitly: truth-testing a DataFrame, Series
        # or ndarray (`X if X else ...`) raises "truth value is ambiguous".
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)


    def print_cv_summary(self):
        '''
        Print the mean and standard deviation of the stored CV scores.
        Requires `cross_validate()` to have been run first.
        '''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} accuracy
        ''')
        print(cv_summary)


    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting. Returns the same Axis.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the violin
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

# Baseline Dummy Model

In [24]:
# Baseline pipeline: shared preprocessing followed by a classifier that
# always predicts the most frequent class.
dummy_model_pipe = Pipeline(
    steps=[
        ('ct', CT),
        ('dum', DummyClassifier(strategy='most_frequent')),
    ]
)

In [25]:
# Fit the baseline and report its accuracy on the training split.
dummy_model_pipe.fit(X_train, y_train)
dummy_model_pipe.score(X_train,y_train)

0.5219979944848333

# Simple Untuned Models 

In [26]:
# Untuned candidate models; a fixed seed is set wherever the estimator takes one.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    LogisticRegression(random_state=42),
]

In [27]:
# Fit each untuned classifier behind the shared preprocessing step and report
# accuracy on both splits, so the gap between them (over-fitting) is visible.
for classifier in classifiers:
    steps = [
        ('ct', CT),
        ('clf', classifier)
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)
    print(classifier)
    # Label the two scores distinctly; both lines used to print "model score".
    print("train score: %.3f" % pipeline.score(X_train, y_train))
    print("test score: %.3f" % pipeline.score(X_test, y_test))

KNeighborsClassifier(n_neighbors=3)
model score: 0.802
model score: 0.632
DecisionTreeClassifier(random_state=42)
model score: 1.000
model score: 0.702
RandomForestClassifier(random_state=42)
model score: 1.000
model score: 0.798
AdaBoostClassifier(random_state=42)
model score: 0.655
model score: 0.662
GradientBoostingClassifier(random_state=42)
model score: 0.717
model score: 0.712
LogisticRegression(random_state=42)
model score: 0.601
model score: 0.605


# Random Forest and XGBoost Tuning

In [28]:
# Random forest with hand-picked depth and tree count behind the shared preprocessing.
rfc_model = Pipeline(
    steps=[
        ('CT', CT),
        ('rfc', RandomForestClassifier(
            random_state=42,
            max_depth=10,
            n_estimators=200,
        )),
    ]
)

In [29]:
# Fit the preprocessing and the random forest on the training split.
rfc_model.fit(X_train,y_train)

Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EFE4F97520>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                        

In [30]:
# Accuracy on the training split.
rfc_model.score(X_train,y_train)

0.7622628896131027

In [31]:
# Accuracy on the held-out test split.
rfc_model.score(X_test,y_test)

0.729506141890198

In [48]:
# XGBoost pipeline with hand-set hyperparameters: 500 trees of depth 7,
# row and column subsampling at 50%, and light L1/L2 regularization.
xgb_third_model = Pipeline(
    steps=[
        ('CT', CT),
        ('xgb', XGBClassifier(
            learning_rate=0.1,
            n_estimators=500,
            max_depth=7,
            min_child_weight=.1,
            max_delta_step=.5,
            gamma=.5,
            reg_alpha=.01,
            reg_lambda=.2,
            subsample=.5,
            colsample_bytree=.5,
            objective='multi:softmax',
            num_class=3,
            seed=27,
        )),
    ]
)

In [49]:
# Fit the preprocessing and the XGBoost classifier on the training split.
xgb_third_model.fit(X_train,y_train)



Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EFE4F97520>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                        

In [50]:
# Accuracy on the training split.
xgb_third_model.score(X_train,y_train)

0.9031503300743712

In [51]:
# Accuracy on the held-out test split.
xgb_third_model.score(X_test,y_test)

0.786746887273335

In [52]:
# Base XGBoost pipeline for the grid search: min_child_weight, max_delta_step
# and gamma are deliberately left unset here so the search can supply them.
xgb_gs_model = Pipeline(
    steps=[
        ('CT', CT),
        ('xgb', XGBClassifier(
            learning_rate=0.1,
            n_estimators=250,
            max_depth=7,
            reg_alpha=.01,
            reg_lambda=.2,
            subsample=.5,
            colsample_bytree=.5,
            objective='multi:softmax',
            num_class=3,
            seed=27,
        )),
    ]
)

In [53]:
# Search min_child_weight, max_delta_step and gamma over the same four values
# each (4 x 4 x 4 = 64 candidates), using 3-fold CV on the training split.
# The 'xgb__' prefix targets the 'xgb' step of the pipeline.
param_test2 = {
    'xgb__min_child_weight': [.01, .1, .25, .5],
    'xgb__max_delta_step': [.01, .1, .25, .5],
    'xgb__gamma': [.01, .1, .25, .5],
}
gs_xgb2 = GridSearchCV(
    estimator=xgb_gs_model,
    param_grid=param_test2,
    cv=3,
    n_jobs=-1,
    verbose=10,
)
gs_xgb2.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 17



GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('CT',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('num_impute',
                                                                                          SimpleImputer()),
                                                                                         ('ss',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x000001EFE4F97520>),
                                                                        ('subpipe_cat',
                                                                         Pipeline(steps=[('cat

In [54]:
# Parameter combination with the best mean cross-validated score.
gs_xgb2.best_params_

{'xgb__gamma': 0.25, 'xgb__max_delta_step': 0.5, 'xgb__min_child_weight': 0.1}

In [211]:
# XGBoost pipeline reusing gamma and max_delta_step from the grid search.
# NOTE(review): min_child_weight (.5), max_depth (5) and n_estimators (300)
# differ from the searched configuration (best min_child_weight was 0.1 with
# max_depth=7, n_estimators=250) -- confirm this is intentional.
xgb_gs_model_1 = Pipeline(steps=[
    ('CT', CT),
    ('xgb', XGBClassifier(
        # Single RNG seed. This cell used to pass both random_state=42 and
        # seed=27, two names for the same setting, so the effective seed was
        # ambiguous and depended on the xgboost version.
        random_state=42,
        learning_rate=0.1,
        n_estimators=300,
        max_depth=5,
        max_delta_step=.5,
        gamma=.25,
        min_child_weight=.5,
        reg_alpha=.01,
        reg_lambda=.2,
        subsample=.5,
        colsample_bytree=.5,
        objective='multi:softmax',
        num_class=3,
    )),
])

In [212]:
# Fit the preprocessing and the XGBoost classifier on the training split.
xgb_gs_model_1.fit(X_train,y_train)



Pipeline(steps=[('CT',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('subpipe_num',
                                                  Pipeline(steps=[('num_impute',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001EFE4F97520>),
                                                 ('subpipe_cat',
                                                  Pipeline(steps=[('cat_impute',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                        

In [213]:
# Accuracy on the training split.
xgb_gs_model_1.score(X_train,y_train)

0.7873527199799448

In [214]:
# Accuracy on the held-out test split.
xgb_gs_model_1.score(X_test,y_test)

0.7503969248767444