In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn import linear_model, tree, ensemble
#For training without cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv('../data/filled.csv')
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,33.779915
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,33.779915
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,33.779915
3,1,1500,73760.8172,101776.048741,DebtCon,Other,8.922268,0.0,0.0,179.766275,1.186055,21.296096,33.779915
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,33.779915


In [6]:
df.dtypes

BAD          int64
LOAN         int64
MORTDUE    float64
VALUE      float64
REASON      object
JOB         object
YOJ        float64
DEROG      float64
DELINQ     float64
CLAGE      float64
NINQ       float64
CLNO       float64
DEBTINC    float64
dtype: object

In [7]:
df.describe()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
count,5960.0,5960.0,5960.0,5960.0,5960.0,5960.0,5960.0,5960.0,5960.0,5960.0,5960.0
mean,0.199497,18607.969799,73760.8172,101776.048741,8.922268,0.224329,0.405705,179.766275,1.186055,21.296096,33.779915
std,0.399656,11207.480417,42481.395689,56843.931566,7.239301,0.798458,1.079256,83.563059,1.653046,9.94828,7.632713
min,0.0,1100.0,2063.0,8000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.524499
25%,0.0,11100.0,48139.0,66489.5,3.0,0.0,0.0,117.37143,0.0,15.0,30.763159
50%,0.0,16300.0,69529.0,90000.0,8.0,0.0,0.0,178.076005,1.0,21.0,33.779915
75%,0.0,23300.0,88200.25,119004.75,12.0,0.0,0.0,227.143058,2.0,26.0,37.949892
max,1.0,89900.0,399550.0,855909.0,41.0,10.0,15.0,1168.233561,17.0,71.0,203.312149


In [8]:
df.isnull().sum()

BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BAD      5960 non-null   int64  
 1   LOAN     5960 non-null   int64  
 2   MORTDUE  5960 non-null   float64
 3   VALUE    5960 non-null   float64
 4   REASON   5960 non-null   object 
 5   JOB      5960 non-null   object 
 6   YOJ      5960 non-null   float64
 7   DEROG    5960 non-null   float64
 8   DELINQ   5960 non-null   float64
 9   CLAGE    5960 non-null   float64
 10  NINQ     5960 non-null   float64
 11  CLNO     5960 non-null   float64
 12  DEBTINC  5960 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 605.4+ KB


In [10]:
cat_columns = ['REASON','JOB']

In [11]:
def create_dummy(dataframe, cat_vars):

    for var in cat_vars:
        cat_list = 'var'+'_'+'var'
        cat_list = pd.get_dummies(dataframe[var], prefix=var)
        dataframe_new = dataframe.join(cat_list)
        dataframe = dataframe_new
    data_vars = dataframe.columns.values.tolist()
    to_keep = [i for i in data_vars if i not in cat_vars]

    dataframe_final = dataframe[to_keep]

    return dataframe_final

In [12]:
data_to_model = create_dummy(df, cat_columns)

In [13]:
data_to_model.head(5)

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,REASON_DebtCon,REASON_HomeImp,JOB_Mgr,JOB_Office,JOB_Other,JOB_ProfExe,JOB_Sales,JOB_Self
0,1,1100,25860.0,39025.0,10.5,0.0,0.0,94.366667,1.0,9.0,33.779915,0,1,0,0,1,0,0,0
1,1,1300,70053.0,68400.0,7.0,0.0,2.0,121.833333,0.0,14.0,33.779915,0,1,0,0,1,0,0,0
2,1,1500,13500.0,16700.0,4.0,0.0,0.0,149.466667,1.0,10.0,33.779915,0,1,0,0,1,0,0,0
3,1,1500,73760.8172,101776.048741,8.922268,0.0,0.0,179.766275,1.186055,21.296096,33.779915,1,0,0,0,1,0,0,0
4,0,1700,97800.0,112000.0,3.0,0.0,0.0,93.333333,0.0,14.0,33.779915,0,1,0,1,0,0,0,0


In [14]:
#Creating features and targets and converting the data to arrays
#Labels that we want to predict
labels = np.array(data_to_model['BAD'])
#Create the features by dropping the label
features = data_to_model.drop('BAD', axis=1)

#Save feature names for later use
feature_list = list(features.columns)

#Convert to numpy arrays
features = np.array(features)

In [15]:
#Splitting the data into a train and validation set
#Model Score using KFold
#The folds are made by preserving the percentages of samples for each class
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

count = 1
#we use split method that will generate indices to split data into train and test sets

for train_index, test_index in kf.split(features,labels):
    print(f'Fold:{count}, Train set: {len(train_index)},Test set:{len(test_index)}')
    count+=1

Fold:1, Train set: 4768,Test set:1192
Fold:2, Train set: 4768,Test set:1192
Fold:3, Train set: 4768,Test set:1192
Fold:4, Train set: 4768,Test set:1192
Fold:5, Train set: 4768,Test set:1192


**RANDOM FOREST**

In [17]:
score = cross_val_score(ensemble.RandomForestClassifier(random_state= 42),
                        features, labels, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

Scores for each fold are: [0.9135906  0.9261745  0.92533557 0.92197987 0.91610738]
Average score: 0.92


In [20]:
score = cross_val_score(ensemble.RandomForestClassifier(random_state= 42),
                        features, labels, cv= kf, scoring="roc_auc")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

Scores for each fold are: [0.96846758 0.97526778 0.97222883 0.97381657 0.96642587]
Average score: 0.97


**LOGISTIC REGRESSION**

In [18]:
score = cross_val_score(linear_model.LogisticRegression(random_state= 42),
                        features, labels, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

Scores for each fold are: [0.79781879 0.79865772 0.79949664 0.79949664 0.81208054]
Average score: 0.80


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
score = cross_val_score(linear_model.LogisticRegression(random_state= 42),
                        features, labels, cv= kf, scoring="roc_auc")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

Scores for each fold are: [0.63003189 0.64069464 0.64852985 0.65558991 0.70341308]
Average score: 0.66


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


**DECISION TREE CLASSIFIER**

In [19]:
score = cross_val_score(tree.DecisionTreeClassifier(random_state= 42),
                        features, labels, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

Scores for each fold are: [0.8590604  0.86157718 0.87080537 0.875      0.87416107]
Average score: 0.87


In [22]:
score = cross_val_score(tree.DecisionTreeClassifier(random_state= 42),
                        features, labels, cv= kf, scoring="roc_auc")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

Scores for each fold are: [0.7716206  0.77161619 0.80103236 0.80365291 0.80409128]
Average score: 0.79
