# Bake Off:  Default Predictions

In [163]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_curve, auc
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA

In [3]:
df = pd.read_csv('train_data.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,2873,350000,1,1,2,37,-2,-2,-2,-2,...,466,466,316,316,316,466,466,316,316,0
1,3598,50000,2,2,1,37,2,2,2,0,...,13026,13268,13497,5500,0,580,600,600,600,0
2,27623,50000,2,1,2,23,-1,-1,-1,-1,...,4800,9810,660,2548,2321,4800,9810,660,2980,0
3,6874,20000,1,3,1,56,0,0,0,0,...,13784,13420,13686,1508,1216,1116,0,490,658,0
4,6444,110000,2,2,2,32,0,0,0,0,...,108829,110557,106082,5400,5400,4100,4100,4100,4200,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23999 entries, 0 to 23998
Data columns (total 25 columns):
ID                            23999 non-null int64
LIMIT_BAL                     23999 non-null int64
SEX                           23999 non-null int64
EDUCATION                     23999 non-null int64
MARRIAGE                      23999 non-null int64
AGE                           23999 non-null int64
PAY_0                         23999 non-null int64
PAY_2                         23999 non-null int64
PAY_3                         23999 non-null int64
PAY_4                         23999 non-null int64
PAY_5                         23999 non-null int64
PAY_6                         23999 non-null int64
BILL_AMT1                     23999 non-null int64
BILL_AMT2                     23999 non-null int64
BILL_AMT3                     23999 non-null int64
BILL_AMT4                     23999 non-null int64
BILL_AMT5                     23999 non-null int64
BILL_AMT6               

Change columns to lower case:

In [6]:
df.columns = map(str.lower, df.columns)

In [7]:
df.head()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default payment next month
0,2873,350000,1,1,2,37,-2,-2,-2,-2,...,466,466,316,316,316,466,466,316,316,0
1,3598,50000,2,2,1,37,2,2,2,0,...,13026,13268,13497,5500,0,580,600,600,600,0
2,27623,50000,2,1,2,23,-1,-1,-1,-1,...,4800,9810,660,2548,2321,4800,9810,660,2980,0
3,6874,20000,1,3,1,56,0,0,0,0,...,13784,13420,13686,1508,1216,1116,0,490,658,0
4,6444,110000,2,2,2,32,0,0,0,0,...,108829,110557,106082,5400,5400,4100,4100,4100,4200,0


In [8]:
df.describe()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default payment next month
count,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,...,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0,23999.0
mean,15027.353931,167825.729405,1.603608,1.856786,1.550773,35.52323,-0.014959,-0.134839,-0.165632,-0.217092,...,43796.589775,40748.049169,39226.852411,5687.720113,6019.571,5253.322805,4824.490687,4825.033793,5245.642943,0.221176
std,8654.746941,129915.115993,0.489158,0.791939,0.522431,9.253986,1.120903,1.193044,1.193515,1.168719,...,64927.450125,61257.537699,60075.536736,16698.506577,24541.36,17967.02886,15730.753527,15596.580167,17795.775322,0.415047
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7506.0,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2392.5,1801.0,1280.0,1000.0,858.5,395.0,300.0,269.5,125.0,0.0
50%,15027.0,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19145.0,18227.0,17175.0,2110.0,2012.0,1809.0,1500.0,1500.0,1500.0,0.0
75%,22531.5,240000.0,2.0,2.0,2.0,42.0,0.0,0.0,0.0,0.0,...,55418.0,50581.0,49434.5,5020.0,5000.0,4561.0,4089.0,4029.5,4008.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [102]:
df.limit_bal.unique()

array([ 350000,   50000,   20000,  110000,  240000,  360000,  160000,
         90000,   80000,  310000,   30000,  140000,  120000,  200000,
        230000,  300000,  180000,   60000,  100000,  500000,  210000,
        440000,  320000,  280000,  450000,  460000,  390000,  260000,
        250000,  130000,  290000,  170000,   70000,  400000,  340000,
        150000,  420000,   10000,  190000,  270000,  480000,  330000,
         40000,  220000,  490000,  430000,  410000,  580000,  380000,
        610000,  370000,  800000,  470000,  600000,  520000,  510000,
        660000,  710000,  700000,  590000,  570000,  530000,  560000,
        550000,  630000,  750000,  680000,  620000,  640000,  540000,
         16000,  650000,  740000,  670000,  720000,  690000, 1000000,
        730000,  780000,  327680])

In [9]:
df.pay_0.unique()

array([-2,  2, -1,  0,  1,  3,  8,  5,  4,  6,  7])

In [11]:
df.pay_2.unique()

array([-2,  2, -1,  0,  3,  1,  4,  7,  8,  6,  5])

In [12]:
df.pay_3.unique()

array([-2,  2, -1,  0,  3,  6,  5,  4,  7,  1,  8])

In [13]:
df.pay_4.unique()

array([-2,  0, -1,  2,  3,  5,  4,  7,  6,  1,  8])

In [14]:
df.pay_5.unique()

array([-2,  0, -1,  2,  3,  4,  7,  5,  6,  8])

In [15]:
df.pay_6.unique()

array([-2,  0, -1,  2,  3,  4,  7,  6,  5,  8])

So 5 and 6 are missing value '1' and only have 10 unique values, where as pay 0-4 columns have 11 unique values... not sure what the extra category is...

In [23]:
(df['pay_6'] == -2).sum()

3878

In [24]:
(df['pay_6'] == 8).sum()

2

In [25]:
(df['pay_5'] == 8).sum()

1

In [26]:
(df['pay_6'] == 0).sum()

13096

In [27]:
(df['pay_6'] == -1).sum()

4560

In [36]:
for num in range(-2, 9):
    print((df['pay_3'] == num).sum())

3246
4776
12594
3
3086
181
57
15
18
20
3


In [50]:
for num in [0, 2, 3, 4]:
    print((df['pay_' + str(num)] == 1).sum())

2942
19
3
1


If we're thinking the values are still 'in order' then category '1' is '3 months late'.  These are all quite small values...

So it looks like the docs are potentially wrong - that the payment values don't go from -1 to 9 but -2 to 8.  For simplicity we'll match with what the data says.

Check education values:

In [16]:
df.education.unique()

array([1, 2, 3, 5, 6, 4, 0])

So education has value 0 which doesn't match up to anything in the docs so this must be 'na'...

In [18]:
(df['education'] == 0).sum()

11

So 11 'na' values potentially.. we could just drop these or change them to 'nan'.  

Category 5 is 'unknown' so it might make more sense to change 0 values to 5 instead since we don't know what they are.

Make sure other columns have values that make sense:

In [20]:
df.sex.unique()

array([1, 2])

In [21]:
df.marriage.unique()

array([2, 1, 3, 0])

Marriage also has 0 values... perhaps group with '3' category but let's check how many...

In [22]:
(df['marriage'] == 0).sum()

43

same for marriage.. there’s an extra category ‘0’.  Not sure if in this case it’s ok to put in ‘others’ group 3 since these could be married or single, they just didn’t answer, or maybe we just create a new category of ‘unknown’

### Decision about values:

After discussing with Matt, we've decided to deal with the values as follows:

- sex:  
    - keep as 1/2 values
    - change to category type
- education:  
    - drop 0 values since there are only 11 of these
    - combine categories 5 and 6 since they are both unknown
    - Change to category type
- marriage:  
    - drop 0 values since there are only 43
    - change to category type
- pay_: 
    - we make the assumption that even though they're different numbers, they are on the same scale/order as what's presented in the docs
    - we're not sure why pay_0-4 has an extra category ('1') which 'should' correspond to 3 months delay payment...
    - we're going to keep values as is as it shouldn't affect the model too much, it will just affect our ability to interpret exactly what the columns mean.  If we had more time we would dive deeper on this
    - change values to strings so they can be dummied out

#### Sex:

In [51]:
df.sex = df.sex.astype('category')

#### Education:

In [57]:
df[df['education'] == 0]

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default payment next month
4544,6877,360000,1,0,2,30,0,0,-1,0,...,34345,36777,30,23000,12280,25007,25008,1767,3300,0
5695,14632,350000,2,0,2,53,-1,-1,-1,-1,...,22611,1385,6043,4840,61349,22687,1389,6058,1153,0
6461,17415,230000,2,0,2,47,-1,-1,-1,2,...,255,5425,4838,5743,1598,0,5425,4838,3840,0
9262,23235,220000,2,0,1,35,-2,-2,-2,-2,...,319,319,319,319,10567,319,319,319,2420,0
15561,5946,270000,1,0,2,39,1,-1,-1,-1,...,18607,0,0,10193,70213,19008,399,0,0,0
15736,15108,210000,1,0,2,45,-2,-2,-2,-2,...,788,3499,3372,5854,1032,788,3565,3372,15381,0
16280,20031,200000,2,0,2,30,-1,-1,2,-1,...,9470,5816,7809,2880,0,9470,5834,7809,2886,0
16394,27271,250000,1,0,1,35,-2,-2,-2,-2,...,33604,0,1190,7783,12046,33718,0,1190,590,0
16927,27156,160000,1,0,1,47,-1,-1,-1,-1,...,6987,3853,4613,907,3707,6991,77,4613,4099,0
17206,3770,290000,2,0,2,38,1,-1,-1,-1,...,1406,2196,1481,1437,3078,1406,2196,1481,0,0


In [58]:
df = df[df.education != 0]

check 11 rows got dropped:

In [61]:
df.shape

(23988, 25)

Combine 5 and 6 categories:

In [65]:
df.education.replace(6, 5)

0        1
1        2
2        1
3        3
4        2
        ..
23994    1
23995    3
23996    3
23997    2
23998    1
Name: education, Length: 23988, dtype: int64

In [66]:
df.education = df.education.replace(6, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [67]:
df.education.unique()

array([1, 2, 3, 5, 4])

In [68]:
df.education = df.education.astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23988 entries, 0 to 23998
Data columns (total 25 columns):
id                            23988 non-null int64
limit_bal                     23988 non-null int64
sex                           23988 non-null category
education                     23988 non-null category
marriage                      23988 non-null int64
age                           23988 non-null int64
pay_0                         23988 non-null int64
pay_2                         23988 non-null int64
pay_3                         23988 non-null int64
pay_4                         23988 non-null int64
pay_5                         23988 non-null int64
pay_6                         23988 non-null int64
bill_amt1                     23988 non-null int64
bill_amt2                     23988 non-null int64
bill_amt3                     23988 non-null int64
bill_amt4                     23988 non-null int64
bill_amt5                     23988 non-null int64
bill_amt6         

#### Marriage:

Drop 0 values:

In [71]:
df = df[df.marriage != 0]

In [73]:
df.shape

(23945, 25)

Change to category type:

In [74]:
df.marriage = df.marriage.astype('category')

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23945 entries, 0 to 23998
Data columns (total 25 columns):
id                            23945 non-null int64
limit_bal                     23945 non-null int64
sex                           23945 non-null category
education                     23945 non-null category
marriage                      23945 non-null category
age                           23945 non-null int64
pay_0                         23945 non-null int64
pay_2                         23945 non-null int64
pay_3                         23945 non-null int64
pay_4                         23945 non-null int64
pay_5                         23945 non-null int64
pay_6                         23945 non-null int64
bill_amt1                     23945 non-null int64
bill_amt2                     23945 non-null int64
bill_amt3                     23945 non-null int64
bill_amt4                     23945 non-null int64
bill_amt5                     23945 non-null int64
bill_amt6      

#### Pay_*:

Change to string type:

In [77]:
# create list of numbers for columns:
col_nums = [0, 2, 3, 4, 5, 6]
for num in col_nums:
    # change to string 
    df['pay_' + str(num)] = df['pay_' + str(num)].astype('str')
    # change to cateory type
    df['pay_' + str(num)] = df['pay_' + str(num)].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23945 entries, 0 to 23998
Data columns (total 25 columns):
id                            23945 non-null int64
limit_bal                     23945 non-null int64
sex                           23945 non-null category
education                     23945 non-null category
marriage                      23945 non-null category
age                           23945 non-null int64
pay_0                         23945 non-null category
pay_2                         23945 non-null category
pay_3                         23945 non-null category
pay_4                         23945 non-null category
pay_5                         23945 non-null category
pay_6                         23945 non-null category
bill_amt1                     23945 non-null int64
bill_amt2                     23945 non-null int64
bill_amt3                     23945 non-null int64
bill_amt4                     23945 non-null int64
bill_amt5                     23945 non-null int

In [79]:
df.head()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default payment next month
0,2873,350000,1,1,2,37,-2,-2,-2,-2,...,466,466,316,316,316,466,466,316,316,0
1,3598,50000,2,2,1,37,2,2,2,0,...,13026,13268,13497,5500,0,580,600,600,600,0
2,27623,50000,2,1,2,23,-1,-1,-1,-1,...,4800,9810,660,2548,2321,4800,9810,660,2980,0
3,6874,20000,1,3,1,56,0,0,0,0,...,13784,13420,13686,1508,1216,1116,0,490,658,0
4,6444,110000,2,2,2,32,0,0,0,0,...,108829,110557,106082,5400,5400,4100,4100,4100,4200,0


### Create reference dict:

In [80]:
reference = {'sex': {1: 'male', 2: 'female'},
            'education': {1: 'graduate school', 2: 'university', 3: 'high school', 4: 'others', 5: 'unknown'},
            'marriage': {1: 'married', 2: 'single', 3: 'others'},
            'pay_': {'-2': 'pay duly', '-1': 'payment delay for 1 month', '0': 'payment delay for 2 months', 
                    '1': 'payment delay for 3 months', '2': 'payment delay for 4 months', '3': 'payment delay for 5 months', 
                    '4': 'payment delay for 6 months', '5': 'payment delay for 7 months', '6': 'payment deplay for 8 months', 
                    '7': 'payment delay for 9 months and above', '8': 'no fuckin clue'}}

## Target variable:

In [85]:
df['default payment next month'].value_counts()

0    18640
1     5305
Name: default payment next month, dtype: int64

So we have class imbalance issues so we will have to balance this.

Let's update the columns name with '_':

In [92]:
df.columns = df.columns.str.replace(' ', '_')

In [93]:
df.head()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
0,2873,350000,1,1,2,37,-2,-2,-2,-2,...,466,466,316,316,316,466,466,316,316,0
1,3598,50000,2,2,1,37,2,2,2,0,...,13026,13268,13497,5500,0,580,600,600,600,0
2,27623,50000,2,1,2,23,-1,-1,-1,-1,...,4800,9810,660,2548,2321,4800,9810,660,2980,0
3,6874,20000,1,3,1,56,0,0,0,0,...,13784,13420,13686,1508,1216,1116,0,490,658,0
4,6444,110000,2,2,2,32,0,0,0,0,...,108829,110557,106082,5400,5400,4100,4100,4100,4200,0


## Prepare for modeling:

### Get X and y variables:

In [94]:
X = df.drop('default_payment_next_month', axis = 1)

In [96]:
y = df['default_payment_next_month']

### Split the data:

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

### One Hot Encode:

In [116]:
ohe = OneHotEncoder(handle_unknown = 'ignore')

In [117]:
ohe_cols = ['sex', 'education', 'marriage', 'pay_0', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
ohe.fit(X_train[ohe_cols])

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [118]:
ohe.categories_

[array([1, 2]),
 array([1, 2, 3, 4, 5]),
 array([1, 2, 3]),
 array(['-1', '-2', '0', '1', '2', '3', '4', '5', '6', '7', '8'],
       dtype=object),
 array(['-1', '-2', '0', '1', '2', '3', '4', '5', '6', '7'], dtype=object),
 array(['-1', '-2', '0', '1', '2', '3', '4', '5', '6', '7', '8'],
       dtype=object),
 array(['-1', '-2', '0', '1', '2', '3', '4', '5', '6', '7'], dtype=object),
 array(['-1', '-2', '0', '2', '3', '4', '5', '6', '7'], dtype=object),
 array(['-1', '-2', '0', '2', '3', '4', '5', '6', '7'], dtype=object)]

In [None]:
# Make a dataframe with the encoded cols
train_encoded = pd.DataFrame(ohe.transform(X_train[ohe_cols]).todense(), index = X_train[ohe_cols].index, 
                             columns = ohe.get_feature_names())

# new encoded df
X_train_encode = pd.concat([X_train.drop(ohe_cols, axis=1), train_encoded], axis=1)

In [None]:
X_train_encode

In [127]:
X_train_encode.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17958 entries, 362 to 23708
Data columns (total 85 columns):
id           17958 non-null int64
limit_bal    17958 non-null int64
age          17958 non-null int64
bill_amt1    17958 non-null int64
bill_amt2    17958 non-null int64
bill_amt3    17958 non-null int64
bill_amt4    17958 non-null int64
bill_amt5    17958 non-null int64
bill_amt6    17958 non-null int64
pay_amt1     17958 non-null int64
pay_amt2     17958 non-null int64
pay_amt3     17958 non-null int64
pay_amt4     17958 non-null int64
pay_amt5     17958 non-null int64
pay_amt6     17958 non-null int64
x0_1         17958 non-null float64
x0_2         17958 non-null float64
x1_1         17958 non-null float64
x1_2         17958 non-null float64
x1_3         17958 non-null float64
x1_4         17958 non-null float64
x1_5         17958 non-null float64
x2_1         17958 non-null float64
x2_2         17958 non-null float64
x2_3         17958 non-null float64
x3_-1        17958 

In [149]:
# instantiate OneHotEncoder object
ohe = OneHotEncoder(categories = 'auto', handle_unknown = 'ignore')

# make copy of training data
X_train2 = X_train.copy()

# fit transform training data
# X_train2[numeric] = scaler.fit_transform(X_train[numeric])
train_encoded2 = ohe.fit_transform(X_train[ohe_cols])

# Create dataframe from encoded data
encoded_df2 = pd.DataFrame(train_encoded2.todense(), columns = ohe.get_feature_names(ohe_cols), index = X_train[ohe_cols].index)

# combine categorical df and scaled numeric data
X_train_encoded = pd.concat([X_train2.drop(ohe_cols, axis = 1), encoded_df2], axis=1)

In [148]:
ohe.get_feature_names(ohe_cols)

array(['sex_1', 'sex_2', 'education_1', 'education_2', 'education_3',
       'education_4', 'education_5', 'marriage_1', 'marriage_2',
       'marriage_3', 'pay_0_-1', 'pay_0_-2', 'pay_0_0', 'pay_0_1',
       'pay_0_2', 'pay_0_3', 'pay_0_4', 'pay_0_5', 'pay_0_6', 'pay_0_7',
       'pay_0_8', 'pay_2_-1', 'pay_2_-2', 'pay_2_0', 'pay_2_1', 'pay_2_2',
       'pay_2_3', 'pay_2_4', 'pay_2_5', 'pay_2_6', 'pay_2_7', 'pay_3_-1',
       'pay_3_-2', 'pay_3_0', 'pay_3_1', 'pay_3_2', 'pay_3_3', 'pay_3_4',
       'pay_3_5', 'pay_3_6', 'pay_3_7', 'pay_3_8', 'pay_4_-1', 'pay_4_-2',
       'pay_4_0', 'pay_4_1', 'pay_4_2', 'pay_4_3', 'pay_4_4', 'pay_4_5',
       'pay_4_6', 'pay_4_7', 'pay_5_-1', 'pay_5_-2', 'pay_5_0', 'pay_5_2',
       'pay_5_3', 'pay_5_4', 'pay_5_5', 'pay_5_6', 'pay_5_7', 'pay_6_-1',
       'pay_6_-2', 'pay_6_0', 'pay_6_2', 'pay_6_3', 'pay_6_4', 'pay_6_5',
       'pay_6_6', 'pay_6_7'], dtype=object)

In [150]:
train_encoded2

<17958x70 sparse matrix of type '<class 'numpy.float64'>'
	with 161622 stored elements in Compressed Sparse Row format>

In [151]:
encoded_df2

Unnamed: 0,sex_1,sex_2,education_1,education_2,education_3,education_4,education_5,marriage_1,marriage_2,marriage_3,...,pay_5_7,pay_6_-1,pay_6_-2,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7
362,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13187,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9244,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3877,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16540,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21627,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5403,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
862,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
15835,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
X_train_encoded

Unnamed: 0,id,limit_bal,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,...,pay_5_7,pay_6_-1,pay_6_-2,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7
362,25777,60000,31,57092,58173,59626,55775,28187,28978,3000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
13187,18898,340000,29,36533,33689,8108,19291,35882,18924,3000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9244,17660,30000,44,28864,30108,29367,28605,27855,29649,2000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3877,11530,80000,42,0,0,0,0,0,7010,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16540,5903,50000,30,29994,30756,23618,26009,24731,46882,2000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21627,11672,110000,56,46335,45319,50030,48940,48026,55979,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5403,17671,230000,39,63370,64621,65380,66520,66343,63498,2305,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
862,4855,200000,27,11912,19196,11938,15972,17952,5925,14443,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
15835,15091,290000,34,0,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17958 entries, 362 to 23708
Data columns (total 85 columns):
id             17958 non-null int64
limit_bal      17958 non-null int64
age            17958 non-null int64
bill_amt1      17958 non-null int64
bill_amt2      17958 non-null int64
bill_amt3      17958 non-null int64
bill_amt4      17958 non-null int64
bill_amt5      17958 non-null int64
bill_amt6      17958 non-null int64
pay_amt1       17958 non-null int64
pay_amt2       17958 non-null int64
pay_amt3       17958 non-null int64
pay_amt4       17958 non-null int64
pay_amt5       17958 non-null int64
pay_amt6       17958 non-null int64
sex_1          17958 non-null float64
sex_2          17958 non-null float64
education_1    17958 non-null float64
education_2    17958 non-null float64
education_3    17958 non-null float64
education_4    17958 non-null float64
education_5    17958 non-null float64
marriage_1     17958 non-null float64
marriage_2     17958 non-null float64
marriag

### Get dummies instead...

In [122]:
X = pd.get_dummies(X, drop_first = True)

In [124]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23945 entries, 0 to 23998
Data columns (total 80 columns):
id             23945 non-null int64
limit_bal      23945 non-null int64
age            23945 non-null int64
bill_amt1      23945 non-null int64
bill_amt2      23945 non-null int64
bill_amt3      23945 non-null int64
bill_amt4      23945 non-null int64
bill_amt5      23945 non-null int64
bill_amt6      23945 non-null int64
pay_amt1       23945 non-null int64
pay_amt2       23945 non-null int64
pay_amt3       23945 non-null int64
pay_amt4       23945 non-null int64
pay_amt5       23945 non-null int64
pay_amt6       23945 non-null int64
sex_2          23945 non-null uint8
education_2    23945 non-null uint8
education_3    23945 non-null uint8
education_4    23945 non-null uint8
education_5    23945 non-null uint8
marriage_2     23945 non-null uint8
marriage_3     23945 non-null uint8
pay_0_-2       23945 non-null uint8
pay_0_0        23945 non-null uint8
pay_0_1        23945 non-nu

In [None]:
# balancing 
minority = df[df['label'] == 'spam']
undersampled_majority = df[df['label'] == 'ham'].sample(n=len(minority))
df2 = pd.concat([minority, undersampled_majority])
df2.label.value_counts()

# FSM:

DecisionTreeClassifier with max_depth = 5, max_features = 10

In [158]:
kf = KFold(n_splits = 5, random_state = 15)
    
    # create list to add recall scores
validation_f1 = []
    
for train_ind, val_ind in kf.split(X_train_encoded, y_train):
    X_t, y_t = X_train_encoded.iloc[train_ind], y_train.iloc[train_ind]
    X_val, y_val = X_train_encoded.iloc[val_ind], y_train.iloc[val_ind]
        
        # instantiate and fit/transform scaler
    scaler = StandardScaler()
    X_t_sc = scaler.fit_transform(X_t)
    X_val_sc = scaler.transform(X_val)
        
        # instantiate and fit SMOTE:
    smote = SMOTE(random_state = 15)
    X_t_resampled, y_t_resampled = smote.fit_resample(X_t_sc, y_t)
        
        # fit model to X_t_resampled:
    model = DecisionTreeClassifier(max_depth = 5, random_state = 42, max_features = 10)
    model.fit(X_t_resampled, y_t_resampled)
        
        # append recall score to validation recall list:
    validation_f1.append(f1_score(y_val, model.predict(X_val_sc)))
        
print(f"Validation f1 scores: {validation_f1}")
print(f"Mean f1 score:  {np.mean(validation_f1)}")

Validation f1 scores: [0.4516584333098094, 0.4442877291960507, 0.49118046132971505, 0.509478672985782, 0.504684572142411]
Mean f1 score:  0.4802579737927536


In [173]:
def scale_balance_model(X_train, y_train, model, scaler = StandardScaler(), balance = 'SMOTE', pca = None):
    # create kfolds object
    kf = KFold(n_splits = 5, random_state = 15)
    
    # create list to add recall scores
    validation_f1 = []
    pca = pca
    
    for train_ind, val_ind in kf.split(X_train, y_train):
        X_t, y_t = X_train.iloc[train_ind], y_train.iloc[train_ind]
        X_val, y_val = X_train.iloc[val_ind], y_train.iloc[val_ind]
        
        # instantiate and fit/transform scaler
        scaler = scaler
        X_t_sc = scaler.fit_transform(X_t)
        X_val_sc = scaler.transform(X_val)
        
        # instantiate and fit SMOTE:
        if balance == 'SMOTE':
            smote = SMOTE(random_state = 15)
            X_t_resampled, y_t_resampled = smote.fit_resample(X_t_sc, y_t)
        else:
            print("don't know how to implement different balancing here lol")
        
        if pca != None:
            pca.fit(X_t_resampled)

            # Your code here
            X_t_pca = pca.transform(X_t_resampled)


            # Create a dataframe from this array of transformed features 
            X_t_pca = pd.DataFrame(X_t_pca)
            
            

            # transform pca on scaled test data:
            X_val_pca = pca.transform(X_val_sc)
        
        if pca != None:
            model.fit(X_t_pca, y_t_resampled)
            validation_f1.append(f1_score(y_val, model.predict(X_val_pca)))
        else:
            model.fit(X_t_resampled, y_t_resampled)
            validation_f1.append(f1_score(y_val, model.predict(X_val_sc)))
        
        
    print(f"Validation f1 scores: {validation_f1}")
    print(f"Mean f1 score:  {np.mean(validation_f1)}")

In [155]:
df['default_payment_next_month'].value_counts()

0    18640
1     5305
Name: default_payment_next_month, dtype: int64

In [156]:
minority = df[df['default_payment_next_month'] == 1]
undersampled_majority = df[df['default_payment_next_month'] == 0].sample(n=len(minority))
df2 = pd.concat([minority, undersampled_majority])
df2

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
18691,18968,480000,2,2,2,30,0,0,-2,-2,...,0,0,330982,0,0,0,0,330982,30000,1
18692,27657,50000,2,2,2,22,0,0,0,0,...,50021,50352,50740,1760,1888,3464,1717,1766,2500,1
18693,29818,50000,1,2,2,40,2,0,0,2,...,13444,13367,13282,2000,2000,1000,1000,1000,2000,1
18694,10440,330000,2,2,1,27,1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,1
18695,19443,360000,2,2,2,37,-1,-1,-1,-1,...,333,0,333,333,5043,333,0,333,210,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16423,19119,80000,2,3,2,42,0,0,0,0,...,49071,48242,47746,1784,1898,1985,1629,1655,1687,0
4624,11959,480000,2,2,2,48,-1,-1,-1,-1,...,7621,1735,302818,1000,3652,7626,1740,302823,19608,0
1264,26872,290000,1,1,1,45,-2,-2,-2,-2,...,9746,7092,5449,2143,2485,9793,7123,5460,300,0
4751,17527,290000,2,2,2,34,0,0,0,0,...,314085,317631,315820,12012,12012,12012,11016,11088,12031,0


# Model Iterations:

## Model 2:

In [166]:
scale_balance_model(X_train_encoded, y_train, GradientBoostingClassifier(random_state = 42), scaler = StandardScaler(), balance = 'SMOTE')

Validation f1 scores: [0.49667110519307595, 0.5006747638326586, 0.534017971758665, 0.5033112582781456, 0.533582089552239]
Mean f1 score:  0.5136514377229568
Confusion Matrix: 
 [[2412  341]
 [ 409  429]]


In [None]:
wine_pca = PCA(n_components = 0.9, random_state = 42)

ss = StandardScaler()

# fit scaler on training data:
X_train_ss = ss.fit_transform(X_train)

# fit pca on scaled training data:
wine_pca.fit(X_train_ss)

wine_pca.n_components_

# Your code here
X_train_pca = wine_pca.transform(X_train_ss)


# Create a dataframe from this array of transformed features 
X_train_pca = pd.DataFrame(X_train_pca)


# Your code here 
# fit scaler on test data:
X_test_ss = ss.transform(X_test)

# transform pca on scaled test data:
X_test_pca = wine_pca.transform(X_test_ss)

## Model 3:

In [174]:
scale_balance_model(X_train_encoded, y_train, GradientBoostingClassifier(random_state = 42), pca = PCA(n_components = 0.9, random_state = 42))

Validation f1 scores: [0.5008025682182985, 0.5060506050605061, 0.5387618516452871, 0.5278236914600551, 0.5482340537691092]
Mean f1 score:  0.5243345540306512


## Model 4:

In [175]:
rfc = RandomForestClassifier(n_estimators = 10, random_state = 42)
scale_balance_model(X_train_encoded, y_train, rfc, pca = PCA(n_components = 0.9, random_state = 42))

Validation f1 scores: [0.43039591315453385, 0.46737683089214377, 0.45525291828793774, 0.46252402306213963, 0.5]
Mean f1 score:  0.46310993707935105


## Model 5:

In [176]:
rfc = RandomForestClassifier(n_estimators = 10, random_state = 42)
scale_balance_model(X_train_encoded, y_train, rfc)

Validation f1 scores: [0.43551797040169127, 0.4535367545076283, 0.46015246015246014, 0.46587926509186356, 0.49835201054713246]
Mean f1 score:  0.46268769214015515


## Model 6:

In [177]:
scale_balance_model(X_train_encoded, y_train, GradientBoostingClassifier(learning_rate = 1, max_depth = 5, random_state = 42), pca = PCA(n_components = 0.9, random_state = 42))

Validation f1 scores: [0.411829134720701, 0.43579766536964976, 0.44504896626768226, 0.4291845493562232, 0.4450373532550694]
Mean f1 score:  0.4333795337938652


## Model 7:

In [178]:
scale_balance_model(X_train_encoded, y_train, GradientBoostingClassifier(learning_rate = 0.15, max_depth = 5, n_estimators = 250, max_features = 'auto'), pca = PCA(n_components = 0.9, random_state = 42))

Validation f1 scores: [0.4734133790737564, 0.4753623188405797, 0.488558352402746, 0.5002875215641173, 0.503102086858432]
Mean f1 score:  0.4881447317479263


## Model 8:

In [None]:


validation_f1 = []
    pca = pca
    
    for train_ind, val_ind in kf.split(X_train, y_train):
        X_t, y_t = X_train.iloc[train_ind], y_train.iloc[train_ind]
        X_val, y_val = X_train.iloc[val_ind], y_train.iloc[val_ind]
        
        # instantiate and fit/transform scaler
        scaler = scaler
        X_t_sc = scaler.fit_transform(X_t)
        X_val_sc = scaler.transform(X_val)
        
        # instantiate and fit SMOTE:
        if balance == 'SMOTE':
            smote = SMOTE(random_state = 15)
            X_t_resampled, y_t_resampled = smote.fit_resample(X_t_sc, y_t)
        else:
            print("don't know how to implement different balancing here lol")
        
        if pca != None:
            pca.fit(X_t_resampled)

            # Your code here
            X_t_pca = pca.transform(X_t_resampled)


            # Create a dataframe from this array of transformed features 
            X_t_pca = pd.DataFrame(X_t_pca)
            
            

            # transform pca on scaled test data:
            X_val_pca = pca.transform(X_val_sc)
        
        if pca != None:
            model.fit(X_t_pca, y_t_resampled)
            validation_f1.append(f1_score(y_val, model.predict(X_val_pca)))
        else:
            model.fit(X_t_resampled, y_t_resampled)
            validation_f1.append(f1_score(y_val, model.predict(X_val_sc)))
        
        
    print(f"Validation f1 scores: {validation_f1}")
    print(f"Mean f1 score:  {np.mean(validation_f1)}")