# Prediciting Credit Card Approvals Using RandomForest
고객들의 여러가지 특성(많은 대출, 낮은 수입, 너무 많은 리포트) 등등의 특징들로부터 Card Approval 여부를 예측하는 프로젝트이다.

### Data Preprocessing

- missing data는 `NaN` 값으로 대체
- missing continuous values 는  column의 mean으로 대체
- missing categorical values 는 column의 mode으로 대체
- 표준편차 방법을 사용해서 Remove outliers 
- One Hot Encoding of Categorical features

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [4]:
df = pd.read_csv('cc_approvals.data', header=None)

In [5]:
# replace missing values with NaN value
df = df.replace('?', np.nan)
df.describe()

Unnamed: 0,2,7,10,14
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [7]:
# features 2, 7, 10, and 14 are continuous.
df.count()

0     678
1     678
2     690
3     684
4     684
5     681
6     681
7     690
8     690
9     690
10    690
11    690
12    690
13    677
14    690
15    690
dtype: int64

In [9]:
# 10 samples 해서 error가 하나 이상나오면 제거
nan_vals = dict(df.count(axis=1))
nan_vals = {key:value for (key,value) in nan_vals.items() if value < 15}
nan_vals

{206: 11,
 270: 11,
 330: 11,
 445: 14,
 456: 11,
 479: 13,
 539: 14,
 592: 11,
 601: 13,
 622: 11}

In [10]:
# drop the erroneous records from the data frame
df = df.drop(index=nan_vals.keys())

In [11]:
len(df)

680

In [13]:
# missing categorical values를 colum의 mode 값으로 채움\
fill = pd.Series(df.mode().values.flatten())
df = df.fillna(fill)

In [14]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [15]:
df[1] = df[1].astype('float64')
df[13] = df[13].astype('int64')

In [16]:
df.dtypes

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13      int64
14      int64
15     object
dtype: object

In [17]:
df.describe()

Unnamed: 0,1,2,7,10,13,14
count,680.0,680.0,680.0,680.0,680.0,680.0
mean,31.309824,4.797515,2.255184,2.435294,182.864706,1024.198529
std,11.718569,4.986291,3.360643,4.889825,174.085346,5244.64216
min,13.75,0.0,0.0,0.0,0.0,0.0
25%,22.67,1.0,0.165,0.0,70.0,0.0
50%,28.17,2.855,1.0,0.0,160.0,5.0
75%,37.5625,7.5,2.75,3.0,274.5,400.0
max,76.75,28.0,28.5,67.0,2000.0,100000.0


In [18]:
# mean 으로 부터 std * 3 인 outlier들 제거

df[(df[1] < 3*11.719 + 31.310) &
   (df[2] < 3*4.986 + 4.798) & 
   (df[7] < 3*3.361 + 2.255) & 
   (df[10] < 3*4.890 + 2.435) &
   (df[13] < 3*174.085 + 182.865) &
   (df[14] < 3*5344.642 + 1024.199)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.000,u,g,w,v,1.250,t,t,1,f,g,202,0,+
1,a,58.67,4.460,u,g,q,h,3.040,t,t,6,f,g,43,560,+
2,a,24.50,0.500,u,g,q,h,1.500,t,f,0,f,g,280,824,+
3,b,27.83,1.540,u,g,w,v,3.750,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.710,t,f,0,f,s,120,0,+
5,b,32.08,4.000,u,g,m,v,2.500,t,f,0,t,g,360,0,+
7,a,22.92,11.585,u,g,cc,v,0.040,t,f,0,f,g,80,1349,+
8,b,54.42,0.500,y,p,k,h,3.960,t,f,0,f,g,180,314,+
9,b,42.50,4.915,y,p,w,v,3.165,t,f,0,t,g,52,1442,+
10,b,22.08,0.830,u,g,c,h,2.165,f,f,0,t,g,128,0,+


In [19]:
# one hot encoding

df = pd.get_dummies(df)
df.head()

Unnamed: 0,1,2,7,10,13,14,0_a,0_b,3_l,3_u,...,8_t,9_f,9_t,11_f,11_t,12_g,12_p,12_s,15_+,15_-
0,30.83,0.0,1.25,1,202,0,0,1,0,1,...,1,0,1,1,0,1,0,0,1,0
1,58.67,4.46,3.04,6,43,560,1,0,0,1,...,1,0,1,1,0,1,0,0,1,0
2,24.5,0.5,1.5,0,280,824,1,0,0,1,...,1,1,0,1,0,1,0,0,1,0
3,27.83,1.54,3.75,5,100,3,0,1,0,1,...,1,0,1,0,1,1,0,0,1,0
4,20.17,5.625,1.71,0,120,0,0,1,0,1,...,1,1,0,1,0,0,0,1,1,0


In [20]:
# drop the duplicate output label column
df = df.drop(columns='15_-')
df.head()

Unnamed: 0,1,2,7,10,13,14,0_a,0_b,3_l,3_u,...,8_f,8_t,9_f,9_t,11_f,11_t,12_g,12_p,12_s,15_+
0,30.83,0.0,1.25,1,202,0,0,1,0,1,...,0,1,0,1,1,0,1,0,0,1
1,58.67,4.46,3.04,6,43,560,1,0,0,1,...,0,1,0,1,1,0,1,0,0,1
2,24.5,0.5,1.5,0,280,824,1,0,0,1,...,0,1,1,0,1,0,1,0,0,1
3,27.83,1.54,3.75,5,100,3,0,1,0,1,...,0,1,0,1,0,1,1,0,0,1
4,20.17,5.625,1.71,0,120,0,0,1,0,1,...,0,1,1,0,1,0,0,0,1,1


### RandomForest 

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

In [22]:
df = df.iloc[:,1:] # drop first row of index values
df = df.sample(frac=1) # shuffle rows
df.head()

Unnamed: 0,2,7,10,13,14,0_a,0_b,3_l,3_u,3_y,...,8_f,8_t,9_f,9_t,11_f,11_t,12_g,12_p,12_s,15_+
163,1.75,0.04,0,393,0,0,1,0,0,1,...,0,1,1,0,0,1,1,0,0,1
192,0.96,2.5,0,510,600,0,1,0,1,0,...,0,1,1,0,1,0,1,0,0,1
50,0.5,0.875,0,491,0,1,0,0,1,0,...,0,1,1,0,0,1,1,0,0,1
608,0.04,4.25,0,460,0,0,1,0,0,1,...,1,0,1,0,0,1,1,0,0,0
403,0.335,0.75,0,160,0,1,0,0,1,0,...,1,0,1,0,1,0,0,0,1,0


In [23]:
# split data into its X and y components
X, y = df.values[:,:-1], df.values[:,-1]

In [24]:
clf = RandomForestClassifier(n_estimators=50)

In [25]:
folds = len(X)
kf = KFold(n_splits=folds)

ave = 0

for (train_index, test_index) in kf.split(X):
    clf.fit(X[train_index], y[train_index])
    ave += clf.score(X[test_index], y[test_index])
    
ave /= folds

### Accuracy

In [26]:
print('Random Forest Model w/ LOOCV:', ave)


Random Forest Model w/ LOOCV: 0.8794117647058823
