Prepare the **Credit Approval data set**, to leave it more suitable for the demos of the recipes from chapter 3.

In [45]:
import random
import pandas as pd
import numpy as np

In [46]:
data = pd.read_csv('crx.csv')
data.head()

Unnamed: 0,b,30.83,0,u,g,w,v,1.25,t,t.1,01,f,g.1,00202,0.1,+
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360,0,+


In [47]:
data.shape 

(689, 16)

In [48]:
for col in data.columns:
    print(col, type(col))

b <class 'str'>
30.83 <class 'str'>
0 <class 'str'>
u <class 'str'>
g <class 'str'>
w <class 'str'>
v <class 'str'>
1.25 <class 'str'>
t <class 'str'>
t.1 <class 'str'>
01 <class 'str'>
f <class 'str'>
g.1 <class 'str'>
00202 <class 'str'>
0.1 <class 'str'>
+ <class 'str'>


**Create variable names according to UCI Machine Learning! Re-cast some variables to the correct types!**

In [49]:
varnames = ['A'+str(s) for s in range(1,17)]
data.columns = varnames
data = data.replace('?', np.nan)  # replace ? by np.nan
data['A2'] = data['A2'].astype('float')
data['A14'] = data['A14'].astype('float')
data['A16'] = data['A16'].map({'+':1, '-':0})  # encode target to binary
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
2,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360.0,0,1


**Add more missing values to random positions! I will help with the demos of the recipes!**

In [50]:
random.seed(9001)
values = set([random.randint(0, len(data)) for p in range(0, 100)])
for var in ['A3', 'A8', 'A9', 'A10']:
    data.loc[values, var] = np.nan
data.isnull().sum()

A1     12
A2     12
A3     92
A4      6
A5      6
A6      9
A7      9
A8     92
A9     92
A10    92
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

**Save the data!**

In [51]:
data.to_csv('creditApprovalUCI.csv', index=False)

In [52]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
1,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
2,b,27.83,,u,g,w,v,,,,5,t,g,100.0,3,1
3,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1
4,b,32.08,4.0,u,g,m,v,2.5,t,f,0,t,g,360.0,0,1


In [53]:
cat_cols = [c for c in data.columns if data[c].dtypes=='O']
data[cat_cols].head()  # categorical variables

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
0,a,u,g,q,h,t,t,f,g
1,a,u,g,q,h,t,f,f,g
2,b,u,g,w,v,,,t,g
3,b,u,g,w,v,t,f,f,s
4,b,u,g,m,v,t,f,t,g


In [54]:
num_cols = [c for c in data.columns if data[c].dtypes!='O']
data[num_cols].head()  # numerical variables

Unnamed: 0,A2,A3,A8,A11,A14,A15,A16
0,58.67,4.46,3.04,6,43.0,560,1
1,24.5,0.5,1.5,0,280.0,824,1
2,27.83,,,5,100.0,3,1
3,20.17,5.625,1.71,0,120.0,0,1
4,32.08,4.0,2.5,0,360.0,0,1
