In [4]:
import random
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# frameworks for ML
from sklearn.pipeline import make_pipeline


# transformers for category variables
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


# transformers for numerical variables
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer


# transformers for combined variables
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures


# user-defined transformers
from sklearn.preprocessing import FunctionTransformer


# classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [63]:
# Toy frame for the encoding demos: one categorical column ('pet') next to
# two numeric columns ('age' as floats, 'salary' as ints).
testdata = pd.DataFrame(
    dict(
        pet=['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
        age=[4.0, 6.0, 3.0, 3.0, 2.0, 3.0, 5.0, 4.0],
        salary=[90, 24, 44, 27, 32, 59, 36, 27],
    )
)
testdata

Unnamed: 0,age,pet,salary
0,4.0,cat,90
1,6.0,dog,24
2,3.0,dog,44
3,3.0,fish,27
4,2.0,cat,32
5,3.0,dog,59
6,5.0,cat,36
7,4.0,fish,27


In [64]:
# One-hot encode the frame: the string column 'pet' is expanded into one
# indicator column per distinct value, the numeric columns pass through.
# Note that this rebinds `testdata` to the encoded frame.
testdata = pd.get_dummies(data=testdata)
testdata

Unnamed: 0,age,salary,pet_cat,pet_dog,pet_fish
0,4.0,90,1,0,0
1,6.0,24,0,1,0
2,3.0,44,0,1,0
3,3.0,27,0,0,1
4,2.0,32,1,0,0
5,3.0,59,0,1,0
6,5.0,36,1,0,0
7,4.0,27,0,0,1


In [65]:
# Same toy frame as before, except that row 1 now carries two pets in a
# single cell, joined by the ',,,' separator.
testdata = pd.DataFrame(
    dict(
        pet=['cat', 'dog,,,cat', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
        age=[4.0, 6.0, 3.0, 3.0, 2.0, 3.0, 5.0, 4.0],
        salary=[90, 24, 44, 27, 32, 59, 36, 27],
    )
)
testdata

Unnamed: 0,age,pet,salary
0,4.0,cat,90
1,6.0,"dog,,,cat",24
2,3.0,dog,44
3,3.0,fish,27
4,2.0,cat,32
5,3.0,dog,59
6,5.0,cat,36
7,4.0,fish,27


In [66]:
# Split every 'pet' cell on the ',,,' separator and build one indicator
# column per individual pet, so a multi-valued cell sets several columns.
se = testdata.loc[:, 'pet']
dum = se.str.get_dummies(',,,')
dum

Unnamed: 0,cat,dog,fish
0,1,0,0
1,1,1,0
2,0,1,0
3,0,0,1
4,1,0,0
5,0,1,0
6,1,0,0
7,0,0,1


Unnamed: 0,age,pet,salary
0,4.0,cat,90
1,6.0,"dog,,,cat",24
2,3.0,dog,44
3,3.0,fish,27
4,2.0,cat,32
5,3.0,dog,59
6,5.0,cat,36
7,4.0,fish,27


In [50]:
# Frame-level one-hot encoding treats each whole string as one category, so
# the combined 'dog,,,cat' cell gets an indicator column of its own.
pd.get_dummies(data=testdata)

Unnamed: 0,age,salary,"pet,,,cat","pet,,,dog","pet,,,dog,,,cat","pet,,,fish"
0,4.0,90,1,0,0,0
1,6.0,24,0,0,1,0
2,3.0,44,0,1,0,0
3,3.0,27,0,0,0,1
4,2.0,32,1,0,0,0
5,3.0,59,0,1,0,0
6,5.0,36,1,0,0,0
7,4.0,27,0,0,0,1


In [7]:
# View the frame as a NumPy array. DataFrame.as_matrix() was deprecated in
# pandas 0.23 and removed in 1.0; `.values` is the replacement named by the
# deprecation notice. Mixed column types give an object-dtype array.
testdata.values

array([[4.0, 'cat', 90],
       [6.0, 'dog', 24],
       [3.0, 'dog', 44],
       [3.0, 'fish', 27],
       [2.0, 'cat', 32],
       [3.0, 'dog', 59],
       [5.0, 'cat', 36],
       [4.0, 'fish', 27]], dtype=object)

In [9]:
# The `.values` attribute exposes the frame's contents as a NumPy array.
testdata.values

array([[4.0, 'cat', 90],
       [6.0, 'dog', 24],
       [3.0, 'dog', 44],
       [3.0, 'fish', 27],
       [2.0, 'cat', 32],
       [3.0, 'dog', 59],
       [5.0, 'cat', 36],
       [4.0, 'fish', 27]], dtype=object)

In [37]:
# Load the loan-approval table from the CSV file under ./data.
train_file = './data/loan_approval.csv'
train_data = pd.read_csv(filepath_or_buffer=train_file)
train_data

Unnamed: 0,ID,Age,Work,House,loan,approval
0,1,Young,No,No,normal,No
1,2,Young,No,No,Fair,No
2,3,Young,Yes,No,Fair,Yes
3,4,Young,Yes,Yes,normal,Yes
4,5,Young,No,No,normal,No
5,6,Middle,No,No,normal,No
6,7,Middle,No,No,Fair,No
7,8,Middle,Yes,Yes,Fair,Yes
8,9,Middle,No,Yes,perfect,Yes
9,10,Middle,No,Yes,perfect,Yes


In [29]:
# Replace every categorical column with an integer code.
#
# Series.map turns any value that is missing from the mapping into NaN
# without raising, so each key has to match the spelling in the CSV exactly,
# capitalisation included. For the same reason this cell must run exactly
# once on freshly loaded data: re-running it on already-encoded columns
# yields NaN everywhere.

# The key used to be 'yound', which matched no row; the data spells it 'Young'.
age_map = {'Young': 1, 'Middle': 2, 'Old': 3}
train_data['Age'] = train_data['Age'].map(age_map)

# The three Yes/No columns share a single mapping.
YN_map = {'No': 0, 'Yes': 1}
for yes_no_column in ('Work', 'House', 'approval'):
    train_data[yes_no_column] = train_data[yes_no_column].map(YN_map)

# The loaded table shows the loan levels 'normal', 'Fair' and 'perfect'; the
# middle key used to be 'good', which left every 'Fair' row unmapped.
loan_map = {'normal': 0, 'Fair': 1, 'perfect': 2}
train_data['loan'] = train_data['loan'].map(loan_map)
train_data

Unnamed: 0,ID,Age,Work,House,loan,approval
0,1,1,0,0,0,0
1,2,1,0,0,1,0
2,3,1,1,0,1,1
3,4,1,1,1,0,1
4,5,1,0,0,0,0
5,6,2,0,0,0,0
6,7,2,0,0,1,0
7,8,2,1,1,1,1
8,9,2,0,1,2,1
9,10,2,0,1,2,1


In [30]:
# Remove the 'ID' column from train_data in place.
del train_data['ID']
train_data

Unnamed: 0,Age,Work,House,loan,approval
0,1,0,0,0,0
1,1,0,0,1,0
2,1,1,0,1,1
3,1,1,1,0,1
4,1,0,0,0,0
5,2,0,0,0,0
6,2,0,0,1,0
7,2,1,1,1,1
8,2,0,1,2,1
9,2,0,1,2,1


In [31]:
# Pop the label column: 'approval' is removed from train_data in place and
# returned as a Series, which becomes the label vector.
label_data = train_data.pop(item='approval')
label_data

0     0
1     0
2     1
3     1
4     0
5     0
6     0
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    0
Name: approval, dtype: int64

In [32]:
# Show the columns that remain in train_data once 'approval' has been popped.
train_data

Unnamed: 0,Age,Work,House,loan
0,1,0,0,0
1,1,0,0,1
2,1,1,0,1
3,1,1,1,0
4,1,0,0,0
5,2,0,0,0
6,2,0,0,1
7,2,1,1,1
8,2,0,1,2
9,2,0,1,2


In [38]:
# One-hot encode the raw categorical table. By this point train_data has
# already been integer-encoded and has lost its 'ID' and 'approval' columns
# (see the cells above), so get_dummies would return it unchanged. Re-read
# the CSV to one-hot encode the original string columns instead.
pd.get_dummies(pd.read_csv(train_file))


Unnamed: 0,ID,Age_Middle,Age_Old,Age_Young,Work_No,Work_Yes,House_No,House_Yes,loan_Fair,loan_normal,loan_perfect,approval_No,approval_Yes
0,1,0,0,1,1,0,1,0,0,1,0,1,0
1,2,0,0,1,1,0,1,0,1,0,0,1,0
2,3,0,0,1,0,1,1,0,1,0,0,0,1
3,4,0,0,1,0,1,0,1,0,1,0,0,1
4,5,0,0,1,1,0,1,0,0,1,0,1,0
5,6,1,0,0,1,0,1,0,0,1,0,1,0
6,7,1,0,0,1,0,1,0,1,0,0,1,0
7,8,1,0,0,0,1,0,1,1,0,0,0,1
8,9,1,0,0,1,0,0,1,0,0,1,0,1
9,10,1,0,0,1,0,0,1,0,0,1,0,1
