## 결측치
* 데이터가 없으면 sample을 drop
* 결측값 개수의 기준(threshold)을 정해서, 그 기준 이상 비어 있는 sample을 drop (데이터의 개수가 별로 없을 때)
* 데이터가 거의 없는 feature를 drop (feature=column)
* 최빈값, 평균값으로 데이터를 채움

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Toy roster with deliberately missing entries: row 1 is entirely NaN and
# row 2 lacks both test scores.
raw_data = {
    'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    'age': [42, np.nan, 36, 24, 73],
    'sex': ['m', np.nan, 'f', 'm', 'f'],
    'preTestScore': [4, np.nan, np.nan, 2, 3],
    'postTestScore': [25, np.nan, np.nan, 62, 70],
}
# The dict's key order is the intended column order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [17]:
# Check for null values.
# Per-column count of nulls (a notebook echoes only the last expression of a
# cell, so this result is not displayed).
df.isnull().sum()
# Per-column fraction of nulls: the mean of the boolean mask equals
# (number of nulls) / (number of rows).
df.isnull().mean()

first_name       0.2
last_name        0.2
age              0.2
sex              0.2
preTestScore     0.4
postTestScore    0.4
location         1.0
dtype: float64

In [8]:
# Simplest approach: discard every row that contains at least one null.
df_no_missing = df.dropna(axis=0, how='any')
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [9]:
# Drop a row only when every one of its values is null.
df_cleaned = df.dropna(axis=0, how='all')
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [14]:
# Add a 'location' column that is NaN in every row (a feature with no data).
df['location']=np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [16]:
# Drop every column that has fewer than 4 non-null values.
df.dropna(axis='columns', thresh=4)

Unnamed: 0,first_name,last_name,age,sex
0,Jason,Miller,42.0,m
1,,,,
2,Tina,Ali,36.0,f
3,Jake,Milner,24.0,m
4,Amy,Cooze,73.0,f


In [28]:
# Candidate fill values for 'preTestScore'. Only the last expression of the
# cell is echoed, so the output shown is the mode.
df['preTestScore'].mean() # mean
df['preTestScore'].median() # median
df['preTestScore'].mode() # mode (most frequent value)

0    3.0
dtype: float64

In [21]:
# Returns a new frame; df is not reassigned here. Every column is filled,
# including the string columns.
df.fillna(0) # fill nulls with 0

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


In [23]:
# Replace missing 'preTestScore' values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: the in-place form acts on an
# intermediate object and is not guaranteed to update df.
df['preTestScore'] = df['preTestScore'].fillna(df['preTestScore'].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [32]:
# Fill missing 'postTestScore' values with the mean score of the row's 'sex'
# group. Rows whose 'sex' is null belong to no group and therefore stay NaN.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection, which is not guaranteed to
# update df.
sex_group_mean = df.groupby("sex")["postTestScore"].transform("mean")
df["postTestScore"] = df["postTestScore"].fillna(sex_group_mean)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [33]:
# Keep only the rows where both 'age' and 'sex' are present.
df.dropna(subset=['age', 'sex'])

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


## 이산형 데이터를 어떻게 처리할까?
* One-Hot Encoding

In [34]:
# Small edge list: three integer columns plus one categorical 'color' column.
edges = pd.DataFrame(
    [
        (0, 2, 3, 'red'),
        (1, 2, 4, 'blue'),
        (2, 3, 5, 'blue'),
    ],
    columns=['source', 'target', 'weight', 'color'],
)

edges

Unnamed: 0,source,target,weight,color
0,0,2,3,red
1,1,2,4,blue
2,2,3,5,blue


In [35]:
# One-hot encode the non-numeric columns of edges; numeric columns pass
# through unchanged.
# NOTE(review): newer pandas releases may emit boolean dummy columns instead
# of 0/1 — pass dtype=int if integers are required; confirm against the
# installed version.
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [36]:
# One-hot encode a single column; the dummy columns are named after the
# category values themselves (no column-name prefix).
pd.get_dummies(edges['color'])

Unnamed: 0,blue,red
0,0,1
1,1,0
2,1,0


In [37]:
# Translate each numeric weight into a size label and store it as a new
# 'weight_sign' column.
weight_dict = {3:"M", 4:"L", 5:"XL"}
edges["weight_sign"] = edges["weight"].map(weight_dict)
edges

Unnamed: 0,source,target,weight,color,weight_sign
0,0,2,3,red,M
1,1,2,4,blue,L
2,2,3,5,blue,XL


In [38]:
# One-hot encode the size labels into their own frame.
weight_sign = pd.get_dummies(edges["weight_sign"])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [40]:
# Place the dummy columns next to the original columns (axis=1 joins
# column-wise); the result is a new frame and edges is not modified.
pd.concat([edges, weight_sign], axis=1)

Unnamed: 0,source,target,weight,color,weight_sign,L,M,XL
0,0,2,3,red,M,0,1,0
1,1,2,4,blue,L,1,0,0
2,2,3,5,blue,XL,0,0,1


In [44]:
# One-hot encode edges and return the result as a plain NumPy array.
pd.get_dummies(edges).values

array([[0, 2, 3, 0, 1, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1]], dtype=int64)

## Data Binning
* 구간별로 나눠줌

In [46]:
# Twelve soldiers: three regiments of four, each split into two companies.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'name': [
        'Miller', 'Jacobson', 'Ali', 'Milner',
        'Cooze', 'Jacon', 'Ryaner', 'Sone',
        'Sloan', 'Piger', 'Riani', 'Ali',
    ],
    'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
}
# The dict's key order is the intended column order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [47]:
# Bin edges for the score bands. Intervals are closed on the right, so a
# score of exactly 25 lands in 'Low'.
bins = [0, 25, 50, 75, 100]
group_names = ['Low', 'Okay', 'Good', 'Great']
categories = pd.cut(df['postTestScore'], bins=bins, labels=group_names)
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Good
Name: postTestScore, dtype: category
Categories (4, object): [Low < Okay < Good < Great]

In [48]:
# Store the score band of every row as a categorical column, then count the
# rows per band. Bands with no rows are still listed with a count of 0.
df['categories'] = pd.cut(df['postTestScore'], bins, labels=group_names)
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
df['categories'].value_counts()

Good     8
Great    2
Low      2
Okay     0
Name: categories, dtype: int64

In [49]:
# Display the frame, which now carries the 'categories' column.
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,categories
0,Nighthawks,1st,Miller,4,25,Low
1,Nighthawks,1st,Jacobson,24,94,Great
2,Nighthawks,2nd,Ali,31,57,Good
3,Nighthawks,2nd,Milner,2,62,Good
4,Dragoons,1st,Cooze,3,70,Good
5,Dragoons,1st,Jacon,4,25,Low
6,Dragoons,2nd,Ryaner,24,94,Great
7,Dragoons,2nd,Sone,31,57,Good
8,Scouts,1st,Sloan,2,62,Good
9,Scouts,1st,Piger,3,70,Good


In [51]:
# Remove the 'name' column from df in place.
df.drop(columns=['name'], inplace=True)

In [52]:
# One-hot encode regiment, company and categories; the numeric score columns
# are kept as they are.
pd.get_dummies(df)

Unnamed: 0,preTestScore,postTestScore,regiment_Dragoons,regiment_Nighthawks,regiment_Scouts,company_1st,company_2nd,categories_Low,categories_Okay,categories_Good,categories_Great
0,4,25,0,1,0,1,0,1,0,0,0
1,24,94,0,1,0,1,0,0,0,0,1
2,31,57,0,1,0,0,1,0,0,1,0
3,2,62,0,1,0,0,1,0,0,1,0
4,3,70,1,0,0,1,0,0,0,1,0
5,4,25,1,0,0,1,0,1,0,0,0
6,24,94,1,0,0,0,1,0,0,0,1
7,31,57,1,0,0,0,1,0,0,1,0
8,2,62,0,0,1,1,0,0,0,1,0
9,3,70,0,0,1,1,0,0,0,1,0


## using scikit-learn preprocessing


In [59]:
# Plain NumPy view of the frame's contents (object dtype, because the columns
# mix strings and numbers).
raw_example = df.to_numpy()
raw_example

array([['Nighthawks', '1st', 4, 25, 'Low'],
       ['Nighthawks', '1st', 24, 94, 'Great'],
       ['Nighthawks', '2nd', 31, 57, 'Good'],
       ['Nighthawks', '2nd', 2, 62, 'Good'],
       ['Dragoons', '1st', 3, 70, 'Good'],
       ['Dragoons', '1st', 4, 25, 'Low'],
       ['Dragoons', '2nd', 24, 94, 'Great'],
       ['Dragoons', '2nd', 31, 57, 'Good'],
       ['Scouts', '1st', 2, 62, 'Good'],
       ['Scouts', '1st', 3, 70, 'Good'],
       ['Scouts', '2nd', 2, 62, 'Good'],
       ['Scouts', '2nd', 3, 70, 'Good']], dtype=object)

In [55]:
# Work on a copy so that raw_example keeps the original labels.
data = raw_example.copy()

In [56]:
from sklearn import preprocessing
# Encoder that maps each distinct label to an integer code.
le = preprocessing.LabelEncoder()

In [58]:
# Column 0 holds the regiment names.
raw_example[:,0]

array(['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons',
       'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts',
       'Scouts'], dtype=object)

In [60]:
# Learn the set of distinct regiment labels.
le.fit(raw_example[:,0])

LabelEncoder()

In [61]:
# Labels learned by fit(); a label's integer code is its position in this array.
le.classes_

array(['Dragoons', 'Nighthawks', 'Scouts'], dtype=object)

In [62]:
# Convert the regiment names to their integer codes.
le.transform(raw_example[:,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])

In [63]:
# Overwrite column 0 of the working copy with integer codes; raw_example is
# left untouched.
data[:,0] = le.transform(raw_example[:,0])
data[:3]

array([[1, '1st', 4, 25, 'Low'],
       [1, '1st', 24, 94, 'Great'],
       [1, '2nd', 31, 57, 'Good']], dtype=object)

In [65]:
# Label-encode several columns of `data`, keeping one fitted encoder per
# column so the same mapping can be re-applied (or inverted) later.
# NOTE(review): index 2 is the numeric preTestScore column; label-encoding it
# replaces the scores with rank-like codes — confirm this is intended.
label_column = [0, 1, 2, 4]
label_enconder_list = []  # i-th encoder belongs to column label_column[i]
for column_index in label_column:
    # A loop-local name is used so the notebook-level `le` from the earlier
    # cell is neither overwritten nor deleted.
    column_encoder = preprocessing.LabelEncoder()
    # fit_transform learns the labels and encodes them in a single call.
    data[:, column_index] = column_encoder.fit_transform(raw_example[:, column_index])
    label_enconder_list.append(column_encoder)
data[:3]

array([[1, 0, 2, 25, 2],
       [1, 0, 3, 94, 1],
       [1, 1, 4, 57, 0]], dtype=object)

In [66]:
# Reuse the stored encoder for column 0 on the first ten raw rows.
label_enconder_list[0].transform(raw_example[:10,0])


array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2])

In [67]:
one_hot_enc = preprocessing.OneHotEncoder()
# Reshape the encoded column into a single-column 2-D array (one row per sample).
data[:,0].reshape(-1,1)

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [69]:
# Learn the distinct codes present in column 0.
one_hot_enc.fit(data[:,0].reshape(-1,1))


OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [70]:
# transform() yields a sparse matrix here (the encoder reports sparse=True);
# toarray() converts it to a dense array with one column per code.
onehotlabels = one_hot_enc.transform(data[:,0].reshape(-1,1)).toarray()
onehotlabels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

## Feature Scaling
* Min-Max Normalization
* Z-score Normalization

### Min-Max Normalization

In [74]:
# Two numeric features on different scales plus one categorical column.
df = pd.DataFrame(
    {
        'A': [14.00, 90.20, 90.95, 96.27, 91.21],
        'B': [103.02, 107.26, 110.35, 114.23, 114.68],
        'C': ['big', 'small', 'big', 'small', 'small'],
    }
)
df

Unnamed: 0,A,B,C
0,14.0,103.02,big
1,90.2,107.26,small
2,90.95,110.35,big
3,96.27,114.23,small
4,91.21,114.68,small


In [75]:
# Min-max scale column A into the target range [1, 5]:
#   (x - min) / (max - min) * (new_max - new_min) + new_min
new_min, new_max = 1, 5
col_a = df["A"]
a_min, a_max = col_a.min(), col_a.max()
df["A"] = (col_a - a_min) / (a_max - a_min) * (new_max - new_min) + new_min
df

Unnamed: 0,A,B,C
0,1.0,103.02,big
1,4.704874,107.26,small
2,4.741339,110.35,big
3,5.0,114.23,small
4,4.753981,114.68,small


###  Z-score Normalization
* 평균을 빼고 표준편차로 나눠줌. 데이터가 정규분포를 따르면 약 95%의 값이 -1.96~1.96 사이에 놓임

In [78]:
# Z-score column B: subtract the mean, then divide by the standard deviation.
# Series.std() is the sample standard deviation (ddof=1).
col_b = df["B"]
df["B"] = (col_b - col_b.mean()) / col_b.std()
df

Unnamed: 0,A,B,C
0,1.0,-1.40525,big
1,4.704874,-0.54023,small
2,4.741339,0.090174,big
3,5.0,0.881749,small
4,4.753981,0.973556,small


In [83]:
# Load the first three columns of the wine dataset. The file has no header
# row, so the column names are assigned afterwards.
# pd.read_csv is the public entry point; the pd.io.parsers module path is an
# internal location and should not be relied on.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
    header=None,
    usecols=[0, 1, 2],
)

df.columns = ['Class label', 'Alcohol', 'Malic acid']

df

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.20,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59
...,...,...,...
173,3,13.71,5.65
174,3,13.40,3.91
175,3,13.27,4.28
176,3,13.17,2.59


In [84]:
# Fit a StandardScaler on the two feature columns; the fitted scaler is kept
# so it can be applied in the next step.
std_scaler = preprocessing.StandardScaler().fit(df[['Alcohol', 'Malic acid']])

In [86]:
# Apply the fitted scaler; the result is a NumPy array with one row per
# sample and one column per feature.
df_std = std_scaler.transform(df[['Alcohol', 'Malic acid']])
df_std

array([[ 1.51861254, -0.5622498 ],
       [ 0.24628963, -0.49941338],
       [ 0.19687903,  0.02123125],
       [ 1.69154964, -0.34681064],
       [ 0.29570023,  0.22769377],
       [ 1.48155459, -0.51736664],
       [ 1.71625494, -0.4186237 ],
       [ 1.3086175 , -0.16727801],
       [ 2.25977152, -0.62508622],
       [ 1.0615645 , -0.88540853],
       [ 1.3580281 , -0.15830138],
       [ 1.38273339, -0.76871232],
       [ 0.92568536, -0.54429654],
       [ 2.16095032, -0.54429654],
       [ 1.70390229, -0.4186237 ],
       [ 0.77745356, -0.47248348],
       [ 1.60508109, -0.37374054],
       [ 1.02450655, -0.68792264],
       [ 1.46920194, -0.66996938],
       [ 0.78980621,  0.68550197],
       [ 1.3086175 , -0.63406285],
       [-0.08723191,  1.31386618],
       [ 0.87627476, -0.42760033],
       [-0.18605311, -0.66099274],
       [ 0.61686912, -0.47248348],
       [ 0.06099988, -0.25704433],
       [ 0.48098997, -0.50839001],
       [ 0.36981612, -0.55327317],
       [ 1.07391715,

In [88]:
# Fit a MinMaxScaler on the two feature columns.
minmax_scaler = preprocessing.MinMaxScaler().fit(df[['Alcohol', 'Malic acid']])
# NOTE(review): this rebinds minmax_scaler from the fitted scaler to the
# transformed array, so the scaler object is no longer reachable by name
# afterwards. Consider a separate name for the array if the scaler is needed again.
minmax_scaler =minmax_scaler.transform(df[['Alcohol', 'Malic acid']])
minmax_scaler

array([[0.84210526, 0.1916996 ],
       [0.57105263, 0.2055336 ],
       [0.56052632, 0.3201581 ],
       [0.87894737, 0.23913043],
       [0.58157895, 0.36561265],
       [0.83421053, 0.20158103],
       [0.88421053, 0.22332016],
       [0.79736842, 0.27865613],
       [1.        , 0.17786561],
       [0.74473684, 0.12055336],
       [0.80789474, 0.28063241],
       [0.81315789, 0.14624506],
       [0.71578947, 0.19565217],
       [0.97894737, 0.19565217],
       [0.88157895, 0.22332016],
       [0.68421053, 0.21146245],
       [0.86052632, 0.23320158],
       [0.73684211, 0.16403162],
       [0.83157895, 0.16798419],
       [0.68684211, 0.46640316],
       [0.79736842, 0.17588933],
       [0.5       , 0.60474308],
       [0.70526316, 0.22134387],
       [0.47894737, 0.16996047],
       [0.65      , 0.21146245],
       [0.53157895, 0.25889328],
       [0.62105263, 0.20355731],
       [0.59736842, 0.19367589],
       [0.74736842, 0.22924901],
       [0.78684211, 0.18577075],
       [0.