## Encoder Way

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

data = pd.read_csv("input/mushrooms.csv")
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### 方法一：欄位值編碼
Mushrooms的資料欄位皆為為文字類別型，需要轉換成數字才能進行數值運算。最容易的方法就是針對欄位不同值給予一個特定的數字來代表。

In [2]:
# 首先要觀看欄位有多少不同的值
data['cap-shape'].value_counts()

x    3656
f    3152
k     828
b     452
s      32
c       4
Name: cap-shape, dtype: int64

In [3]:
# 每個值給一個特定的數字代碼，然後取代之
data_encode = data.copy()
values = data_encode['cap-shape'].value_counts().index
for i in range(0,len(values)):
    data_encode['cap-shape'][data_encode['cap-shape']==values[i]] = i
data_encode['cap-shape']

0       0
1       0
2       3
3       0
4       0
       ..
8119    2
8120    0
8121    1
8122    2
8123    0
Name: cap-shape, Length: 8124, dtype: object

### 方法二：使用 LabelEncoder

In [4]:
# 將LABEL型的資料數字化
data_encode = data.copy()
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
columns = data_encode.columns #這裡可以改成一個list只包含要編碼的欄位
print(columns)
for col in columns:
    data_encode[col] = labelencoder.fit_transform(data_encode[col])
    
data_encode.head()

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


### 方法三： OneHotEncoder
大部分的類別型欄位使用LabelEncode其實不合理，譬如cap-color中的顏色在同一個欄位中用數值編碼會很奇怪，譬如 [紅色=0, 綠色=1, 藍色=2, 紫色=3, 橙色=4]。直接用數值編碼會有大小之分，如藍色大於紅色？紫色跟藍色比較近，但是跟紅色比較遠？
因此比較好的編碼方法是採用OneHotEncod，也就是每個不同的值變成一個獨立的欄位，如果該值存在就編碼為1，不存在就編碼為0。

In [5]:
data['cap-surface'].value_counts()

y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64

In [6]:
data_encode = pd.DataFrame()
data_encode['cap-surface'] = data['cap-surface']
data_encode['cap-surface_y'] = data['cap-surface']
data_encode['cap-surface_y'][data['cap-surface']=='y'] = 1
data_encode['cap-surface_y'][data['cap-surface']!='y'] = 0
data_encode['cap-surface_s'] = data['cap-surface']
data_encode['cap-surface_s'][data['cap-surface']=='s'] = 1
data_encode['cap-surface_s'][data['cap-surface']!='s'] = 0
data_encode['cap-surface_f'] = data['cap-surface']
data_encode['cap-surface_f'][data['cap-surface']=='f'] = 1
data_encode['cap-surface_f'][data['cap-surface']!='f'] = 0
data_encode['cap-surface_g'] = data['cap-surface']
data_encode['cap-surface_g'][data['cap-surface']=='g'] = 1
data_encode['cap-surface_g'][data['cap-surface']!='g'] = 0
data_encode.head()

Unnamed: 0,cap-surface,cap-surface_y,cap-surface_s,cap-surface_f,cap-surface_g
0,s,0,1,0,0
1,s,0,1,0,0
2,s,0,1,0,0
3,y,1,0,0,0
4,s,0,1,0,0


In [7]:
#上面的程式也可以寫成
data_encode = pd.DataFrame()
data_encode['cap-surface'] = data['cap-surface']
for value in data['cap-surface'].value_counts().index:
    data_encode['cap-surface_%s'%value] = data['cap-surface']
    data_encode['cap-surface_%s'%value][data['cap-surface']==value] = 1
    data_encode['cap-surface_%s'%value][data['cap-surface']!=value] = 0
data_encode.head()

Unnamed: 0,cap-surface,cap-surface_y,cap-surface_s,cap-surface_f,cap-surface_g
0,s,0,1,0,0
1,s,0,1,0,0
2,s,0,1,0,0
3,y,1,0,0,0
4,s,0,1,0,0


### 方法四：get_dummies

In [8]:
data_encode = pd.get_dummies(data)
data_encode = pd.get_dummies(data, columns=['cap-shape', 'cap-surface'])
data_encode

Unnamed: 0,class,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y
0,p,n,t,p,f,c,n,k,e,e,...,0,0,0,0,0,1,0,0,1,0
1,e,y,t,a,f,c,b,k,e,c,...,0,0,0,0,0,1,0,0,1,0
2,e,w,t,l,f,c,b,n,e,c,...,1,0,0,0,0,0,0,0,1,0
3,p,w,t,p,f,c,n,n,e,e,...,0,0,0,0,0,1,0,0,0,1
4,e,g,f,n,f,w,b,k,t,e,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,n,f,n,a,c,b,y,e,?,...,0,0,0,1,0,0,0,0,1,0
8120,e,n,f,n,a,c,b,y,e,?,...,0,0,0,0,0,1,0,0,1,0
8121,e,n,f,n,a,c,b,n,e,?,...,0,0,1,0,0,0,0,0,1,0
8122,p,n,f,y,f,c,n,b,t,?,...,0,0,0,1,0,0,0,0,0,1


In [9]:
data_encode = pd.get_dummies(data)
data_encode

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
8121,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
