## 處理遺漏值

In [1]:
import pandas as pd
from io import StringIO

In [11]:
# Toy CSV kept in memory. The empty fields and the literal "NaN" entries are the
# missing values this section works with.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
NaN,NaN,NaN,NaN'''
# Wrap the string in a file-like buffer so read_csv can parse it like a file.
with StringIO(csv_data) as csv_buffer:
    df = pd.read_csv(csv_buffer)

In [12]:
# Display the parsed frame: empty CSV fields and the literal "NaN" entries both show up as missing.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,,,,


In [13]:
df.isnull()
# Flag every cell that holds a missing value (True = missing)

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True
3,True,True,True,True


In [14]:
df.isnull().sum()
# Count the missing values in each column

A    1
B    1
C    2
D    2
dtype: int64

### 刪除遺漏值  

In [15]:
df.dropna()
# Drops every row that contains at least one NaN (three rows here, only row 0 survives); default is axis=0

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [16]:
# axis=0 (rows) is the default, so this gives the same result as a bare df.dropna()
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [17]:
# axis=1 drops columns instead of rows; every column here contains a NaN, so no column is left
df.dropna(axis=1)

0
1
2
3


In [18]:
# Drop a row only when every one of its columns is NaN
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [19]:
# Keep only rows with at least 4 non-NaN values (rows with fewer are dropped)
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [20]:
# Drop a row only when NaN appears in one of the listed columns (here: 'C')
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


### Impute (補值) 

In [21]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22; `sklearn.impute.SimpleImputer` (scikit-learn >= 0.20) is its replacement.
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its column. SimpleImputer always imputes
# column-wise (what `axis=0` meant on the old Imputer) and uses np.nan as the
# missing-value marker by default, so neither needs to be passed explicitly.
imr = SimpleImputer(strategy='mean')
# Fit and transform on the same plain ndarray so both calls see the same input type.
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1.        ,   2.        ,   3.        ,   4.        ],
       [  5.        ,   6.        ,   7.5       ,   8.        ],
       [ 10.        ,  11.        ,  12.        ,   6.        ],
       [  5.33333333,   6.33333333,   7.5       ,   6.        ]])

In [22]:
# The frame itself is unchanged: the imputed values live only in `imputed_data`, the NaNs are still here
df.values

array([[  1.,   2.,   3.,   4.],
       [  5.,   6.,  nan,   8.],
       [ 10.,  11.,  12.,  nan],
       [ nan,  nan,  nan,  nan]])

### 處理分類數據


In [23]:
import pandas as pd

# Toy product table with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


# 有序特徵 ex:size , XL＞L＞M  
# 名目特徵 ex:color

### 對應有序特徵

In [24]:
# Ordinal encoding for the size feature: a larger size gets a larger integer.
size_mapping = dict(XL=3, L=2, M=1)

# The column is overwritten in place, so run this cell once per freshly built `df`.
encoded_size = df['size'].map(size_mapping)
df['size'] = encoded_size
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [26]:
# Reverse mapping: integer code -> size label
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))

In [27]:
# Inspect the reversed mapping (integer code -> size label)
inv_size_mapping

{1: 'M', 2: 'L', 3: 'XL'}

In [28]:
# Decode the integers back to size labels; the result is only displayed, `df` is not modified
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

### 將類別標籤　轉成　整數值

In [30]:
# The class labels are still strings at this point
df['classlabel']

0    class1
1    class2
2    class1
Name: classlabel, dtype: object

In [32]:
import numpy as np

# Distinct class labels present in the column
np.unique(df['classlabel'])

array(['class1', 'class2'], dtype=object)

In [33]:
# enumerate() returns an iterator object, so displaying it shows only its repr, not the (index, label) pairs
enumerate(df['classlabel'])

<enumerate at 0x9fb99d8>

In [34]:
# scikit-learn does this conversion automatically in most cases; here the
# label -> integer mapping is built by hand from the distinct labels.
unique_labels = np.unique(df['classlabel'])
class_mapping = {label: code for code, label in enumerate(unique_labels)}
class_mapping

{'class1': 0, 'class2': 1}

In [35]:
# The built-in function enumerate() returns an enumerate object that pairs the items of the
# iterable argument with consecutive integers; `start` is the first integer and defaults to 0.
d = ['Spring', 'Summer', 'Fall', 'Winter']
for i, j in enumerate(d, 1):
    print(i, j)

1 Spring
2 Summer
3 Fall
4 Winter


In [36]:
# Encode the string class labels as integers using class_mapping (overwrites the column)
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [37]:
# Invert the mapping (integer -> label) and put the original string labels back.
inv_class_mapping = {code: label for label, code in class_mapping.items()}
restored_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = restored_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [38]:
# In practice, using scikit-learn's LabelEncoder directly is quicker.

from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary, then encode the same labels as integers.
class_labels = df['classlabel'].values
class_le = LabelEncoder().fit(class_labels)
y = class_le.transform(class_labels)
y

array([0, 1, 0], dtype=int64)

In [39]:
# Map the integer codes back to the original string labels
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

### 對名目特徵做　one-hot encoding

In [40]:
# Feature matrix as a NumPy array; the class label column is left out.
feature_columns = ['color', 'size', 'price']
X = df[feature_columns].values
X

array([['green', 1, 10.1],
       ['red', 2, 13.5],
       ['blue', 3, 15.3]], dtype=object)

In [41]:
# Integer-encode the color column (column 0 of X), writing the codes back in place.
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# `OneHotEncoder(categorical_features=...)` was removed in scikit-learn 0.22.
# Selecting which columns to encode is now the job of ColumnTransformer
# (scikit-learn >= 0.20): one-hot encode column 0 (color) and pass the
# remaining columns (size, price) through untouched.
ohe = ColumnTransformer(
    transformers=[('color_onehot', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array, so no .toarray() is needed
)
# X is an object array, so the stacked result is too; cast to float to get the
# same numeric array the old API produced.
ohe.fit_transform(X).astype(float)

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   2. ,  13.5],
       [  1. ,   0. ,   0. ,   3. ,  15.3]])

In [43]:
# pandas alternative: get_dummies expands only the string column (color) into indicator
# columns; the numeric columns (price, size) are kept as they are
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


# 將數據集分成訓練集和測試集

In [None]:

# UCI Wine dataset. The file is read with header=None, so the column names
# are assigned explicitly afterwards.
wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine_columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium', 'Total phenols',
    'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
    'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
    'Proline',
]

df_wine = pd.read_csv(wine_url, header=None)
df_wine.columns = wine_columns

# Show which class labels occur, then preview the first rows.
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

In [None]:
from sklearn.model_selection import train_test_split

# Column 0 is the class label; all remaining columns are the features.
y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values

# Hold out 30% of the samples as the test set; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

## 縮放特徵 


In [None]:
# Min-max scaling ("normalized")
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling parameters from the training set only, then apply the
# same fitted scaler to both the training and the test set.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

X_test_norm

In [None]:
# Standardization scaling ("standardized")
from sklearn.preprocessing import StandardScaler

# Fit on the training set only, then reuse the fitted scaler for the test set.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [None]:
# Side-by-side comparison of the two scalings on the integers 0..5.
ex_input = pd.Series([0, 1, 2, 3, 4, 5])

ex = pd.DataFrame(
    {
        'input': ex_input,
        # standardize: subtract the mean, divide by the population std (ddof=0)
        'standardized': (ex_input - ex_input.mean()) / ex_input.std(ddof=0),
        # normalize: rescale linearly so min -> 0 and max -> 1
        'normalized': (ex_input - ex_input.min()) / (ex_input.max() - ex_input.min()),
    },
    columns=['input', 'standardized', 'normalized'],
)
ex