In [1]:
import numpy as np
import pandas as pd
from io import StringIO

In [2]:
# In-memory CSV used as the demo data set. The second data row has empty
# fields and the last row is one field short, so the parsed frame contains
# missing values.
data = "\n".join([
    "A,B,C,D",
    "1.0,2.0,3.0,4.0",
    ",6.0,,8.0",
    "10.0,11.0,12.0",
])

In [18]:
# Wrap the CSV text in a file-like buffer so pandas can parse it, then
# display the resulting frame.
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,,6.0,,8.0
2,10.0,11.0,12.0,


# 1 查詢遺失值

In [4]:
# Locate missing values: isna() (an alias of isnull()) returns a boolean
# frame that is True wherever a value is missing.
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,True,False,True,False
2,False,False,False,True


In [5]:
# Count the missing values in each column (sum the boolean mask down the rows).
df.isna().sum(axis=0)

A    1
B    0
C    1
D    1
dtype: int64

# 2 處理遺失值

## 2-1 刪除

In [6]:
# dropna axis semantics: axis=0 (the default) drops ROWS that contain a
# missing value; axis=1 drops COLUMNS that contain one.
# how='any' (the default) drops a row as soon as it has a single missing value.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [7]:
# Keep only the rows that have at least 3 non-missing values;
# rows with fewer than 3 are dropped.
df.dropna(axis=0, thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [8]:
# Drop the rows whose value in column 'C' is missing; other columns are
# not considered.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


## 2-2 插補法

* 平均數插補 (strategy = 'mean')
* 中位數插補 (strategy = 'median')
* 眾數插補 (strategy = 'most_frequent')：類別變數常用

In [19]:
# Method 1: fill the missing entries of column 'A' with that column's mean.
a_mean = df['A'].mean()
df['A'] = df['A'].fillna(a_mean)

In [21]:
# Method 2: impute with scikit-learn.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer, which always imputes per
# column (the old axis=0 behaviour) and takes np.nan as the missing marker
# instead of the string 'NaN'.
from sklearn.impute import SimpleImputer

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns each column's mean and fills the gaps in one step;
# the result is a NumPy array, not a DataFrame.
imr_df = imr.fit_transform(df)
imr_df

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5.5,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

## 2-3 類別變數處理

In [10]:
# Small example frame with a nominal column (color), an ordinal column
# (size), a numeric column (price) and a class label.
clothes = pd.DataFrame(
    data=[
        ['G', 'M', 10.1, 'class1'],
        ['R', 'L', 13.5, 'class2'],
        ['B', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
clothes

Unnamed: 0,color,size,price,classlabel
0,G,M,10.1,class1
1,R,L,13.5,class2
2,B,XL,15.3,class1


In [11]:
# Ordinal encoding: replace each size label with an integer that preserves
# the ordering M < L < XL.
size_mapping = dict(XL=3, L=2, M=1)
encoded_size = clothes['size'].map(size_mapping)
clothes['size'] = encoded_size
clothes

Unnamed: 0,color,size,price,classlabel
0,G,1,10.1,class1
1,R,2,13.5,class2
2,B,3,15.3,class1


In [12]:
# Reverse lookup: swap keys and values of size_mapping to translate the
# integer codes back into the original size labels.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
clothes['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

## 2-4 名目變數處理(LabelEncoder、OneHotEncoder)

In [13]:
# Nominal encoding by hand: number the sorted unique class labels 0, 1, ...
# and replace each label with its number.
class_labels = np.unique(clothes['classlabel'])
class_mapping = dict(zip(class_labels, range(len(class_labels))))
clothes['classlabel'] = clothes['classlabel'].map(class_mapping)
clothes

Unnamed: 0,color,size,price,classlabel
0,G,1,10.1,0
1,R,2,13.5,1
2,B,3,15.3,0


In [14]:
# Nominal variable conversion with scikit-learn's LabelEncoder.
# This cell is intentionally left disabled (every line is commented out).
# from sklearn.preprocessing import LabelEncoder
# class_le = LabelEncoder()
# y = class_le.fit_transform(clothes['classlabel'].values)
# y
# Reverse lookup: recover the original labels from the encoded values.
# class_le.inverse_transform(y)

In [15]:
# Text categories must be turned into numbers first.
from sklearn.preprocessing import LabelEncoder

color_le = LabelEncoder()
# Take the first three columns (color, size, price) as a NumPy array and
# overwrite the color column with its integer codes.
xx = clothes.iloc[:, :3].to_numpy()
color_codes = color_le.fit_transform(xx[:, 0])
xx[:, 0] = color_codes
xx

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [16]:
# Nominal variable conversion with one-hot encoding.
# OneHotEncoder's categorical_features parameter was removed in
# scikit-learn 0.22; selecting which columns to encode is now done with
# ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ohe = ColumnTransformer(
    transformers=[('color', OneHotEncoder(), [0])],  # one-hot encode column 0 only
    remainder='passthrough',  # keep the remaining columns, appended after the encoded ones
    sparse_threshold=0,       # always return a dense array
)
# xx has dtype=object, so cast the stacked result to a float array.
ohe.fit_transform(xx).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [17]:
# pandas.get_dummies is a quick way to one-hot encode the string columns;
# drop_first=True drops the first level of each encoded column, giving
# dummy variables instead of a full one-hot set.
features = clothes.loc[:, ['price', 'color', 'size']]
pd.get_dummies(data=features, drop_first=True)

Unnamed: 0,price,size,color_G,color_R
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0
