# Example 1. Pre-processing of data
---
   - Deal with lost data.
   - Numerate words for training.
   - **One-hot encoder**.

In [1]:
import pandas as pd
import numpy as np
from io import StringIO

### 1. Lost data

In [2]:
csv_data = '''A,B,C,D
1.,2.,3.,4.
5.,6.,,8.
0.,11.,12.,'''
csv_data = unicode(csv_data) # for pyhton 2.7

In [3]:
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [4]:
df.isnull().sum() # check null

A    0
B    0
C    1
D    1
dtype: int64

#### 1.1. Delete sample or feature

In [5]:
df.dropna(axis=0) # check columns and drop rows, default axis=0

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [6]:
df.dropna(axis=1) # check rows and drop columns, default axis=0

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,0.0,11.0


In [7]:
df.dropna(how='all') # only drop rows where all columns are NaN

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [8]:
df.dropna(thresh=4) # drop rows that have not at least 4 non-NaN

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
df.dropna(subset=['C']) # only drop rows where NaN appear in specific columns (here:'C')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


#### 1.2. Interpolation techniques

In [10]:
from sklearn.preprocessing import Imputer

In [11]:
imr = Imputer(missing_values='NaN', strategy='mean', axis=0) # average of columns
imr.fit(df)
imputed_data = imr.transform(df.values)
pd.DataFrame(data=imputed_data, columns={'A','B','C','D'})

Unnamed: 0,A,C,B,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,0.0,11.0,12.0,6.0


### 2. Numerate feature

In [12]:
df = pd.DataFrame([['green','M',  10.1, 'class1'],
                   ['red',  'L',  13.5, 'class2'],
                   ['blue', 'XL', 15.3, 'class1']])
df.columns = ['color', 'size', 'price', 'classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


#### 2.1. Change words to numbers by Numpy

In [13]:
size_mapping = {'XL':3,
                'L' :2,
                'M' :1}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [14]:
class_mapping = {label:idx for idx,label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [15]:
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [16]:
inv_class_mapping = {v:k for k,v in class_mapping.items()}
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


#### 2.2. Change words to numbers by Sklearn

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([0, 1, 0])

In [19]:
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

#### 2.3. One-hot encoding by Sklearn

In [20]:
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
ohe = OneHotEncoder(categorical_features=[0]) 
ohe.fit_transform(X).toarray() # or OneHotEncoder(..., sparse=False)

array([[  0. ,   1. ,   0. ,   1. ,  10.1],
       [  0. ,   0. ,   1. ,   2. ,  13.5],
       [  1. ,   0. ,   0. ,   3. ,  15.3]])

#### 2.4. One-hot encoding by Pandas

In [23]:
pd.get_dummies(df[['price', 'color', 'size']])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0
