### Data encoding
- label encoding

In [1]:
from sklearn.preprocessing import LabelEncoder
# Sample categorical values (some categories appear more than once)
items = ['TV','Refrigerator','Microwave','Computer','Fan','Fan','Mixer','Mixer']
# Create the LabelEncoder, then learn the categories and encode them in a
# single step (fit_transform == fit followed by transform on the same data)
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
print('encoded value:', labels)

encoded value: [5 4 2 0 1 1 3 3]


In [3]:
# classes_ lists the learned categories; the position of each category in this
# array is the integer code the encoder assigns to it
fitted_classes = encoder.classes_
print('encoding classes:', fitted_classes)

encoding classes: ['Computer' 'Fan' 'Microwave' 'Mixer' 'Refrigerator' 'TV']


In [4]:
# Map integer codes back to the original category strings
encoded_codes = [5, 4, 2, 0, 1, 1, 3, 3]
decoded_items = encoder.inverse_transform(encoded_codes)
print('decoded original:', decoded_items)

decoded original: ['TV' 'Refrigerator' 'Microwave' 'Computer' 'Fan' 'Fan' 'Mixer' 'Mixer']


In [5]:
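# Label encoding is suitable for tree-based algorithms.
# It is not suitable for regression-type models (e.g. linear regression), because
# the integer codes would be interpreted as having an order or magnitude.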

In [11]:
# OneHotEncoder expects a 2D array of shape (n_samples, n_features).
# Here the categories are first converted to integer codes with LabelEncoder.
# NOTE(review): scikit-learn >= 0.20 also accepts string categories directly,
# so the LabelEncoder step may be unnecessary - confirm the installed version.
from sklearn.preprocessing import OneHotEncoder
import numpy as np
encoder = LabelEncoder()
# Encode to integer codes and reshape to a single-column 2D array in one go
labels = encoder.fit_transform(items).reshape(-1, 1)
oh_encoder = OneHotEncoder()
# The result is converted with .toarray() below to display it as a dense array
oh_labels = oh_encoder.fit_transform(labels)
print('one-hot encoding data')
print(oh_labels.toarray())
print('one-hot encoding data shape')
print(oh_labels.shape)

one-hot encoding data
[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]
one-hot encoding data shape
(8, 6)


In [18]:
import pandas as pd
# One-hot encode with pandas: get_dummies works on string columns directly, so
# no prior label encoding or reshaping is needed.
df = pd.DataFrame({'item':['TV','Refrigerator','Microwave','Computer','Fan','Fan','Mixer','Mixer']})
# dtype=int keeps the indicator columns as 0/1 integers on every pandas version
# (pandas >= 2.0 returns boolean True/False columns by default).
item_dummies = pd.get_dummies(df, dtype=int)
item_dummies

Unnamed: 0,item_Computer,item_Fan,item_Microwave,item_Mixer,item_Refrigerator,item_TV
0,0,0,0,0,0,1
1,0,0,0,0,1,0
2,0,0,1,0,0,0
3,1,0,0,0,0,0
4,0,1,0,0,0,0
5,0,1,0,0,0,0
6,0,0,0,1,0,0
7,0,0,0,1,0,0


### Feature scaling and standardization
- StandardScaler

In [21]:
from sklearn.datasets import load_iris
import pandas as pd
# Load the iris dataset and wrap its feature matrix in a DataFrame so the
# per-feature statistics are labelled with the feature names
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(data=iris_data, columns=iris.feature_names)

# Per-feature mean and variance before any scaling
feature_means = iris_df.mean()
feature_variances = iris_df.var()
print('mean of features')
print(feature_means)
print('\nvariance of features')
print(feature_variances)

mean of features
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64

variance of features
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [22]:
from sklearn.preprocessing import StandardScaler

# Create the scaler, then learn the per-feature statistics and standardize the
# data in one step (fit_transform == fit followed by transform on the same data)
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)
# The scaler returns an ndarray; wrap it in a DataFrame to restore column names
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
scaled_means = iris_df_scaled.mean()
# DataFrame.var() defaults to the sample variance (ddof=1) while StandardScaler
# standardizes with the population standard deviation (ddof=0), so the value
# printed here is n/(n-1) rather than exactly 1.
scaled_variances = iris_df_scaled.var()
print('mean of features')
print(scaled_means)
print('\nvariance of features')
print(scaled_variances)

mean of features
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

variance of features
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


- MinMaxScaler

In [23]:
from sklearn.preprocessing import MinMaxScaler
# Create the MinMaxScaler and rescale every feature in one step
# (fit_transform == fit followed by transform on the same data)
scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_df)
# The scaler returns an ndarray; wrap it in a DataFrame to restore column names
iris_df_scaled = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
scaled_minimums = iris_df_scaled.min()
scaled_maximums = iris_df_scaled.max()
print('minimum value of features')
print(scaled_minimums)
print('\nmaximum value of features')
print(scaled_maximums)

minimum value of features
sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64

maximum value of features
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


- Caution when using a scaler: transform the test data with the same scaler object that was already fitted on the training data (do not re-fit it on the test data)

In [24]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
# Single-column 2D arrays (n_samples, 1): training values 0..10, test values 0..5
train_array = np.arange(11)[:, np.newaxis]
test_array = np.arange(6)[:, np.newaxis]

In [34]:
scaler = MinMaxScaler()
# Fitting on train_array (values 0..10) learns min=0 and max=10, so the
# transform maps each value to value/10
train_scaled = scaler.fit_transform(train_array)
train_original_flat = train_array.reshape(-1)
train_scaled_flat = train_scaled.reshape(-1)
print('original:', np.round(train_original_flat, 2))
print('scaled train_array:', np.round(train_scaled_flat, 2))

original: [ 0  1  2  3  4  5  6  7  8  9 10]
scaled train_array: [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [35]:
# Wrong usage (X): the scaler is fitted again on the test data, which replaces
# the min/max learned from the training data. The test values (0..5) are then
# scaled by 1/5 instead of the 1/10 used for the training data.
test_scaled = scaler.fit(test_array).transform(test_array)
test_original_flat = test_array.reshape(-1)
test_scaled_flat = test_scaled.reshape(-1)
print('original:', np.round(test_original_flat, 2))
print('scaled test_array:', np.round(test_scaled_flat, 2))

original: [0 1 2 3 4 5]
scaled test_array: [0.  0.2 0.4 0.6 0.8 1. ]


In [37]:
# Correct usage: fit the scaler once on the training data only, then reuse that
# same fitted scaler to transform both the training and the test data
scaler = MinMaxScaler().fit(train_array)
train_scaled = scaler.transform(train_array)
test_scaled = scaler.transform(test_array)

train_original_flat, train_scaled_flat = train_array.reshape(-1), train_scaled.reshape(-1)
test_original_flat, test_scaled_flat = test_array.reshape(-1), test_scaled.reshape(-1)
print('original:', np.round(train_original_flat, 2))
print('scaled train_array:', np.round(train_scaled_flat, 2))
print('original:', np.round(test_original_flat, 2))
print('scaled test_array:', np.round(test_scaled_flat, 2))

original: [ 0  1  2  3  4  5  6  7  8  9 10]
scaled train_array: [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
original: [0 1 2 3 4 5]
scaled test_array: [0.  0.1 0.2 0.3 0.4 0.5]
