# Dealing with categorical data

In [1]:
import pandas as pd
import numpy as np
# Python code for binarization 
from sklearn.preprocessing import Binarizer  
# Column labels for the CSV, listed in file order (the file itself carries no header row).
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
# Read the table with the labels above, then keep its values as a NumPy array
# for the scikit-learn steps that follow.
dataframe = pd.read_csv("pima-indians-diabetes.csv", header=None, names=names)
array = dataframe.values

In [9]:
# Preview the first rows of the loaded table.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Show the NumPy array taken from the dataframe's values.
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [11]:
# Separate the array into input features (columns 0-7) and the output column (column 8).
X = array[:,0:8]
Y = array[:,8]
# Binarize the data: values strictly greater than the threshold become 1, all others 0.
# The single threshold of 7.0 is applied to every feature column alike.
binarizer = Binarizer(threshold=7.0).fit(X)
binaryX = binarizer.transform(X)

In [12]:
# Show the binarized feature matrix.
binaryX

array([[0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       ...,
       [0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 1., 0., 1.],
       [0., 1., 1., ..., 1., 0., 1.]])

In [13]:
# Print NumPy floats with three decimals, then look at the first five binarized rows.
np.set_printoptions(precision=3)
binaryX[:5]

array([[0., 1., 1., 1., 0., 1., 0., 1.],
       [0., 1., 1., 1., 0., 1., 0., 1.],
       [1., 1., 1., 0., 0., 1., 0., 1.],
       [0., 1., 1., 1., 1., 1., 0., 1.],
       [0., 1., 1., 1., 1., 1., 0., 1.]])

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 and pass the remaining columns through unchanged.
# The transformer is fitted on a fresh slice of `array` instead of on `X` itself,
# so re-running this cell never re-encodes an already encoded matrix.
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],   # The column numbers to be transformed (here [0], but could be e.g. [0, 1, 3])
    remainder='passthrough'                                         # Leave the rest of the columns untouched
)

X = ct.fit_transform(array[:, 0:8])
X

array([[ 0.   ,  0.   ,  0.   , ..., 33.6  ,  0.627, 50.   ],
       [ 0.   ,  1.   ,  0.   , ..., 26.6  ,  0.351, 31.   ],
       [ 0.   ,  0.   ,  0.   , ..., 23.3  ,  0.672, 32.   ],
       ...,
       [ 0.   ,  0.   ,  0.   , ..., 26.2  ,  0.245, 30.   ],
       [ 0.   ,  1.   ,  0.   , ..., 30.1  ,  0.349, 47.   ],
       [ 0.   ,  1.   ,  0.   , ..., 30.4  ,  0.315, 23.   ]])

##### Convert categorical variable into dummy/indicator variables

In [17]:
# Small example frame: one text column (Gender) and two numeric columns.
data = pd.DataFrame({
    'Gender': ['Male', 'Female', 'Male', 'Male'],
    'Age': [20, 21, 19, 18],
    'Salary': [600, 700, 500, 500],
})
data

Unnamed: 0,Gender,Age,Salary
0,Male,20,600
1,Female,21,700
2,Male,19,500
3,Male,18,500


In [18]:
# Replace each text column with one indicator column per category.
# dtype=int keeps the indicators as 0/1 integers; pandas 2.0+ would otherwise return booleans.
pd.get_dummies(data, prefix_sep='_', drop_first=False, dtype=int)

Unnamed: 0,Age,Salary,Gender_Female,Gender_Male
0,20,600,0,1
1,21,700,1,0
2,19,500,0,1
3,18,500,0,1


In [19]:
# Same encoding, but drop_first=True omits the first category of each encoded column,
# leaving k-1 indicator columns for k categories.
# dtype=int keeps the indicators as 0/1 integers; pandas 2.0+ would otherwise return booleans.
pd.get_dummies(data, prefix_sep='_', drop_first=True, dtype=int)

Unnamed: 0,Age,Salary,Gender_Male
0,20,600,1
1,21,700,0
2,19,500,1
3,18,500,1


# StandardScaler

In [15]:

# Python code to Standardize data (0 mean, 1 stdev) 
from sklearn.preprocessing import StandardScaler 
# Column labels for the CSV, listed in file order (the file itself carries no header row).
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
# Re-read the table so this section starts from the raw, untransformed values.
dataframe = pd.read_csv("pima-indians-diabetes.csv", header=None, names=names)
array = dataframe.values

In [16]:
# Split the array into the eight feature columns (X) and the output column (Y).
X, Y = array[:, :8], array[:, 8]
# Standardize the features: remove each column's mean and scale it to unit variance.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X)

In [17]:
# Show the standardized feature matrix.
rescaledX

array([[ 0.64 ,  0.848,  0.15 , ...,  0.204,  0.468,  1.426],
       [-0.845, -1.123, -0.161, ..., -0.684, -0.365, -0.191],
       [ 1.234,  1.944, -0.264, ..., -1.103,  0.604, -0.106],
       ...,
       [ 0.343,  0.003,  0.15 , ..., -0.735, -0.685, -0.276],
       [-0.845,  0.16 , -0.471, ..., -0.24 , -0.371,  1.171],
       [-0.845, -0.873,  0.046, ..., -0.202, -0.474, -0.871]])

In [18]:
# Print NumPy floats with three decimals, then look at the first five standardized rows.
np.set_printoptions(precision=3)
rescaledX[:5]

array([[ 0.64 ,  0.848,  0.15 ,  0.907, -0.693,  0.204,  0.468,  1.426],
       [-0.845, -1.123, -0.161,  0.531, -0.693, -0.684, -0.365, -0.191],
       [ 1.234,  1.944, -0.264, -1.288, -0.693, -1.103,  0.604, -0.106],
       [-0.845, -0.998, -0.161,  0.155,  0.123, -0.494, -0.921, -1.042],
       [-1.142,  0.504, -1.505,  0.907,  0.766,  1.41 ,  5.485, -0.02 ]])

In [19]:
# Preview the first rows of the unscaled dataframe, for comparison with the standardized values.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
