## Data Preprocessing

In [1]:
import pandas as pd
from io import StringIO

In [2]:
# Small in-memory CSV used to demonstrate missing-value handling.
# The u-prefix makes this a text (unicode) string on both Python 2 and
# Python 3, which is what io.StringIO expects; it replaces the former
# unicode(csv_data) call, a builtin that does not exist on Python 3.
# Note: the header names four columns but every data row carries five
# fields, so read_csv ends up using the first field of each row as the index.
csv_data = u'''A,B,C,D 
              1,2,3,4,
              5,6,7,8,
              9,10,11,12,13'''

In [3]:
# Parse the CSV text into a DataFrame; StringIO wraps the string in a
# file-like object so read_csv can consume it without touching disk.
df = pd.read_csv(StringIO(csv_data))

In [4]:
df

Unnamed: 0,A,B,C,D
1,2,3,4,
5,6,7,8,
9,10,11,12,13.0


In [5]:
# Count the missing (NaN) entries in each column.
df.isnull().sum()

A     0
B     0
C     0
D     2
dtype: int64

In [7]:
# Replace every NaN with the mean of the remaining values in the same ROW:
# axis=1 imputes along rows (axis=0 would use the column means instead).
# NOTE(review): sklearn.preprocessing.Imputer was deprecated in
# scikit-learn 0.20 and removed in 0.22. Its replacement,
# sklearn.impute.SimpleImputer, has no `axis` argument and only imputes
# column-wise, so migrating this cell means imputing on the transposed data.
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=1)
# fit_transform() fits and transforms in one step, so the separate
# fit() call that used to precede it was redundant and has been dropped.
imputed_data = imputer.fit_transform(df.values)
imputed_data

array([[ 2.,  3.,  4.,  3.],
       [ 6.,  7.,  8.,  7.],
       [10., 11., 12., 13.]])

In [8]:
# Toy data set mixing a nominal feature (color), an ordinal feature (size),
# a price stored as text, and a class label. Column names are supplied
# directly to the constructor.
df = pd.DataFrame(
    [
        ['green', 'M', '10', 'class1'],
        ['red', 'S', '13', 'class2'],
        ['blue', 'XL', '12', 'class1'],
    ],
    columns=['color', 'size', 'price', 'class label'],
)

In [9]:
df

Unnamed: 0,color,size,price,class label
0,green,M,10,class1
1,red,S,13,class2
2,blue,XL,12,class1


In [11]:
# Ordinal codes for the size feature: S < M < XL.
size_mapping = dict(M=2, S=1, XL=3)

In [12]:
# Replace the size strings with their ordinal codes from size_mapping.
# NOTE(review): the column is overwritten in place, so this cell is not
# idempotent -- on a second run the integer codes are looked up in
# size_mapping, nothing matches, and the column becomes NaN. Re-create df
# before re-running.
df['size'] = df['size'].map(size_mapping)

In [13]:
df

Unnamed: 0,color,size,price,class label
0,green,2,10,class1
1,red,1,13,class2
2,blue,3,12,class1


## Encoding Class Label

In [14]:
import numpy as np

# Give each distinct class label a consecutive integer id, numbering the
# labels in the order np.unique returns them.
class_mapping = dict(
    (label, idx)
    for idx, label in enumerate(np.unique(df['class label']))
)
class_mapping

{'class1': 0, 'class2': 1}

In [15]:
# Swap the class-label strings for their integer ids from class_mapping.
# NOTE(review): overwrites the column in place; running this cell a second
# time would map the integers through class_mapping again and produce NaN.
df['class label'] = df['class label'].map(class_mapping)
df

Unnamed: 0,color,size,price,class label
0,green,2,10,0
1,red,1,13,1
2,blue,3,12,0


In [16]:
# Encode the class labels as integers with scikit-learn's LabelEncoder.
# NOTE(review): by this point the column appears to hold the 0/1 codes
# already (it was mapped through class_mapping earlier), so the encoder
# returns the same values it is given -- confirm that is intended.
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['class label'].values)

In [17]:
y

array([0, 1, 0])

## One-hot Encoding

In [19]:
# Build the feature matrix from the three feature columns.
X = df[['color','size','price']].values
# class_le is re-bound to a fresh encoder here, so the encoder previously
# stored under this name is discarded.
class_le = LabelEncoder()
# Overwrite column 0 (color) with integer codes; the other columns are
# left as they are (price is still a string, as the displayed array shows).
X[:,0] = class_le.fit_transform(X[:,0])
X

array([[1, 2, '10'],
       [2, 1, '13'],
       [0, 3, '12']], dtype=object)

In [20]:
# One-hot encode only column 0 (the integer-coded color). In the displayed
# result the three dummy columns come first, followed by size and price.
# NOTE(review): the `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22 (ColumnTransformer is the documented
# replacement) -- confirm the scikit-learn version this notebook is pinned to.
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder(categorical_features=[0])
# toarray() converts the encoder output to a dense array for display.
one_hot.fit_transform(X).toarray()

array([[ 0.,  1.,  0.,  2., 10.],
       [ 0.,  0.,  1.,  1., 13.],
       [ 1.,  0.,  0.,  3., 12.]])

In [23]:
# pandas alternative to OneHotEncoder: the string columns (price, color) are
# expanded into indicator columns, while the numeric size column is kept
# as-is (see the displayed result).
pd.get_dummies(df[['price','color','size']])

Unnamed: 0,size,price_10,price_12,price_13,color_blue,color_green,color_red
0,2,1,0,0,0,1,0
1,1,0,0,1,0,0,1
2,3,0,1,0,1,0,0


In [27]:
# drop_first=True omits the first level of each encoded column (color_blue
# and price_10 are absent from the result), removing the redundant indicator.
pd.get_dummies(df[['color','size','price']], drop_first=True)

Unnamed: 0,size,color_green,color_red,price_12,price_13
0,2,1,0,0,0
1,1,0,1,0,1
2,3,0,0,1,0


In [28]:
# Load the California cities data set from a path relative to the notebook.
# NOTE(review): the preview contains an 'Unnamed: 0' column, which suggests
# the CSV was saved with its index; reading with index_col=0 would avoid the
# duplicate -- confirm before changing, since later cells may rely on it.
data = pd.read_csv('./data/california_cities.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,city,latd,longd,elevation_m,elevation_ft,population_total,area_total_sq_mi,area_land_sq_mi,area_water_sq_mi,area_total_km2,area_land_km2,area_water_km2,area_water_percent
0,0,Adelanto,34.576111,-117.432778,875.0,2871.0,31765,56.027,56.009,0.018,145.107,145.062,0.046,0.03
1,1,AgouraHills,34.153333,-118.761667,281.0,922.0,20330,7.822,7.793,0.029,20.26,20.184,0.076,0.37
2,2,Alameda,37.756111,-122.274444,,33.0,75467,22.96,10.611,12.349,59.465,27.482,31.983,53.79
3,3,Albany,37.886944,-122.297778,,43.0,18969,5.465,1.788,3.677,14.155,4.632,9.524,67.28
4,4,Alhambra,34.081944,-118.135,150.0,492.0,83089,7.632,7.631,0.001,19.766,19.763,0.003,0.01


In [42]:
# Download the UCI Wine data set. header=None stops pandas from treating the
# first row as column names; the names are assigned separately.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/'
    'ml/machine-learning-databases/'
    'wine/wine.data',
    header=None
)

In [43]:
# Name the 14 columns: the class label first, then the 13 features.
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [45]:
# Format everything into one string before printing: print('a', b) prints a
# tuple on Python 2 (as the recorded output shows), whereas a single
# formatted argument prints identically on Python 2 and Python 3.
print('Class Labels {}'.format(np.unique(df_wine['Class label'])))

('Class Labels', array([1, 2, 3]))


In [46]:
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [47]:
from sklearn.model_selection import train_test_split

# Features are every column after the first; the target is the first column.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# Hold out 30% of the rows for testing. stratify=y keeps the class
# proportions the same in both splits; random_state=0 fixes the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=0
)

## Feature Scaling

In [49]:
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
# Learn the per-feature minimum and maximum from the training split only.
x_train_norm = min_max.fit_transform(xtrain)
# Apply the training-set statistics to the test split. Calling fit_transform
# here would re-fit the scaler on the test data, scaling the two splits
# differently and leaking test-set information into preprocessing.
x_test_norm = min_max.transform(xtest)

In [53]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
# Estimate the per-feature mean and standard deviation on the training split.
x_train_std = std.fit_transform(xtrain)
# Standardize the test split with the training-set mean/std. Re-fitting on
# xtest (fit_transform) would scale the two splits inconsistently and leak
# test-set statistics into preprocessing.
x_test_std = std.transform(xtest)

## Feature Selection: Regularization and Dimensionality Reduction