## Data Preprocessing

In [1]:
import pandas as pd 
from io import StringIO

csv_data = '''A, B, C, D
              1.0, 2.0, 3.0,
              5.0,,7.0,8.0
              9.0,,11.0,12.0'''

In [2]:
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,5.0,,7.0,8.0
2,9.0,,11.0,12.0


In [3]:
df.isnull().sum()

A     0
 B    2
 C    0
 D    1
dtype: int64

In [4]:
dfn = df


### Missing Values

In [5]:
# drop rows with missing values
dfn.dropna(axis=0)

Unnamed: 0,A,B,C,D


In [6]:
# drop column with at least on row with Nan value
df.dropna(axis=1)

Unnamed: 0,A,C
0,1.0,3.0
1,5.0,7.0
2,9.0,11.0


In [7]:
# drop rows where all columns are Nan
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,5.0,,7.0,8.0
2,9.0,,11.0,12.0


In [8]:
# drop rows with threshold
df.dropna(thresh=1)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,5.0,,7.0,8.0
2,9.0,,11.0,12.0


### Imputing Values

In [9]:
from sklearn.impute import SimpleImputer
import numpy as np
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(df.values)
imputed_data = imp.transform(df.values)
imputed_data

array([[ 1.,  2.,  3., 10.],
       [ 5.,  2.,  7.,  8.],
       [ 9.,  2., 11., 12.]])

In [10]:
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,10.0
1,5.0,2.0,7.0,8.0
2,9.0,2.0,11.0,12.0


### Handling Categorical Data

In [11]:
# categorical data encoding
df = pd.DataFrame([
    ['Red','M','599','class2',],
    ['Green','L','600','class1'],
    ['Blue','S','499','class2']
])
df.columns = ['color','size','price','classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,Red,M,599,class2
1,Green,L,600,class1
2,Blue,S,499,class2


In [12]:
# mapping ordinal feature
size_mapping = {'L': 3,
                'M': 2,
                'S': 1}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,Red,2,599,class2
1,Green,3,600,class1
2,Blue,1,499,class2


In [13]:
# encoding class labels
class_mapping = {label: idx for idx,label in enumerate(np.unique(df['classlabel']))}
class_mapping

{'class1': 0, 'class2': 1}

In [14]:
df['classlabel'] = df['classlabel'].map(class_mapping)
df['classlabel']

0    1
1    0
2    1
Name: classlabel, dtype: int64

In [15]:
df

Unnamed: 0,color,size,price,classlabel
0,Red,2,599,1
1,Green,3,600,0
2,Blue,1,499,1


In [16]:
# using Scikit-learn
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(df['classlabel'].values)
y

array([1, 0, 1])

In [17]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

X = df[['color','size','price']].values
one_hot = OneHotEncoder()
one_hot.fit_transform(X[:,0].reshape(1,-1)).toarray()

array([[1., 1., 1.]])

### Dataset Partioning

In [20]:
df = pd.read_csv('./data/wine.data', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [21]:
df.columns = ['Class label','Alcohol',
              'Malic acid','Ash',
              'Alcalinity of ash', 'Magnesium',
              'Total Phenol', 'Flavanoids',
              'Nonflavanoid Phenols', 'Proanthocyanins',
              'Color intensity','Hue',
              'OD280/OD315 Dialuted Wines',
              'Proline']

In [22]:
print('Class labels', np.unique(df['Class label']))

Class labels [1 2 3]


In [23]:
df.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total Phenol,Flavanoids,Nonflavanoid Phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 Dialuted Wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [25]:
from sklearn.model_selection import train_test_split
X, y = df.iloc[:, 1:].values, df.iloc[:,0].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1,
                                                    stratify=y)

### Feature Scaling

In [26]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.fit_transform(X_test)

In [27]:
# Standardization
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.fit_transform(X_test)