# Data Preprocessing

## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

## Importing Dataset

In [2]:
df = pd.read_csv('data.csv')

In [3]:
# examine dataset 
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# independent variable: matrix of feature 
X = df.iloc[:, :-1]
# dependent variable: matrix of target  
y = df.iloc[:, -1]

In [5]:
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [6]:
y 

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

## Handling Missing Data 

In [7]:
# check missing data 
df.isnull().sum() 

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [8]:
# mean salary
X.Salary.mean()

63777.77777777778

In [9]:
# mean age 
X.Age.mean() 

38.77777777777778

In [10]:
# meadin salary 
X.Salary.median() 

61000.0

In [11]:
# median age 
X.Age.median()

38.0

In [12]:
# impute with mean  
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, 
                        strategy='mean')
imputer.fit(X.iloc[:, 1:3])
X.iloc[:, 1:3] = imputer.transform(X.iloc[:, 1:3]) 

In [None]:
# impute with median 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, 
                        strategy='median')
imputer.fit(X.iloc[:, 1:3])
X.iloc[:, 1:3] = imputer.transform(X.iloc[:, 1:3]) 

In [13]:
X 

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


## Encoding Categorical Data 

In [7]:
# encode the independent variable 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X)) 

In [8]:
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 4.4e+01, 7.2e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+01, 4.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 6.1e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.0e+01,     nan],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.5e+01, 5.8e+04],
       [0.0e+00, 0.0e+00, 1.0e+00,     nan, 5.2e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.8e+01, 7.9e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04]])

In [10]:
# encoding dependent variable 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() 
y = le.fit_transform(y)

In [11]:
y 

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting Data

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [13]:
X_train

array([[0.0e+00, 0.0e+00, 1.0e+00,     nan, 5.2e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.0e+01,     nan],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.4e+01, 7.2e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 6.1e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+01, 4.8e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.8e+01, 7.9e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.5e+01, 5.8e+04]])

In [14]:
X_test

array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04]])

In [15]:
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [16]:
y_test

array([0, 1])

## Feature Scaling

In [18]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler() 
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.fit_transform(X_test[:, 3:])

In [19]:
X_train

array([[ 0.        ,  0.        ,  1.        ,         nan, -1.018224  ],
       [ 0.        ,  1.        ,  0.        , -0.03891021,         nan],
       [ 1.        ,  0.        ,  0.        ,  0.50583275,  0.58347667],
       [ 0.        ,  0.        ,  1.        , -0.31128169, -0.2974587 ],
       [ 0.        ,  0.        ,  1.        , -1.80932482, -1.33856413],
       [ 1.        ,  0.        ,  0.        ,  1.0505757 ,  1.1440719 ],
       [ 0.        ,  1.        ,  0.        ,  1.32294718,  1.46441204],
       [ 1.        ,  0.        ,  0.        , -0.71983891, -0.5377138 ]])

In [20]:
X_test

array([[ 0.,  1.,  0., -1., -1.],
       [ 1.,  0.,  0.,  1.,  1.]])