In [79]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Read the raw data from the relative path 'Data.csv' into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='Data.csv')

In [80]:
# Preview the first five rows to sanity-check the load.
dataset.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Separating X (features) and y (target variable) from the dataset

In [81]:
# Features: every column except the last one (Country, Age, Salary).
X = dataset.iloc[:, :-1].values
# Target: the last column (Purchased). Selecting it with -1 instead of a
# hard-coded 3 keeps this slice consistent with the feature slice above.
y = dataset.iloc[:, -1].values

## Dealing with missing data

In [82]:
# Replace missing values with the mean of their column.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `SimpleImputer` is its replacement. It always works column-wise (the old
# `axis=0` behaviour) and expects the missing-value marker as `np.nan`
# rather than the string 'NaN'.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [83]:
# Learn the column means from Age and Salary (columns 1-2), then write the
# imputed values back into those same columns of X.
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [84]:
# Check the result: the previously missing Age and Salary entries now hold
# the mean of their column.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding Categorical Data

In [85]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
# `OneHotEncoder(categorical_features=[0])` was removed in scikit-learn 0.22.
# A ColumnTransformer expresses the same intent: one-hot encode column 0 and
# pass the remaining columns (Age, Salary) through unchanged, placed after the
# dummy columns.
# sparse_threshold=1.0 keeps the combined result a sparse matrix, so code that
# calls `.toarray()` on `fit_transform(X)` continues to work.
onehot_enconder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=1.0,
)

#### 1. Encode each country as an integer label

In [86]:
# Learn the set of country names in column 0, then overwrite that column
# with their integer codes.
label_encoder.fit(X[:, 0])
X[:, 0] = label_encoder.transform(X[:, 0])

#### 2. One-hot encode the country codes so the ML algorithm does not assume one country has a greater value than another. A column is created for each country, holding 0 or 1 (dummy encoding).

In [87]:
# Expand the country column into dummy columns, then turn the encoder's
# result into a regular dense NumPy array.
X = onehot_enconder.fit_transform(X)
X = X.toarray()

In [88]:
# Show the encoded feature matrix with readable column names:
# three country dummy columns followed by Age and Salary.
pd.DataFrame(
    X,
    columns=['France', 'Germany', 'Spain', 'Age', 'Salary'],
)

Unnamed: 0,France,Germany,Spain,Age,Salary
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


#### 3. Encode y (the target) as integer labels

In [89]:
# Encode the target labels as integers with a fresh LabelEncoder
# ("No" -> 0, "Yes" -> 1, since classes are sorted).
# The OneHotEncoder(categorical_features=[0]) that used to be re-created here
# was removed: it was never used afterwards, and that constructor argument no
# longer exists in scikit-learn >= 0.22, so the line would only raise.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [91]:
# Show the encoded target as a single-column table.
pd.DataFrame(y, columns=['No(0)/Yes(1)'])

Unnamed: 0,No(0)/Yes(1)
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


## Splitting the Dataset into Training and Test Sets

In [98]:
from sklearn.model_selection import train_test_split

In [104]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [109]:
def show_X_y():
    """Print the current train/test features and targets as labelled tables.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` variables and prints each one as a DataFrame under its own
    heading. Returns nothing.
    """
    feature_columns = ['France', 'Germany', 'Spain', 'Age', 'Salary']
    target_columns = ['No(0)/Yes(1)']
    sections = [
        ('X Train:', X_train, feature_columns),
        ('\nX Test:', X_test, feature_columns),
        ('\nY Train:', y_train, target_columns),
        # Fixed: this heading used to read 'X Test:' although it labels y_test.
        ('\nY Test:', y_test, target_columns),
    ]
    for heading, values, columns in sections:
        print(heading)
        print(pd.DataFrame(data=values, columns=columns))

show_X_y()

X Train:
   France   Germany     Spain       Age    Salary
0    -1.0  2.645751 -0.774597  0.263068  0.123815
1     1.0 -0.377964 -0.774597 -0.253501  0.461756
2    -1.0 -0.377964  1.290994 -1.975398 -1.530933
3    -1.0 -0.377964  1.290994  0.052614 -1.111420
4     1.0 -0.377964 -0.774597  1.640585  1.720297
5    -1.0 -0.377964  1.290994 -0.081312 -0.167514
6     1.0 -0.377964 -0.774597  0.951826  0.986148
7     1.0 -0.377964 -0.774597 -0.597881 -0.482149

X Test:
   France   Germany     Spain       Age    Salary
0    -1.0  2.645751 -0.774597 -1.458829 -0.901663
1    -1.0  2.645751 -0.774597  1.984964  2.139811

Y Train:
   No(0)/Yes(1)
0             1
1             1
2             1
3             0
4             1
5             0
6             0
7             1

X Test:
   No(0)/Yes(1)
0             0
1             0


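## Feature Scaling

Note: the execution counts show that the scaling cells below (In [106] and In [107]) were run before the display cell above (In [109]), so the output printed above already contains the standardized values and is identical to the output after scaling. Restart the kernel and run all cells in order to see the unscaled split first.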

In [106]:
from sklearn.preprocessing import StandardScaler
# Standardizing scaler for the feature matrix, with its default settings
# spelled out explicitly.
std_scaler_X = StandardScaler(copy=True, with_mean=True, with_std=True)

In [107]:
# Fit the scaler on the training rows only, then apply that same fitted
# scaling to both splits; the test set is transformed but never fitted on.
std_scaler_X.fit(X_train)
X_train = std_scaler_X.transform(X_train)
X_test = std_scaler_X.transform(X_test)

In [110]:
# Print the train/test splits again, now that the features have been scaled.
show_X_y()

X Train:
   France   Germany     Spain       Age    Salary
0    -1.0  2.645751 -0.774597  0.263068  0.123815
1     1.0 -0.377964 -0.774597 -0.253501  0.461756
2    -1.0 -0.377964  1.290994 -1.975398 -1.530933
3    -1.0 -0.377964  1.290994  0.052614 -1.111420
4     1.0 -0.377964 -0.774597  1.640585  1.720297
5    -1.0 -0.377964  1.290994 -0.081312 -0.167514
6     1.0 -0.377964 -0.774597  0.951826  0.986148
7     1.0 -0.377964 -0.774597 -0.597881 -0.482149

X Test:
   France   Germany     Spain       Age    Salary
0    -1.0  2.645751 -0.774597 -1.458829 -0.901663
1    -1.0  2.645751 -0.774597  1.984964  2.139811

Y Train:
   No(0)/Yes(1)
0             1
1             1
2             1
3             0
4             1
5             0
6             0
7             1

X Test:
   No(0)/Yes(1)
0             0
1             0
