In [133]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [134]:
# Load the raw data. 'Data.csv' is a relative path, so it is resolved against
# the current working directory - make sure that is the folder holding the file.

dataset=pd.read_csv('Data.csv')

In [135]:
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [136]:
# Feature matrix: keep every row and drop only the final column, which holds
# the dependent variable.

X = dataset.iloc[:, :-1].values  # iloc[rows, cols]: ':' = all rows, ':-1' = all but the last column
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [137]:
# Dependent-variable vector: all rows of the last column.
# Indexing with -1 (rather than a hard-coded 3) keeps Y complementary to the
# ':-1' slice used for X, so the two never overlap if feature columns are
# added to the CSV. For the current four-column file the result is identical.
Y = dataset.iloc[:, -1].values
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

In [138]:
# missing data tutorial

# can remove the lines with missing data - not best option
# replace missing data with mean of column values

In [139]:
# replace missing data with mean of column values
# import scikitlearn and use imputer class

from sklearn.preprocessing import Imputer

In [140]:
# create object of the Imputer class

imputer=Imputer(missing_values='NaN',strategy='mean',axis=0)

In [141]:
# Learn the per-column means from the numeric columns only: the slice 1:3
# selects columns 1 and 2, leaving out column 0 (the country names).

imputer = imputer.fit(X[:, 1:3])

In [142]:
# Write the imputed values back into the same two columns, so every NaN is
# replaced by the column mean learned during fit.

X[:, 1:3] = imputer.transform(X[:, 1:3])

In [143]:
X # the NaN entries in columns 1 and 2 now hold the respective column means

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [144]:
# encoding Categorical data

# ML models based on mathematical eqns - can't handle categorical variables
# need to encode the cat variables

In [145]:
# import encoder from sklearn

from sklearn.preprocessing import LabelEncoder

In [146]:
# Encoder that maps each distinct label in a column to an integer code.
LE_X = LabelEncoder()

In [147]:
# Preview the integer codes for column 0; X itself is not modified in this cell.
LE_X.fit_transform(X[:,0])

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=int64)

In [148]:
# Fit the encoder on the country column, then overwrite that column with its
# integer codes.
LE_X.fit(X[:, 0])
X[:, 0] = LE_X.transform(X[:, 0])
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [149]:
# problem - eqns are math - therefore a value of 2 will look > value of 1
# implies one country is greater than another!
# need to prevent this - use dummy variables
# 3 columns - 1 for each

In [150]:
# create dummy variables - use another class

from sklearn.preprocessing import OneHotEncoder

In [151]:
ohe=OneHotEncoder(categorical_features=[0])

In [152]:
X=ohe.fit_transform(X).toarray()

In [153]:
# Display the feature matrix after one-hot encoding.
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.70000000e+01,   6.70000000e+04]])

In [154]:
# The purchased column is the target, so a plain integer label encoding is
# enough: the relative size of the codes does not matter and no dummy
# variables are needed.

LE_Y = LabelEncoder().fit(Y)
Y = LE_Y.transform(Y)

In [155]:
# Display the encoded target vector.
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [156]:
# split data into training and test sets

In [157]:
from sklearn.model_selection import train_test_split

In [166]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [167]:
# feature scaling
# features are not on same scale - many ML models are based on Euclidean
# distance. therefore Euclidean distance will be dominated by the larger 
# absolute values. Need to put the variables on the same scale.

# common ways: Standardization, Normalization

In [168]:
from sklearn.preprocessing import StandardScaler

In [169]:
# Scaler used to standardize the feature columns.
SC_X = StandardScaler()

In [170]:
# Learn the scaling statistics from the training rows only, then standardize
# the training rows with them.
X_train = SC_X.fit(X_train).transform(X_train)

In [171]:
# Reuse the statistics learned on the training set (transform only, no refit)
# so the test rows are scaled exactly like the training rows.
# Whether the dummy (one-hot) columns should be scaled too is a judgement call.
X_test = SC_X.transform(X_test)

In [172]:
# Display the standardized training features.
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [173]:
# Display the test features, scaled with the scaler fitted on the training set.
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])