In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [7]:
DATA_SET_PATH = './Data.csv'

df = pd.read_csv(DATA_SET_PATH)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [17]:
# iloc can take two parameters, the first one is for rows, and the second one is for columns
# : means all, :-1 means all but the last one
X = df.iloc[:, :-1].values
y = df.iloc[:, 3].values
X, y

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object))

In [23]:
# Take care of missing data
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [26]:
# Encode categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder_x = LabelEncoder()
# encode the first column Country using number, orderly
X[:, 0] = label_encoder_x.fit_transform(X[:, 0])
'''
Because LabelEncoder encodes thing from 0 - n-1, which is ordered. But we cannot say France is better than
Germany, so we need to further encode
encode the first column Country by flattening all the possible values to columns
say, we have three different countries, then there will be three columns after encoding,
and the value of each country will be either 1 or 0 to indicate which country that row is.
'''
one_hot_encoder = OneHotEncoder(categorical_features = [0])
X = one_hot_encoder.fit_transform(X).toarray()

label_encoder_y = LabelEncoder()
# encode the first column Country using number, orderly
y = label_encoder_x.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [30]:
# split the dataset into the training set and test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [33]:
# Feature scaling
'''
Because values under different columns could have very different scales, say one is always above 1000, while
another is below 100. This different scale makes it really hard to compare, and also will cost us accuracy.
Because lots of machine learning algorithms depend on Euclidean distance (namely, get the distance between two dots
with x,y as coordinates), if we have a column with a very large scale, then the other column with small scale
will be ignored, as the large scale value will be dominant
sqrt((squre(x1) - squre(x2)) + (squre(y1) - squre(y2))):
as you can see, if y is much larger than x, than x would not make any difference
'''
# StandardScaler is just to use z-score to transform
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

array([[ 1.        , -1.        ,  2.64575131, -0.77459667,  0.26306757,
         0.12381479],
       [-1.        ,  1.        , -0.37796447, -0.77459667, -0.25350148,
         0.46175632],
       [ 1.        , -1.        , -0.37796447,  1.29099445, -1.97539832,
        -1.53093341],
       [ 1.        , -1.        , -0.37796447,  1.29099445,  0.05261351,
        -1.11141978],
       [-1.        ,  1.        , -0.37796447, -0.77459667,  1.64058505,
         1.7202972 ],
       [ 1.        , -1.        , -0.37796447,  1.29099445, -0.0813118 ,
        -0.16751412],
       [-1.        ,  1.        , -0.37796447, -0.77459667,  0.95182631,
         0.98614835],
       [-1.        ,  1.        , -0.37796447, -0.77459667, -0.59788085,
        -0.48214934]])