In [67]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [106]:
# Load the raw dataset from the CSV file and display it.
data = pd.read_csv(filepath_or_buffer = '../datasets/Data.csv')
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [107]:
# Feature matrix: every column except the target, as a NumPy array.
X = np.array(data.drop(columns = 'Purchased'))
# Target vector: the 'Purchased' column.
y = data['Purchased']

# Filling 'NaN' values

In [108]:
from sklearn.preprocessing import Imputer

In [109]:
imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [110]:
# Inspect X after imputation: the missing Age and Salary entries are now filled in.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 61000.0],
       ['France', 35.0, 58000.0],
       ['Spain', 38.0, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Categorical Data

In [111]:
from sklearn.preprocessing import LabelEncoder

## Label Encoder

In [112]:
# Learn the integer code of each distinct value in column 0 (Country) ...
labelEnconder_X = LabelEncoder().fit(X[:, 0])
# ... then overwrite the column with those codes.
X[:, 0] = labelEnconder_X.transform(X[:, 0])

In [113]:
X # Column 0 now holds integer codes in place of the category names.
# Problem: a model may learn that these 3 categories have an order
# between themselves, but Germany is not 'greater' than France.
# ONLY use label encoding when the categories really do have an
# inherent order.

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 61000.0],
       [0, 35.0, 58000.0],
       [2, 38.0, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

## Dummy Encoding

In [114]:
from sklearn.preprocessing import OneHotEncoder

In [115]:
# The `categorical_features` argument was deprecated in scikit-learn 0.20 and
# removed in 0.22. Encode column 0 (Country) on its own instead and place the
# dummy columns in front of the remaining columns, which is the same layout
# the old argument produced.
oneHotEncoder = OneHotEncoder()
X = np.hstack([
    oneHotEncoder.fit_transform(X[:, [0]]).toarray(),
    # X has dtype=object, so cast the remaining columns to float before stacking.
    X[:, 1:].astype(float),
])

In [116]:
# Inspect X after one-hot encoding: the first three columns are the Country
# dummy variables, followed by Age and Salary.
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 4.4e+01, 7.2e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+01, 4.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 6.1e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 4.0e+01, 6.1e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.5e+01, 5.8e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 5.2e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.8e+01, 7.9e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04]])

Never use all of your dummy variables in a model. Any one of them is fully determined by the others, so keeping all of them is redundant; it is simpler to exclude one. Including every dummy variable is called the "Dummy Variable Trap".

In [117]:
# Learn the integer code of each class label in the target ...
labelEnconder_y = LabelEncoder().fit(y)
# ... then replace the labels with those codes.
y = labelEnconder_y.transform(y)

In [118]:
# Inspect the encoded target: 'No' became 0 and 'Yes' became 1.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting Data

In [119]:
from sklearn.model_selection import train_test_split

In [120]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 0
)

In [121]:
# Report the size of the full data set and of each split.
print(
    'Data set:', len(data),
    '\nX_train:', len(X_train), '- X_test:', len(X_test),
    '\ny_train:', len(y_train), '- y_test:', len(y_test),
)

Data set: 10 
X_train: 8 - X_test: 2 
y_train: 8 - y_test: 2


# Feature Scaling

In [122]:
from sklearn.preprocessing import StandardScaler

In [123]:
# Fit the scaler on the training features only ...
sc_X = StandardScaler().fit(X_train)
# ... and apply that same fitted scaling to both splits.
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [124]:
# Inspect the standardized training features.
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.27978024, -0.13108063],
       [ 1.        , -0.37796447, -0.77459667, -0.23673712,  0.49810638],
       [-1.        , -0.37796447,  1.29099445, -1.95846165, -1.49431914],
       [-1.        , -0.37796447,  1.29099445, -0.06456467, -1.07486114],
       [ 1.        , -0.37796447, -0.77459667,  1.65715986,  1.75648039],
       [-1.        , -0.37796447,  1.29099445, -0.06456467, -0.13108063],
       [ 1.        , -0.37796447, -0.77459667,  0.96847005,  1.02242888],
       [ 1.        , -0.37796447, -0.77459667, -0.58108203, -0.44567413]])

In [125]:
# Inspect the test features, scaled with the scaler that was fitted on the training set.
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.44194429, -0.86513213],
       [-1.        ,  2.64575131, -0.77459667,  2.00150476,  2.1759384 ]])

In [126]:
# We don't need to apply feature scaling to the class (dependent) variable
# here, because it is categorical. In a regression problem, where the
# dependent variable can take a huge range of values, we may need to
# apply feature scaling to it as well.