In [1]:
import sys, os

# Register the local "Datasets" directory on the module search path. The
# membership check keeps the cell idempotent: re-running it does not append
# the same entry again.
datasets_dir = os.path.abspath("Datasets")
if datasets_dir not in sys.path:
    sys.path.append(datasets_dir)

In [2]:
# Print the raw CSV line by line. The with-block closes the file handle when
# the loop ends, and the trailing newline is stripped so print() does not
# emit a blank line after every row.
with open("Datasets/ML_a_z/Data.csv") as data_file:
    for line in data_file:
        print(line.rstrip("\n"))

Country,Age,Salary,Purchased

France,44,72000,No

Spain,27,48000,Yes

Germany,30,54000,No

Spain,38,61000,No

Germany,40,,Yes

France,35,58000,Yes

Spain,,52000,No

France,48,79000,Yes

Germany,50,83000,No

France,37,67000,Yes


### Importing the Libraries


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [4]:
# Preview the CSV as a DataFrame; the bare expression is shown as the cell output.
pd.read_csv("Datasets/ML_a_z/Data.csv")

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
# Importing the dataset: read the CSV into a DataFrame that the following
# cells slice into features and target.
dataset = pd.read_csv("Datasets/ML_a_z/Data.csv")

In [6]:
# Feature matrix: every row, all columns except the last, as a NumPy array.
X = dataset.iloc[:, :-1].values

In [7]:
# Target vector: the last column of every row, as a NumPy array.
Y = dataset.iloc[:, -1].values

In [8]:
# How to deal with missing data
from sklearn.preprocessing import Imputer
# Use Imputation transformer

In [9]:
imputer = Imputer(missing_values="NaN", strategy='mean',
                 axis=0)
# axis=0 : take the mean of the column

In [10]:
# Fit on every column after the first (column 0 holds the country strings),
# so the imputer learns the column means of the numeric columns.
imputer = imputer.fit(X[:, 1:])

In [11]:
# Write the imputed values back into the same columns of X.
X[:, 1:] = imputer.transform(X[:, 1:])

In [12]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [13]:
# How to deal with categorical data
# Country = [France, Spain, Germany]
# Purchased = [Yes, No]
# => Need to encode into numbers

In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
# Encoder that will map the country names to integer codes.
label_encoder_X = LabelEncoder()

In [16]:
# Replace the country strings in column 0 with their integer labels.
X[:, 0] = label_encoder_X.fit_transform(X[:, 0])

In [17]:
X

array([[0L, 44.0, 72000.0],
       [2L, 27.0, 48000.0],
       [1L, 30.0, 54000.0],
       [2L, 38.0, 61000.0],
       [1L, 40.0, 63777.77777777778],
       [0L, 35.0, 58000.0],
       [2L, 38.77777777777778, 52000.0],
       [0L, 48.0, 79000.0],
       [1L, 50.0, 83000.0],
       [0L, 37.0, 67000.0]], dtype=object)

Problem: machine learning models work on numbers, so text categories must be converted to numbers before they can enter the model's equations. With the label encoding above, Spain becomes 2 and France becomes 0, which a model can misread as Spain being greater (or more important) than France. That ordering makes no sense for country names. This is why we use dummy variables (one-hot encoding): each category gets its own 0/1 column.



In [18]:
from sklearn.preprocessing import OneHotEncoder

In [19]:
one_hot_encoder = OneHotEncoder(categorical_features=[0])

In [20]:
X = one_hot_encoder.fit_transform(X).toarray()

In [21]:
X

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

In [22]:
# Separate encoder for the target so its fitted classes stay independent of
# the encoder used for the features.
label_encoder_y = LabelEncoder()

In [23]:
Y 

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

In [24]:
# Encode the 'No'/'Yes' target labels as integers.
Y = label_encoder_y.fit_transform(Y)

In [25]:
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

### Splitting the dataset into training and test sets
- Why? A machine learning model learns the correlations in the data, and we need a way to check that it has really understood them.
- The model should learn the correlations, not memorize the training examples by heart (overfitting) => performance should be good both in-sample (training set) and out-of-sample (test set).

In [28]:
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

In [40]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

### Feature Scaling
- Problem: Age and Salary are not on the same scale, which can cause problems in a machine learning model
- Reason: many machine learning models are based on Euclidean distance. Salary has a much larger scale than Age, so the Euclidean distance will be dominated by Salary!
- So, they should be transformed onto the same scale
- There are 2 ways for scaling feature

1. **Standardisation**
$$x_{stand}= \frac{x - mean(x)}{\sigma(x)}$$

2. **Normalization**
$$x_{norm} = \frac{x - min(x)}{max(x) - min(x)}$$

In [41]:
# Feature scaling code
from sklearn.preprocessing import StandardScaler
# One scaler instance, reused for both the training and the test features.
sc_X = StandardScaler()

In [42]:
# Fit the scaler on the training features only and scale them in one step.
# Every column is scaled, including the one-hot country columns.
X_train = sc_X.fit_transform(X_train)
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [43]:
# Scale the test features with the statistics fitted on the training set
# (transform only, no refit).
X_test = sc_X.transform(X_test)

In [44]:
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])

### Data Preprocessing template
- Handling missing data is not always necessary, so it is an optional step in the template
- Feature scaling is also optional: whether it is needed depends on the library you use