In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the raw data set; the path is resolved relative to the working directory.
dataset = pd.read_csv(filepath_or_buffer="simple_data.csv")

In [3]:
# Show the loaded frame; note the missing (NaN) entries in Age and Salary.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Feature matrix: every row, every column except the last one.
X = dataset.iloc[:, :-1].to_numpy()

In [5]:
X  # in ML terminology this is the "matrix of features": one row per sample, one column per feature

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
# Dependent variable: every row of the last column only.
y = dataset.iloc[:, -1].to_numpy()

In [7]:
y  # one 'Yes'/'No' label per row of the data set

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

Suppose we have missing values in our dataset - we need a way to deal with that. A majority of time is often spent cleaning the data and structuring it in a way that is suitable to run ML algorithms. The following will illustrate how to deal with missing data.

In [8]:
from sklearn.preprocessing import Imputer # this is a class! - refresh OOP if this confuses you.

In [9]:
imputer = Imputer(missing_values='NaN', strategy="mean", axis=0)
#replaces missing values which are represented as NaN with the mean
#of the column. We know we are dealing with a column as the axis=0
#if axis=1 then we would be dealing with rows. Makes sense for column
#here as we are dealing with features

Now we need to fit this imputer object to our data!

In [10]:
# Fit on the two numeric columns only (indices 1 and 2 -- age and salary;
# the slice 1:3 excludes index 3). Fitting just learns the column means;
# transforming is what produces the imputed values. fit_transform does
# both in one call, and the result is written straight back into X.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [11]:
X  # the former NaN entries in the age and salary columns now hold the column means

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

Now we are going to learn how to encode categorical data.

Country, Purchased etc.

In [12]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder #this is a class!

In [13]:
# Encoder that will map each distinct country name to an integer code.
labelencoder_x = LabelEncoder()

In [14]:
# Learn the country -> integer mapping from column 0, then overwrite that
# column with the integer codes.
X[:, 0] = labelencoder_x.fit(X[:, 0]).transform(X[:, 0])

In [15]:
X  # the country column now holds integer codes (France -> 0, Germany -> 1, Spain -> 2)

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

Note - we use the labelencoder before onehotencoder in order to turn strings into integers (i.e. 'Spain' --> 2). It is only after we have turned these categorical variables into numeric types that we can turn them into dummy variables with onehotencoder.

Now, note how the encoding has some form of value, in that the computer might think that 2 (Spain) is higher in value than 0 (France). We need to correct this. If our categories were small, medium, large, then we could use this form of numbering, as one really is bigger than the other.

The answer to solving this is Dummy Variables!

In [16]:
onehotencoder = OneHotEncoder(categorical_features=[0])
#we have to specify which columns contain the categorical stuff.
#countries is in index 0

In [17]:
X = onehotencoder.fit_transform(X).toarray()
#no need to respecify where in X ^^ as we did that already with [0]

In [18]:
X  # three 0/1 dummy columns for the country, followed by age and salary

array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.0000000

In [19]:
# Wrap the matrix in a DataFrame purely for nicer display.
V = pd.DataFrame(X)
# Show no decimal places. The fully qualified option name is required:
# the bare 'precision' is ambiguous in recent pandas (it also matches
# 'styler.format.precision') and raises an OptionError.
pd.set_option('display.precision', 0)

In [20]:
V  # you can see from here that 3 columns (0, 1, 2) were added for the dummies

Unnamed: 0,0,1,2,3,4
0,1,0,0,44,72000
1,0,0,1,27,48000
2,0,1,0,30,54000
3,0,0,1,38,61000
4,0,1,0,40,63778
5,1,0,0,35,58000
6,0,0,1,39,52000
7,1,0,0,48,79000
8,0,1,0,50,83000
9,1,0,0,37,67000


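Column 0 corresponds to France, 1 to Germany and 2 to Spain (the same order as the label encoding: France = 0, Germany = 1, Spain = 2). Columns 3 and 4 are age and salary.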

That's all there is to encoding.

Now we need to do the same for the dependent variable y. However, we don't need to one-hot encode anything here, as we are using ML and the algorithms know that y is categorical. Furthermore, there is a relational meaning between 0 and 1.

As such, we just use label encoder

In [22]:
# A separate encoder for the labels, so its mapping is independent of labelencoder_x.
labelencoder_y = LabelEncoder()

In [23]:
# Learn the label -> integer mapping, then replace y with the integer codes.
y = labelencoder_y.fit(y).transform(y)

In [24]:
y  # the labels as integer codes

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

0 is no and 1 is yes

Also, note how we used the labelencoder above and NOT the onehotencoder. This is a source of confusion sometimes.

Labelencoder: String -> int/float (if binary then we can stop here), if not binary and we don't want to imply value distinction --> onehotencoder to create dummies.

Now we are going to split the dataset into training and test set.

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the rows for testing. Fixing random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

^^ 20% of the data will be used for testing, thus it is obvious that the training set is 0.8

Now we will learn about feature scaling.

Age and salary have vastly different values, and ML algorithms often use Euclidean distances. Thus the square of a salary value will vastly outweigh the effect of age^2 - so we scale our values to fix this issue.

There are two types of feature scaling:

- Standardisation is:
    (x-mean(x))/sd(x)

- Normalization is:
    (x-min(x))/(max(x)-min(x))

Even in instances where ML algos don't use Euclidean distances, feature scaling can still help, as it often allows the algorithm to converge faster. This matters most for gradient-based methods; tree-based models such as decision trees are largely insensitive to the scale of the features (we will come back to this much later).

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
# Scaler for the feature matrix; it is fitted in the next cell.
sc_X = StandardScaler()

In [30]:
# Fit the scaler on the training set only, then transform the training set.
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
# The test set is only transformed, reusing the fit from the training set.
X_test = sc_X.transform(X_test)
# There is debate over whether the dummy variables should be scaled; it
# depends on the context and on how much interpretability matters.
# Scaling a dummy loses the plain reading of "1 = this country", which
# makes interpretation awkward, but it can be beneficial for the model.
# Since we are not going to interpret the features here, we scale
# everything, including the dummies.

In [31]:
X_train  # the scaled training features (the 80% share of the rows)

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [32]:
X_test  # the held-out rows, scaled with the scaler fitted on the training set

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])

Do we apply feature scaling to the dependent variable? No, in this instance we do not, because we have a classification problem with a categorical dependent variable. However, later we will see some regression problems where the dependent variable can take a large range of values - then we will deal with feature scaling.

A confusion that might arise is: why do we type the following code?

In [33]:
# X_train = sc_X.fit_transform(X_train)

Should we not fit and transform the test set too? The answer is NO!

The .fit_transform(X_train) call first finds the 'fit' of the training set - for StandardScaler that means each column's mean and standard deviation - and then, based on that fit, transforms the data (feature scaling). The fit is 'stored' in our StandardScaler object and thus we needn't fit AND transform our X_test; rather, we just need to transform it based on the stored fit.

An alternative way to write that code segment would be the following:

In [34]:
# sc_X = StandardScaler()
# sc_X.fit(X_train)
# X_train = sc_X.transform(X_train)
# X_test = sc_X.transform(X_test)

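Another concern might be: why don't we fit on X rather than on X_train? This is a valid question. The reason is that the test set stands in for data the model has never seen, so nothing learned from it - including the mean and standard deviation used for scaling - should influence the preprocessing. Since X_train represents 80% of X, the two have very similar characteristics anyway, so little is lost by fitting on X_train alone.

For comparison, fitting on all of X would be written as follows. Note that this lets test-set statistics leak into the scaling, so the version fitted on X_train is the one to prefer: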

In [35]:
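# NOTE: fitting on all of X lets the test rows influence the scaling
# statistics (data leakage); prefer fitting on X_train only.
# sc_X = StandardScaler()
# sc_X.fit(X)
# X_train = sc_X.transform(X_train)
# X_test = sc_X.transform(X_test)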