In [28]:
import sklearn.preprocessing
import pandas as pd
import numpy as np

In [26]:
import os

# Sanity check: list the working directory to confirm the CSV sits next to
# this notebook before trying to load it.
workdir_entries = os.listdir('.')
print(workdir_entries)

['CleanupData.csv', 'DataPreprocessing.ipynb', '.ipynb_checkpoints']


In [3]:
# Load the dataset from the CSV file in the notebook's own directory.
csv_path = './CleanupData.csv'
dataFrame = pd.read_csv(csv_path)

In [27]:
# Preview the first five rows to check the columns and spot missing values.
dataFrame.head(n=5)

Unnamed: 0,State,Age,Pocket Money,Course Purchased
0,Delhi,34.0,7200.0,No
1,Mumbai,17.0,4800.0,Yes
2,Banglore,20.0,5400.0,No
3,Mumbai,28.0,6100.0,No
4,Banglore,30.0,,Yes


## The independent matrix (features, X) and the dependent vector (target, y) play the same roles as independent and dependent variables

In [5]:
# Independent (feature) matrix: every column except the last one.
feature_columns = dataFrame.iloc[:, :-1]
# .values hands back a NumPy array instead of a pandas DataFrame.
X = feature_columns.values
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, nan],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', nan, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [29]:
# Dependent vector (target): the last column, 'Course Purchased'.
# Index with -1 rather than a hard-coded 3 so this stays in step with the
# `[:, :-1]` slice used for X if columns are ever added or removed.
# `.values` returns a 1-D NumPy array instead of a pandas Series.
y = dataFrame.iloc[:, -1].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Fill in NaN values (mean imputation)

In [30]:
# Replace missing values in the two numeric columns (index 1 and 2) with the
# mean of their column.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. `SimpleImputer` is its replacement; it always works
# column-wise, so the old `axis=0` argument is gone, and the missing-value
# marker is the real `np.nan` rather than the string 'NaN'.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# X is an object-dtype array (it also holds the State strings), so cast the
# numeric slice to float before handing it to the imputer.
# fit_transform learns the column means and fills the gaps in one step.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3].astype(float))
X

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

## Convert 'Delhi', 'Mumbai', 'Bangalore' to integer codes from 0 through 2, each city getting a unique number.

In [31]:
# Label-encode the State column (column 0): each distinct city name is
# replaced by an integer code. The codes are category identifiers, not
# measurements on a continuous scale.
from sklearn.preprocessing import LabelEncoder

labelencoder_X = LabelEncoder()
state_codes = labelencoder_X.fit_transform(X[:, 0])
X[:, 0] = state_codes
X

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

## We do not want the model to infer that Delhi > Mumbai, but the integer codes imply such an ordering and can bias the model, so we use OneHotEncoder to remove it

In [9]:
# Integer codes imply an ordering (2 > 1 > 0) that the states do not have,
# which could bias a model. One-hot encoding replaces the single code column
# with one 0/1 dummy column per state, so no state ranks above another.
#
# `OneHotEncoder(categorical_features=[0])` was deprecated in scikit-learn
# 0.20 and removed in 0.22. The supported way to encode only column 0 and
# leave the rest untouched is a ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

onehotencoder = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [0])],
    # Keep the remaining (numeric) columns; they are appended after the
    # dummy columns, the same layout the old `categorical_features` gave.
    remainder='passthrough',
    # Keep the combined result sparse (whenever it contains at least one
    # zero) so that code calling `.toarray()` on the output keeps working.
    sparse_threshold=1.0,
)

In [10]:
# Fit the encoder and expand column 0 into dummy columns, then densify the
# result into a regular NumPy array via .toarray().
# NOTE: X is overwritten with its own encoded version, so running this cell a
# second time would encode the already-encoded matrix again.
encoded_sparse = onehotencoder.fit_transform(X)
X = encoded_sparse.toarray()

In [11]:
# This is a global NumPy print setting, so it also affects every array
# displayed after this cell.
np.set_printoptions(suppress=True) # Print floats in fixed-point instead of scientific notation; makes the matrix easier to read.
X

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

In [12]:
# Inspect the target before encoding it in the next step (the stored output
# still shows the raw 'Yes'/'No' strings).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [32]:
# Encode the target labels as integers (stored output: 'No' -> 0, 'Yes' -> 1).
# LabelEncoder is imported in the earlier State-encoding cell.
labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Split the dataset into training and testing dataset

In [21]:
# Split the dataset into training (80%) and testing (20%) subsets.
from sklearn.model_selection import train_test_split

# test_size is given explicitly (older scikit-learn releases warn when only
# train_size is set), and random_state pins the shuffle so that a
# Restart & Run All reproduces exactly the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [34]:
# NOTE(review): the stored output of this cell is already standardized and is
# identical to the output shown after the feature-scaling cell, so it was
# evidently executed after scaling (execution count 34 vs 23). On a fresh
# top-to-bottom run this cell would display the unscaled training split.
X_train

array([[-0.57735027, -0.77459667,  1.29099445,  0.4984126 , -0.99185042],
       [ 1.73205081, -0.77459667, -0.77459667,  0.73678385,  0.57151232],
       [ 1.73205081, -0.77459667, -0.77459667, -1.21352633, -0.72637373],
       [-0.57735027,  1.29099445, -0.77459667,  1.51690792,  1.6629165 ],
       [-0.57735027, -0.77459667,  1.29099445, -1.79861939, -1.5228038 ],
       [-0.57735027, -0.77459667,  1.29099445,  0.34672181,  0.20279469],
       [-0.57735027,  1.29099445, -0.77459667,  0.15169079,  0.99922477],
       [-0.57735027,  1.29099445, -0.77459667, -0.23837124, -0.19542034]])

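## Feature scaling (standardization) rescales each feature column to zero mean and unit variance; the values are not confined to the range -1 to 1. Here it is only needed for X, because y is already a 0/1 class label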

In [23]:
# Feature scaling (standardization)
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
# Learn the per-column mean and standard deviation from the training rows
# only, then apply that same transformation to both splits. The test set is
# only transformed, never fitted, so none of its statistics leak into
# preprocessing.
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [24]:
# Training features after standardization by the scaler fitted above.
X_train

array([[-0.57735027, -0.77459667,  1.29099445,  0.4984126 , -0.99185042],
       [ 1.73205081, -0.77459667, -0.77459667,  0.73678385,  0.57151232],
       [ 1.73205081, -0.77459667, -0.77459667, -1.21352633, -0.72637373],
       [-0.57735027,  1.29099445, -0.77459667,  1.51690792,  1.6629165 ],
       [-0.57735027, -0.77459667,  1.29099445, -1.79861939, -1.5228038 ],
       [-0.57735027, -0.77459667,  1.29099445,  0.34672181,  0.20279469],
       [-0.57735027,  1.29099445, -0.77459667,  0.15169079,  0.99922477],
       [-0.57735027,  1.29099445, -0.77459667, -0.23837124, -0.19542034]])

In [25]:
# Test features transformed with the scaler that was fitted on the training
# rows; the values are not bounded, since the mean/std come from X_train only.
X_test

array([[ 1.73205081, -0.77459667, -0.77459667,  2.68709402,  3.1230383 ],
       [-0.57735027,  1.29099445, -0.77459667,  2.29703199,  2.59208492]])

# Nais!