In [0]:
# import our data

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

# change to the datasets folder present in the mounted drive
%cd /content/drive/My Drive/DataSets

In [0]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plotting is visible
%matplotlib inline

In [0]:
# import dataset
dataFrame= pd.read_csv('CleanupData.csv')

In [4]:
dataFrame

Unnamed: 0,State,Age,Pocket Money,Course Purchased
0,Delhi,34.0,7200.0,No
1,Mumbai,17.0,4800.0,Yes
2,Banglore,20.0,5400.0,No
3,Mumbai,28.0,6100.0,No
4,Banglore,30.0,,Yes
5,Delhi,25.0,5800.0,Yes
6,Mumbai,,5200.0,No
7,Delhi,38.0,7900.0,Yes
8,Banglore,40.0,8300.0,No
9,Delhi,27.0,6700.0,Yes


In [0]:
# getting independent and dependent matrix out of dataset
X= dataFrame.iloc[ :, :-1 ].values

In [6]:
# matrix of independent variable
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, nan],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', nan, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [7]:
# matrix of dependent variable
y= dataFrame.iloc[:, 3].values
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### how to handle missing values

In [0]:
from sklearn.impute import SimpleImputer
imputer= SimpleImputer(missing_values=np.NaN, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3]= imputer.transform(X[:, 1:3])

In [9]:
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, 6377.777777777777],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', 28.77777777777778, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [10]:
# handle categorical data in our data
from sklearn.preprocessing import LabelEncoder
labelencoder_X= LabelEncoder()
X[:, 0]= labelencoder_X.fit_transform(X[:, 0])
X

array([[1, 34.0, 7200.0],
       [2, 17.0, 4800.0],
       [0, 20.0, 5400.0],
       [2, 28.0, 6100.0],
       [0, 30.0, 6377.777777777777],
       [1, 25.0, 5800.0],
       [2, 28.77777777777778, 5200.0],
       [1, 38.0, 7900.0],
       [0, 40.0, 8300.0],
       [1, 27.0, 6700.0]], dtype=object)

In [11]:
# create dummy matrix for categorical dataset
from sklearn.preprocessing import OneHotEncoder
onehotencoder= OneHotEncoder(categorical_features=[0])
X= onehotencoder.fit_transform(X).toarray()
np.set_printoptions(suppress=True)
X

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

In [12]:
# handle y matrix for categorical data
labelencoder_y= LabelEncoder()
y= labelencoder_y.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [0]:
# prepare test and training dataset
# split our data into two parts, training and test data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)

In [14]:
X_train

array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ]])

In [15]:
y_train

array([0, 0, 0, 0, 0, 1, 1, 1])

## Feature Sacling (sometimes optional)

In [0]:
# Standard Deviation or normalization

from sklearn.preprocessing import StandardScaler
scale_X= StandardScaler()
X_train= scale_X.fit_transform(X_train)
X_test= scale_X.transform(X_test)

In [17]:
X_train

array([[-0.57735027,  1.29099445, -0.77459667,  0.6730752 ,  0.70662458],
       [ 1.73205081, -0.77459667, -0.77459667,  1.45681774,  1.60782695],
       [-0.57735027, -0.77459667,  1.29099445, -0.00907109, -0.93192517],
       [ 1.73205081, -0.77459667, -0.77459667, -1.15565741, -0.7680702 ],
       [-0.57735027, -0.77459667,  1.29099445, -0.11066735, -0.19457778],
       [-0.57735027,  1.29099445, -0.77459667, -0.50253862, -0.44036025],
       [-0.57735027, -0.77459667,  1.29099445, -1.54752868, -1.25963513],
       [-0.57735027,  1.29099445, -0.77459667,  1.19557023,  1.280117  ]])