In [1]:
# importing os module to verify the dataset file
import os
os.listdir('./dataset/')

['CleanupData.csv']

### importing necessary libraries

In [2]:
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# for inline plotting
%matplotlib inline

# seaborn importation
import seaborn as sns

### Loading dataset

In [3]:
# creating a dataFrame var to store dataset
dataFrame = pd.read_csv(filepath_or_buffer = './dataset/CleanupData.csv')

### Getting dataFrame information

In [4]:
# checking DF info
dataFrame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
State               10 non-null object
Age                 9 non-null float64
Pocket Money        9 non-null float64
Course Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 280.0+ bytes


In [5]:
# checking the DF feaures and labels
dataFrame

Unnamed: 0,State,Age,Pocket Money,Course Purchased
0,Delhi,34.0,7200.0,No
1,Mumbai,17.0,4800.0,Yes
2,Banglore,20.0,5400.0,No
3,Mumbai,28.0,6100.0,No
4,Banglore,30.0,,Yes
5,Delhi,25.0,5800.0,Yes
6,Mumbai,,5200.0,No
7,Delhi,38.0,7900.0,Yes
8,Banglore,40.0,8300.0,No
9,Delhi,27.0,6700.0,Yes


### Separating independent and dependent variables

In [6]:
# getting independent matrix part from DF
X = dataFrame.iloc[ : , :-1 ].values

In [7]:
# checking the independent matrix
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, nan],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', nan, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

In [8]:
# getting dependent matrix part from DF
y = dataFrame.iloc[ : , -1 ].values

In [9]:
# checking the dependent matrix
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Handling missing values

In [10]:
# handling missing NaN values in the age and pocket money features
from sklearn.impute import SimpleImputer

In [11]:
sImputer = SimpleImputer(missing_values = np.NaN, strategy = 'mean')

In [12]:
sImputer.fit(X[ : , 1:])
X[ : , 1:] = sImputer.transform(X[ : , 1:])
X

array([['Delhi', 34.0, 7200.0],
       ['Mumbai', 17.0, 4800.0],
       ['Banglore', 20.0, 5400.0],
       ['Mumbai', 28.0, 6100.0],
       ['Banglore', 30.0, 6377.777777777777],
       ['Delhi', 25.0, 5800.0],
       ['Mumbai', 28.77777777777778, 5200.0],
       ['Delhi', 38.0, 7900.0],
       ['Banglore', 40.0, 8300.0],
       ['Delhi', 27.0, 6700.0]], dtype=object)

### Handling categorical data in X matrix

In [13]:
# handling categorical features in X matrix
from sklearn.preprocessing import LabelEncoder
labelEncoder_X = LabelEncoder()
X[ : , 0] = labelEncoder_X.fit_transform(X[ : , 0])
X

array([[1, 34.0, 7200.0],
       [2, 17.0, 4800.0],
       [0, 20.0, 5400.0],
       [2, 28.0, 6100.0],
       [0, 30.0, 6377.777777777777],
       [1, 25.0, 5800.0],
       [2, 28.77777777777778, 5200.0],
       [1, 38.0, 7900.0],
       [0, 40.0, 8300.0],
       [1, 27.0, 6700.0]], dtype=object)

### Creating dummy matrix to equalize the X[ : , 0] using OneHotEncoder

In [14]:
# creating dummy matrix for X
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
np.set_printoptions(suppress = True)


# checking the dummy matrix of X
X

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[   0.        ,    1.        ,    0.        ,   34.        ,
        7200.        ],
       [   0.        ,    0.        ,    1.        ,   17.        ,
        4800.        ],
       [   1.        ,    0.        ,    0.        ,   20.        ,
        5400.        ],
       [   0.        ,    0.        ,    1.        ,   28.        ,
        6100.        ],
       [   1.        ,    0.        ,    0.        ,   30.        ,
        6377.77777778],
       [   0.        ,    1.        ,    0.        ,   25.        ,
        5800.        ],
       [   0.        ,    0.        ,    1.        ,   28.77777778,
        5200.        ],
       [   0.        ,    1.        ,    0.        ,   38.        ,
        7900.        ],
       [   1.        ,    0.        ,    0.        ,   40.        ,
        8300.        ],
       [   0.        ,    1.        ,    0.        ,   27.        ,
        6700.        ]])

### Handling categorical feature of y matrix

In [15]:
# encoding labels of y matrix
labelEncoder_y = LabelEncoder()
y = labelEncoder_y.fit_transform(y)

# checking y matrix
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

### Preparing training and test dataset

In [16]:
# splitting the X and y matrix into training and test dataset using sklearn

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, train_size = 0.8)

In [19]:
# checking train and test dataset

X_test

array([[   1.,    0.,    0.,   20., 5400.],
       [   0.,    1.,    0.,   34., 7200.]])

In [20]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1])

### Feature Scaling for X_train and X_test

In [21]:
# feature scaling the X_train and X_test dataset for ML model using SD or Normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [22]:
# checking the X_train

X_train

array([[-0.57735027, -0.77459667,  1.29099445, -0.18029153, -0.26067572],
       [-0.57735027, -0.77459667,  1.29099445, -1.80291533, -1.40082746],
       [-0.57735027,  1.29099445, -0.77459667, -0.6228253 , -0.52378766],
       [-0.57735027,  1.29099445, -0.77459667, -0.32780279,  0.26554816],
       [ 1.73205081, -0.77459667, -0.77459667,  0.11473098, -0.01705355],
       [-0.57735027, -0.77459667,  1.29099445, -0.06556056, -1.05001154],
       [ 1.73205081, -0.77459667, -0.77459667,  1.58984352,  1.66881184],
       [-0.57735027,  1.29099445, -0.77459667,  1.29482101,  1.31799592]])

In [23]:
X_test

array([[ 1., -1.,  0., -1., -1.],
       [-1.,  1.,  0.,  1.,  1.]])