In [1]:
# importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Importing the dataset

In [2]:
# Read the raw table from data.csv (path is relative to the working directory).
dataset = pd.read_csv(filepath_or_buffer='data.csv')
# Bare last expression so the notebook renders the frame.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


#### Building the feature matrix (independent variables) and the target vector (dependent variable)

In [3]:
# Feature matrix: every column except the last one, as a NumPy array.
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.to_numpy()
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [4]:
# Target vector: the last column of the table, i.e. exactly the column that the
# feature matrix X leaves out. Indexing with -1 (instead of a hard-coded 3)
# keeps X and Y complementary even if feature columns are added or removed.
Y = dataset.iloc[:, -1].values
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Handling the missing data in our dataset

1. Fill the missing values in each column with the mean of that column. This can be done with scikit-learn's `sklearn.impute` module, which provides a class called `SimpleImputer`.

In [5]:
# importing the required library

from sklearn.impute import SimpleImputer

In [15]:
# Split X into its numeric and categorical parts and preview mean imputation.
imputer = SimpleImputer(strategy="mean")

# .copy() matters here: a plain slice/reshape of X is a NumPy *view*, so the
# in-place writes to X made by other cells (imputed values, label-encoded
# countries) would silently change these arrays as well.
X_numerical = X[:, 1:3].copy()                 # Age and Salary columns
X_categorical = X[:, 0].reshape(-1, 1).copy()  # Country as a 2-D column

# Learn the per-column means, then show the matrix with NaNs replaced by them.
imputer.fit(X_numerical)
print(imputer.transform(X_numerical))

[[4.40000000e+01 7.20000000e+04]
 [2.70000000e+01 4.80000000e+04]
 [3.00000000e+01 5.40000000e+04]
 [3.80000000e+01 6.10000000e+04]
 [4.00000000e+01 6.37777778e+04]
 [3.50000000e+01 5.80000000e+04]
 [3.87777778e+01 5.20000000e+04]
 [4.80000000e+01 7.90000000e+04]
 [5.00000000e+01 8.30000000e+04]
 [3.70000000e+01 6.70000000e+04]]


In [7]:
from sklearn.impute import SimpleImputer

In [8]:
# Mean-impute the numeric columns and write the result back into X.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X_numerical)

numeric_cols = slice(1, 3)  # positions of Age and Salary inside X
X[:, numeric_cols] = X_imputed
print(X[:, numeric_cols])

[[44.0 72000.0]
 [27.0 48000.0]
 [30.0 54000.0]
 [38.0 61000.0]
 [40.0 63777.77777777778]
 [35.0 58000.0]
 [38.77777777777778 52000.0]
 [48.0 79000.0]
 [50.0 83000.0]
 [37.0 67000.0]]


## Encoding Categorical Data

In machine learning, models work with numerical data, so categorical values can cause problems. We therefore convert the categorical data to numbers; this conversion is called encoding.

In Python, scikit-learn provides the `LabelEncoder` class to perform this encoding.

In [9]:
# importing the library

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [10]:
# Replace the country names in column 0 of X with integer codes (in place).
labelencoder = LabelEncoder()
country_codes = labelencoder.fit_transform(X[:, 0])
X[:, 0] = country_codes
print(X)

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


#### One Hot Encoder to create the dummy variables

In [16]:
# One-hot encode the country column and rebuild X as
# [dummy columns..., Age, Salary].
onehotencoder = OneHotEncoder()
# fit_transform returns a sparse matrix; densify it so it can be concatenated.
encoded_categorical_data = onehotencoder.fit_transform(X_categorical).toarray()

# Take the two numeric columns from the *end* of X rather than X[:, 1:].
# On the first run this is the same data, but it also makes the cell
# idempotent: re-running it no longer stacks a second set of dummy columns
# in front of the previous ones.
X = np.concatenate((encoded_categorical_data, X[:, -2:]), axis=1)
print(X)



[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [17]:
# Encode the target vector Y as integer class labels ('No' -> 0, 'Yes' -> 1).
labelencoder = LabelEncoder()
encoded_target = labelencoder.fit_transform(Y)
Y = encoded_target
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])