# Importing the Libraries

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the Dataset

In [66]:
# Load the raw records; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="./01-data-preprocessing.csv")
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [67]:
# Matrix of features: every column except the last one (the target).
# .copy() makes this an independent frame, so the in-place assignments in
# later cells modify `independent` only and do not raise
# SettingWithCopyWarning against `dataset`.
independent = dataset.iloc[:, :-1].copy()
independent

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [68]:
# Dependent variable: the last column only, kept as a one-column DataFrame.
# Selecting with -1: (rather than a hard-coded position) mirrors the :-1 used
# for the features, so the two selections can never overlap or leave a gap.
# .copy() detaches the result from `dataset` before it is modified later.
dependent = dataset.iloc[:, -1:].copy()
dependent

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No
9,Yes


# Handling Missing Data

In [69]:
from sklearn.impute import SimpleImputer
# Replace each NaN with the mean of its column (computed when the imputer is fitted).
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

In [70]:
# Columns at positions 1 and 2 (Age, Salary) are the numeric features that
# contain missing values.
# NOTE(review): the imputer is fitted on all rows, before the train/test
# split further down, so test rows contribute to the fill values.
numeric_columns = independent.iloc[:, 1:3]
imputer.fit(numeric_columns)  # fit() returns the imputer itself
independent.iloc[:, 1:3] = imputer.transform(numeric_columns)
independent

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


# Encoding Categorical Data

In [71]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [72]:
# One-hot encode the nominal "Country" feature.
#
# OneHotEncoder accepts string categories directly, so no LabelEncoder pass is
# needed first (LabelEncoder is intended for target labels, and an integer
# coding would suggest an ordering between countries that does not exist).
oneHotEncoder = OneHotEncoder(handle_unknown='ignore')
country_dummies = oneHotEncoder.fit_transform(independent[["Country"]]).toarray()

# Label the dummy columns with *strings* and reuse the row index of
# `independent`. A frame whose column labels mix ints (0, 1, 2) with strings
# ("Age", "Salary") is rejected by scikit-learn estimators with a TypeError in
# recent releases (and triggered a FutureWarning before that); sharing the
# index keeps the join aligned row-for-row even if the index is not 0..n-1.
encoded_country = pd.DataFrame(
    country_dummies,
    columns=[str(position) for position in range(country_dummies.shape[1])],
    index=independent.index,
)

# Dummy columns first, followed by the remaining features; "Country" itself is
# dropped without mutating a frame in place.
independent = encoded_country.join(independent.drop(columns="Country"))

independent

Unnamed: 0,0,1,2,Age,Salary
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [73]:
# Encode the target labels as integers; as the output below shows,
# "No" becomes 0 and "Yes" becomes 1.
labelEncoder = LabelEncoder()
labelEncoder.fit(dependent["Purchased"])
dependent["Purchased"] = labelEncoder.transform(dependent["Purchased"])
dependent

Unnamed: 0,Purchased
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1



# Splitting Data into Training Set and Test Set

In [74]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
(
    independent_train,
    independent_test,
    dependent_train,
    dependent_test,
) = train_test_split(
    independent,
    dependent,
    test_size=0.3,
    random_state=0,
)

In [75]:
# Display the feature rows assigned to the training set.
independent_train

Unnamed: 0,0,1,2,Age,Salary
9,1.0,0.0,0.0,37.0,67000.0
1,0.0,0.0,1.0,27.0,48000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
3,0.0,0.0,1.0,38.0,61000.0
0,1.0,0.0,0.0,44.0,72000.0
5,1.0,0.0,0.0,35.0,58000.0


In [76]:
# Display the target values assigned to the training set.
dependent_train

Unnamed: 0,Purchased
9,1
1,1
6,0
7,1
3,0
0,0
5,1


In [77]:
# Display the feature rows assigned to the test set.
independent_test

Unnamed: 0,0,1,2,Age,Salary
2,0.0,1.0,0.0,30.0,54000.0
8,0.0,1.0,0.0,50.0,83000.0
4,0.0,1.0,0.0,40.0,63777.777778


In [78]:
# Display the target values assigned to the test set.
dependent_test

Unnamed: 0,Purchased
2,0
8,0
4,1


# Feature Scaling

In [79]:
from sklearn.preprocessing import StandardScaler
from warnings import simplefilter

# Silence FutureWarning messages for the rest of the session so they do not
# clutter the cell outputs below.
# NOTE(review): this hides every FutureWarning, including ones that point at a
# real upcoming breakage — consider narrowing the filter.
simplefilter("ignore", FutureWarning)

In [80]:
# Fit the scaler on the training rows only, then standardise those same rows.
sc = StandardScaler().fit(independent_train)

# NOTE(review): wrapping the result in a bare DataFrame replaces the column
# names and the original row index with 0..n-1 positions, as the output shows.
independent_train = pd.DataFrame(sc.transform(independent_train))
independent_train

Unnamed: 0,0,1,2,3,4
0,0.866025,0.0,-0.866025,-0.202981,0.448971
1,-1.154701,0.0,1.154701,-1.821689,-1.417064
2,-1.154701,0.0,1.154701,0.084789,-1.024215
3,0.866025,0.0,-0.866025,1.577598,1.627519
4,-1.154701,0.0,1.154701,-0.04111,-0.140303
5,0.866025,0.0,-0.866025,0.930115,0.940033
6,0.866025,0.0,-0.866025,-0.526723,-0.43494


In [81]:
# Scale the test rows with the scaler fitted on the training set
# (transform only — the scaler is not refitted on test data).
independent_test = pd.DataFrame(data=sc.transform(independent_test))
independent_test

Unnamed: 0,0,1,2,3,4
0,-1.154701,1.0,-0.866025,-1.336077,-0.82779
1,-1.154701,1.0,-0.866025,1.90134,2.020369
2,-1.154701,1.0,-0.866025,0.282632,0.132509
