# Import Libraries

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import Dataset

In [7]:
# Load the raw table: every column except the last is a feature, and the last
# column is the target.
dataset = pd.read_csv('Data.csv')
# .copy() makes X an independent frame, so the later in-place assignment into
# X.iloc[:, 1:3] does not write through a slice of `dataset` (which raises
# SettingWithCopyWarning on pandas versions without copy-on-write).
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]

In [8]:
# Show the feature frame as loaded, before any imputation or encoding.
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [9]:
# Show the target series (the last column of the dataset), still unencoded.
print(y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


# Handle missing data

In [11]:
from sklearn.impute import SimpleImputer

# Replace NaNs in the columns at positions 1 and 2 with that column's mean.
# NOTE(review): the means are learned from every row, i.e. before the
# train/test split further down, so test rows influence the fill values.
# Confirm that this is intended.
columns_to_fill = X.iloc[:, 1:3]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(columns_to_fill)
X.iloc[:, 1:3] = imputer.transform(columns_to_fill)

In [12]:
# Show X again so the values filled in by the imputer are visible.
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


# Encode categorical data

## Encode independent variable

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass every other column through unchanged.
# sparse_threshold=0 forces a dense result. With the default threshold, a
# low-density output comes back as a SciPy sparse matrix, and np.array() would
# then wrap it in a 0-d object array instead of producing a 2-D numeric array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# X changes from a DataFrame to a NumPy array here, so this cell is meant to
# run once per kernel session.
X = np.array(ct.fit_transform(X))

In [15]:
# Print the encoded matrix one row per line, each value with two decimals.
for encoded_row in X:
    print(' '.join(f"{value:.2f}" for value in encoded_row))

1.00 0.00 0.00 44.00 72000.00
0.00 0.00 1.00 27.00 48000.00
0.00 1.00 0.00 30.00 54000.00
0.00 0.00 1.00 38.00 61000.00
0.00 1.00 0.00 40.00 63777.78
1.00 0.00 0.00 35.00 58000.00
0.00 0.00 1.00 38.78 52000.00
1.00 0.00 0.00 48.00 79000.00
0.00 1.00 0.00 50.00 83000.00
1.00 0.00 0.00 37.00 67000.00


## Encode dependent variable

In [16]:
from sklearn.preprocessing import LabelEncoder
# Turn the string labels in y into integer codes. The encoder is kept in `le`
# so the fitted mapping remains available after this cell.
le = LabelEncoder()
# y is rebound from the original label series to the encoded array.
y = le.fit_transform(y)

In [17]:
# Show the integer-encoded target.
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Split into training and testing sets

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [21]:
# Print the training features, one sample per line with two decimals.
for sample in X_train:
    print(*(f"{value:.2f}" for value in sample))

0.00 0.00 1.00 38.78 52000.00
0.00 1.00 0.00 40.00 63777.78
1.00 0.00 0.00 44.00 72000.00
0.00 0.00 1.00 38.00 61000.00
0.00 0.00 1.00 27.00 48000.00
1.00 0.00 0.00 48.00 79000.00
0.00 1.00 0.00 50.00 83000.00
1.00 0.00 0.00 35.00 58000.00


In [22]:
# Print the held-out test features, one sample per line with two decimals.
for sample in X_test:
    print(' '.join(map("{:.2f}".format, sample)))

0.00 1.00 0.00 30.00 54000.00
1.00 0.00 0.00 37.00 67000.00


In [24]:
# Show the encoded labels that belong to the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [25]:
# Show the encoded labels that belong to the test rows.
print(y_test)

[0 1]


# Feature Scale

In [26]:
from sklearn.preprocessing import StandardScaler

# Standardise only the columns from index 3 onward; columns 0-2 are left
# untouched. The scaler is fitted on the training rows alone and then reused
# for the test rows.
# NOTE: both arrays are overwritten in place, so re-running this cell would
# scale data that has already been scaled.
sc = StandardScaler()
scaled_cols = slice(3, None)
X_train[:, scaled_cols] = sc.fit_transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])

In [27]:
# Print the training features after scaling, two decimals per value.
for scaled_row in X_train:
    cells = [format(value, ".2f") for value in scaled_row]
    print(' '.join(cells))

0.00 0.00 1.00 -0.19 -1.08
0.00 1.00 0.00 -0.01 -0.07
1.00 0.00 0.00 0.57 0.63
0.00 0.00 1.00 -0.30 -0.31
0.00 0.00 1.00 -1.90 -1.42
1.00 0.00 0.00 1.15 1.23
0.00 1.00 0.00 1.44 1.57
1.00 0.00 0.00 -0.74 -0.56


In [28]:
# Print the test features after scaling, two decimals per value.
for scaled_row in X_test:
    print(*map("{:.2f}".format, scaled_row))

0.00 1.00 0.00 -1.47 -0.91
1.00 0.00 0.00 -0.45 0.21
