In [None]:
#Data.csv

**Step 1: Importing the libraries**

In [None]:
import pandas as pd
import numpy as np
# The plotting interface conventionally aliased as `plt` lives in the
# `matplotlib.pyplot` submodule; aliasing the top-level package would not
# expose functions such as plt.plot / plt.subplots.
import matplotlib.pyplot as plt


In [None]:
#import category_encoders as ce
from sklearn.model_selection import train_test_split

**Step 2: Importing the dataset**

In [None]:
# Read the raw CSV into a DataFrame and show it as the cell output.
data = pd.read_csv(filepath_or_buffer="Data.csv")
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 3: Handling the missing data**

In [None]:
# Count the missing entries in each column.
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [None]:
# Fill each missing Age with the mean Age of rows from the same Country.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on data['Age']: that form mutates an intermediate
# Series, which newer pandas versions warn about and which leaves `data`
# unchanged when Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data.groupby('Country')['Age'].transform('mean'))

In [None]:
# Fill each missing Salary with the mean Salary of rows from the same Country.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on data['Salary']: that form mutates an intermediate
# Series, which newer pandas versions warn about and which leaves `data`
# unchanged when Copy-on-Write is enabled.
data['Salary'] = data['Salary'].fillna(data.groupby('Country')['Salary'].transform('mean'))

In [None]:
# Display the frame again to check the Age and Salary columns after the fills.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,68500.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,32.5,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [None]:
# Work on an independent copy holding only the object-dtype (text) columns.
purchased_new = data.select_dtypes(include='object').copy()

In [None]:
# Tally how often each label occurs in the Purchased column.
purchased_new.loc[:, "Purchased"].value_counts()

No     5
Yes    5
Name: Purchased, dtype: int64

In [None]:
# Label-to-integer mapping for the binary target.
d1 = dict(Yes=1, No=0)
# replace() is applied to the whole frame, so any cell equal to 'Yes' or 'No'
# in any column is rewritten.
# NOTE(review): the later cells visible in this notebook take the target from
# df['Purchased'], not from purchased_new -- confirm whether this encoded
# frame is meant to be used downstream.
purchased_new = purchased_new.replace(to_replace=d1)
purchased_new.head()

Unnamed: 0,Country,Purchased
0,France,0
1,Spain,1
2,Germany,0
3,Spain,0
4,Germany,1


**Step 5: Creating dummy variables**

In [None]:
# Collect the column labels of `data` into a plain list and print them.
b = list(data.keys())
print(b)

['Country', 'Age', 'Salary', 'Purchased']


In [None]:
# One-hot encode Country into Country_<value> indicator columns.
# dtype=int pins the indicators to 0/1 integers; without it, current pandas
# versions emit boolean columns, which would no longer match the 0/1 values
# this notebook displays and would make the extracted feature array non-numeric.
df = pd.get_dummies(data, columns=['Country'], dtype=int)

In [None]:
# Collect the column labels of the encoded frame into a plain list and print them.
b = list(df.keys())
print(b)

['Age', 'Salary', 'Purchased', 'Country_France', 'Country_Germany', 'Country_Spain']


In [None]:
# Display the frame produced by pd.get_dummies.
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,No,1,0,0
1,27.0,48000.0,Yes,0,0,1
2,30.0,54000.0,No,0,1,0
3,38.0,61000.0,No,0,0,1
4,40.0,68500.0,Yes,0,1,0
5,35.0,58000.0,Yes,1,0,0
6,32.5,52000.0,No,0,0,1
7,48.0,79000.0,Yes,1,0,0
8,50.0,83000.0,No,0,1,0
9,37.0,67000.0,Yes,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [None]:
# Feature column names: every column of df except the target.
# The list is rebuilt from df.columns rather than mutated with b.remove(...),
# so re-running this cell gives the same result instead of raising ValueError
# once 'Purchased' has already been removed.
b = [col for col in df.columns if col != 'Purchased']

In [None]:
# Display the remaining feature column names.
b

['Age', 'Salary', 'Country_France', 'Country_Germany', 'Country_Spain']

In [None]:
# X: array of the feature columns listed in b.
# y: array of the Purchased column (the target).
X, y = df.loc[:, b].values, df.loc[:, 'Purchased'].values

In [None]:
# Hold out 5% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=0,
)

In [None]:
# Display the training feature array returned by train_test_split.
X_train

array([[5.00e+01, 8.30e+04, 0.00e+00, 1.00e+00, 0.00e+00],
       [4.00e+01, 6.85e+04, 0.00e+00, 1.00e+00, 0.00e+00],
       [3.70e+01, 6.70e+04, 1.00e+00, 0.00e+00, 0.00e+00],
       [2.70e+01, 4.80e+04, 0.00e+00, 0.00e+00, 1.00e+00],
       [3.25e+01, 5.20e+04, 0.00e+00, 0.00e+00, 1.00e+00],
       [4.80e+01, 7.90e+04, 1.00e+00, 0.00e+00, 0.00e+00],
       [3.80e+01, 6.10e+04, 0.00e+00, 0.00e+00, 1.00e+00],
       [4.40e+01, 7.20e+04, 1.00e+00, 0.00e+00, 0.00e+00],
       [3.50e+01, 5.80e+04, 1.00e+00, 0.00e+00, 0.00e+00]])

In [None]:
# Display the test feature array returned by train_test_split.
X_test

array([[3.0e+01, 5.4e+04, 0.0e+00, 1.0e+00, 0.0e+00]])

In [None]:
# Display the training target array returned by train_test_split.
y_train

array(['No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes'],
      dtype=object)

In [None]:
# Display the test target array returned by train_test_split.
y_test

array(['No'], dtype=object)

**Step 7: Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler  # standard scaling

# Fit on the training split only, so the per-column mean and standard
# deviation come from the training data. y_train is passed through exactly as
# before; the scaling statistics are derived from X_train.
scaler = StandardScaler().fit(X_train, y_train)

# Apply the statistics learned above to both splits. The test split is only
# transformed, never fitted.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Print the scaled training features.
print(X_train_scaled)

[[ 1.57383589  1.58802011 -0.89442719  1.87082869 -0.70710678]
 [ 0.13581325  0.28053352 -0.89442719  1.87082869 -0.70710678]
 [-0.29559354  0.14527629  1.11803399 -0.53452248 -0.70710678]
 [-1.73361618 -1.567982   -0.89442719 -0.53452248  1.41421356]
 [-0.94270373 -1.20729605 -0.89442719 -0.53452248  1.41421356]
 [ 1.28623136  1.22733415  1.11803399 -0.53452248 -0.70710678]
 [-0.15179128 -0.39575265 -0.89442719 -0.53452248  1.41421356]
 [ 0.71102231  0.59613373  1.11803399 -0.53452248 -0.70710678]
 [-0.58319807 -0.66626711  1.11803399 -0.53452248 -0.70710678]]


In [None]:
# Print the scaled test features (transformed with the scaler fitted on X_train).
print(X_test_scaled)

[[-1.30220939 -1.02695307 -0.89442719  1.87082869 -0.70710678]]
