In [1]:
# Dataset used throughout this notebook: Data.csv

**Step 1: Importing the libraries**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Step 2: Importing the dataset**

In [3]:
# Read the raw CSV into a DataFrame; the bare name on the last line renders it.
DATA_PATH = 'Data.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(10, 4)

**Step 3: Handling the missing data**

In [5]:
# Number of missing entries in each column.
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [6]:
# Mean and median of Age, shown side by side to pick an imputation value.
(df['Age'].mean(), df['Age'].median())

(38.77777777777778, 38.0)

In [7]:
# Mean and median of Salary, shown side by side to pick an imputation value.
(df['Salary'].mean(), df['Salary'].median())

(63777.77777777778, 61000.0)

In [8]:
# Impute missing ages with the column median. The value is computed from the
# data (median() skips NaN by default) instead of being hand-copied from an
# earlier cell's output, so it stays correct if Data.csv changes.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

In [9]:
# Impute missing salaries with the column median. The value is computed from
# the data (median() skips NaN by default) instead of being hand-copied from
# an earlier cell's output, so it stays correct if Data.csv changes.
salary_median = df['Salary'].median()
df['Salary'] = df['Salary'].fillna(salary_median)

In [10]:
# Display the frame after the Age and Salary gaps have been filled.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Age,Salary
count,10.0,10.0
mean,38.7,63500.0
std,7.257946,11597.413505
min,27.0,48000.0
25%,35.5,55000.0
50%,38.0,61000.0
75%,43.0,70750.0
max,50.0,83000.0


In [13]:
# Outlier bounds for Age using the 1.5 * IQR rule. This cell only computes and
# displays the bounds; no rows are filtered here.
q1 = df['Age'].quantile(0.25)
q3 = df['Age'].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
(lower, upper)

(24.25, 54.25)

In [12]:
# Outlier bounds for Salary using the 1.5 * IQR rule. This cell only computes
# and displays the bounds; no rows are filtered here.
q1 = df['Salary'].quantile(0.25)
q3 = df['Salary'].quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
(lower, upper)

(31375.0, 94375.0)

In [14]:
# drop_duplicates() returns a new frame and leaves the original untouched, so
# the result must be assigned back; otherwise the de-duplication is discarded
# and every later cell keeps working on the frame that still has duplicates.
df = df.drop_duplicates()
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [15]:
# (rows, columns) of the frame at this point, to compare against the shape at load time.
df.shape

(10, 4)

**Step 4: Encoding categorical data**

In [16]:
# Encode the binary target as integers (Yes -> 1, No -> 0).
# Series.map() turns every label that is missing from the dict into NaN, so the
# result is validated before the column is overwritten. This also stops an
# accidental second run of this cell from silently replacing an
# already-encoded column with NaN.
purchased_codes = {'Yes': 1, 'No': 0}
purchased_encoded = df['Purchased'].map(purchased_codes)
if purchased_encoded.isna().any():
    raise ValueError("Purchased contains values other than 'Yes'/'No'; refusing to encode")
df['Purchased'] = purchased_encoded

In [17]:
# Display the frame after Purchased has been mapped to 0/1.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,61000.0,1
5,France,35.0,58000.0,1
6,Spain,38.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating dummy variables**

In [24]:
# One-hot encode the remaining categorical column(s).
# dtype=int pins the indicator columns to 0/1 integers; on pandas >= 2.0 the
# default is boolean (True/False), which would not match the 0/1 encoding
# used for the rest of the frame.
df = pd.get_dummies(data=df, dtype=int)
df

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,61000.0,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.0,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the dataset into a training set and a test set**

In [25]:
# Column labels of the encoded frame, used to pick the feature columns for the split.
df.columns

Index(['Age', 'Salary', 'Purchased', 'Country_France', 'Country_Germany',
       'Country_Spain'],
      dtype='object')

In [26]:
from sklearn.model_selection import train_test_split

# Feature matrix: the numeric columns plus the one-hot country indicators.
feature_columns = ['Age', 'Salary', 'Country_France', 'Country_Germany', 'Country_Spain']
X = df[feature_columns].values
# Target vector: the 0/1 encoded purchase flag.
y = df['Purchased'].values

# Hold out 30% of the rows for testing. random_state fixes the shuffle so the
# split (and everything computed from it) is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

**Step 7: Feature Scaling**

In [27]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both splits so no test-set statistics
# leak into the scaling.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)