# Data pre-processing step-wise process

## Step-2:Import the required libraries

In [1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Step-3:Import the dataset

In [2]:
#load the raw CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer="Data.csv")
#report the (rows, columns) shape of the loaded data
dataset.shape

(10, 4)

In [3]:
#display the loaded dataset (a bare last expression is rendered by the notebook)
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
#summary statistics (count, mean, std, quartiles) of the numeric columns
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


## Extracting the independent and dependent variables from the dataset using the `iloc[]` indexer

In [5]:
#independent variables (features): every column except the last one
n_feature_columns = dataset.shape[1] - 1
x = dataset.iloc[:, :n_feature_columns]
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [6]:
#for dependent variable: take the last column (the target) instead of a
#hard-coded position, so the selection still works if feature columns are added
y=dataset.iloc[:,-1]
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

## Step-4:Dealing with missing values

Finding the missing values

In [7]:
#boolean mask: True wherever a cell is missing
dataset.isna()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


Number of missing values in each feature

In [8]:
#count the missing cells in each column
missing_mask = dataset.isna()
missing_mask.sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

## Handling the missing values in the dataset (mean imputation)

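First, on the independent variables (the numeric columns `Age` and `Salary`): each missing value is replaced by the mean of its column.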

In [9]:
#import the SimpleImputer class from scikit-learn library
from sklearn.impute import SimpleImputer

#create an independent copy of the dataframe: pd.DataFrame(dataset) does not
#copy the data, so imputing through it could silently overwrite the raw `dataset`
dataset1=dataset.copy()


#select the columns (only those columns which have numerical data)
columns_to_impute=['Age','Salary']

#apply the SimpleImputer class with strategy as mean
#(each missing value is replaced by the mean of its column)
imputer=SimpleImputer(strategy='mean')

#fit and transform the imputer on selected columns
dataset1[columns_to_impute]=imputer.fit_transform(dataset1[columns_to_impute])


#print the dataset1
print(dataset1)

   Country        Age        Salary Purchased
0   France  44.000000  72000.000000        No
1    Spain  27.000000  48000.000000       Yes
2  Germany  30.000000  54000.000000        No
3    Spain  38.000000  61000.000000        No
4  Germany  40.000000  63777.777778       Yes
5   France  35.000000  58000.000000       Yes
6    Spain  38.777778  52000.000000        No
7   France  48.000000  79000.000000       Yes
8  Germany  50.000000  83000.000000        No
9   France  37.000000  67000.000000       Yes


Now all the missing values have been filled in.

In [10]:
#confirm that no column of the imputed frame still has missing cells
remaining_missing = dataset1.isna()
remaining_missing.sum()

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64

## Step-5:Encoding Categorical data

First, encode the categorical independent variable (`Country`).

In [11]:
#import the LabelEncoder class from scikit-learn library(to convert categorical feature into numerical feature)
#NOTE: scikit-learn documents LabelEncoder as a target (y) encoder; it is used
#here only as an intermediate step before one-hot encoding the feature
from sklearn.preprocessing import LabelEncoder

#create a new dataframe as an independent copy: pd.DataFrame(dataset1) does not
#copy the data, so encoding through it could overwrite the 'Country' text in dataset1
dataset2=dataset1.copy()

#create a LabelEncoder instance
label_encoder=LabelEncoder()

#Apply the labelencoder to the categorical column
#(each distinct country is replaced by an integer code)
dataset2['Country']=label_encoder.fit_transform(dataset2['Country'])

#print the new dataframe(dataset2)
print(dataset2)

   Country        Age        Salary Purchased
0        0  44.000000  72000.000000        No
1        2  27.000000  48000.000000       Yes
2        1  30.000000  54000.000000        No
3        2  38.000000  61000.000000        No
4        1  40.000000  63777.777778       Yes
5        0  35.000000  58000.000000       Yes
6        2  38.777778  52000.000000        No
7        0  48.000000  79000.000000       Yes
8        1  50.000000  83000.000000        No
9        0  37.000000  67000.000000       Yes


## Making the Dummy Variables in the form of 0 or 1

We use dummy variables because the label-encoded integers (0, 1, 2) suggest an order or correlation between the countries that does not actually exist, which could lead a model to produce wrong output.

In [12]:
#For Dummy variable import onehotencoder class from scikit-learn library
from sklearn.preprocessing import OneHotEncoder

#create an instance for onehotencoder
one_hot_encoder=OneHotEncoder()

#Encode the categorical column (the result is converted to a dense array)
encoded_data=one_hot_encoder.fit_transform(dataset2[['Country']]).toarray()

#get the names of the one-hot encoded features
encoded_feature_names = one_hot_encoder.get_feature_names_out(['Country'])

#create dataset3, which contains only the dummy variables; reuse dataset2's
#index so the rows line up when the two frames are concatenated
dataset3=pd.DataFrame(encoded_data,columns=encoded_feature_names,index=dataset2.index)

#drop the original 'Country' feature without mutating dataset2, so this cell
#can be re-run (an in-place drop makes a second run fail with a KeyError)
dataset2_without_country=dataset2.drop(columns=['Country'])

#concat the remaining features with the dummy variables
dataset4 = pd.concat([dataset2_without_country,dataset3], axis=1)

#now show the result
dataset4


Unnamed: 0,Age,Salary,Purchased,Country_0,Country_1,Country_2
0,44.0,72000.0,No,1.0,0.0,0.0
1,27.0,48000.0,Yes,0.0,0.0,1.0
2,30.0,54000.0,No,0.0,1.0,0.0
3,38.0,61000.0,No,0.0,0.0,1.0
4,40.0,63777.777778,Yes,0.0,1.0,0.0
5,35.0,58000.0,Yes,1.0,0.0,0.0
6,38.777778,52000.0,No,0.0,0.0,1.0
7,48.0,79000.0,Yes,1.0,0.0,0.0
8,50.0,83000.0,No,0.0,1.0,0.0
9,37.0,67000.0,Yes,1.0,0.0,0.0


## Encode the dependent categorical feature(target variable)

In [13]:
#import the LabelEncoder class from scikit-learn library
from sklearn.preprocessing import LabelEncoder

#create the new dataframe(dataset5) as an independent copy: pd.DataFrame(dataset4)
#does not copy the data, so encoding through it could overwrite dataset4
dataset5=dataset4.copy()

#create the instance of label encoder
label_encoder_y=LabelEncoder()

#Apply the label encoder on dependent variable(Purchased)
dataset5['Purchased']=label_encoder_y.fit_transform(dataset5['Purchased'])

#show the dataset5
dataset5

Unnamed: 0,Age,Salary,Purchased,Country_0,Country_1,Country_2
0,44.0,72000.0,0,1.0,0.0,0.0
1,27.0,48000.0,1,0.0,0.0,1.0
2,30.0,54000.0,0,0.0,1.0,0.0
3,38.0,61000.0,0,0.0,0.0,1.0
4,40.0,63777.777778,1,0.0,1.0,0.0
5,35.0,58000.0,1,1.0,0.0,0.0
6,38.777778,52000.0,0,0.0,0.0,1.0
7,48.0,79000.0,1,1.0,0.0,0.0
8,50.0,83000.0,0,0.0,1.0,0.0
9,37.0,67000.0,1,1.0,0.0,0.0


## Step-6:Splitting the dataset into the Training set and Test set

In [14]:
#features: every column except the target, selected by name so the cell keeps
#working if the number of dummy columns changes
x=dataset5.drop(columns=['Purchased'])
x

Unnamed: 0,Age,Salary,Country_0,Country_1,Country_2
0,44.0,72000.0,1.0,0.0,0.0
1,27.0,48000.0,0.0,0.0,1.0
2,30.0,54000.0,0.0,1.0,0.0
3,38.0,61000.0,0.0,0.0,1.0
4,40.0,63777.777778,0.0,1.0,0.0
5,35.0,58000.0,1.0,0.0,0.0
6,38.777778,52000.0,0.0,0.0,1.0
7,48.0,79000.0,1.0,0.0,0.0
8,50.0,83000.0,0.0,1.0,0.0
9,37.0,67000.0,1.0,0.0,0.0


In [15]:
#target: select the encoded 'Purchased' column by name rather than by a
#hard-coded position
y=dataset5['Purchased']
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased, dtype: int32

In [16]:
#import the train_test_split function from the scikit-learn library
from sklearn.model_selection import train_test_split

#hold out 20% of the rows for testing (80:20 split); the fixed random_state
#makes the shuffle reproducible
split_options = dict(test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [17]:
#features of the training split
x_train

Unnamed: 0,Age,Salary,Country_0,Country_1,Country_2
4,40.0,63777.777778,0.0,1.0,0.0
9,37.0,67000.0,1.0,0.0,0.0
1,27.0,48000.0,0.0,0.0,1.0
6,38.777778,52000.0,0.0,0.0,1.0
7,48.0,79000.0,1.0,0.0,0.0
3,38.0,61000.0,0.0,0.0,1.0
0,44.0,72000.0,1.0,0.0,0.0
5,35.0,58000.0,1.0,0.0,0.0


In [18]:
#target values of the training split
y_train

4    1
9    1
1    1
6    0
7    1
3    0
0    0
5    1
Name: Purchased, dtype: int32

In [19]:
#features of the test split
x_test

Unnamed: 0,Age,Salary,Country_0,Country_1,Country_2
2,30.0,54000.0,0.0,1.0,0.0
8,50.0,83000.0,0.0,1.0,0.0


In [20]:
#target values of the test split
y_test

2    0
8    0
Name: Purchased, dtype: int32

## Step-7:Feature Scaling (Final step of data-preprocessing)

It is a technique for putting the independent variables of the dataset on a common scale (here, standardization to zero mean and unit variance).

In [21]:
#for standardization import standardscaler class from scikit-learn library
from sklearn.preprocessing import StandardScaler

#create a new dataframe(dataset6) as an independent copy: pd.DataFrame(dataset5)
#does not copy the data, so scaling through it could overwrite dataset5
dataset6=dataset5.copy()

#columns for scaling (the dummy columns and the target are left untouched)
columns_to_scale = ['Age', 'Salary']

#create the instance of standard scaler
scaler=StandardScaler()

#Apply the standard scaler on columns of the whole dataset (for illustration)
dataset6[columns_to_scale] = scaler.fit_transform(dataset6[columns_to_scale])

#For modelling, the scaler must learn its mean/std from the training rows only:
#fitting on all rows leaks test-set information into the features. Fit a second
#scaler on x_train and reuse it, unchanged, to transform x_test.
train_scaler=StandardScaler()
x_train_scaled=x_train.copy()
x_train_scaled[columns_to_scale]=train_scaler.fit_transform(x_train[columns_to_scale])
x_test_scaled=x_test.copy()
x_test_scaled[columns_to_scale]=train_scaler.transform(x_test[columns_to_scale])

#show the scaled dataset
dataset6


Unnamed: 0,Age,Salary,Purchased,Country_0,Country_1,Country_2
0,0.758874,0.7494733,0,1.0,0.0,0.0
1,-1.711504,-1.438178,1,0.0,0.0,1.0
2,-1.275555,-0.8912655,0,0.0,1.0,0.0
3,-0.113024,-0.2532004,0,0.0,0.0,1.0
4,0.177609,6.632192e-16,1,0.0,1.0,0.0
5,-0.548973,-0.5266569,1,1.0,0.0,0.0
6,0.0,-1.07357,0,0.0,0.0,1.0
7,1.34014,1.387538,1,1.0,0.0,0.0
8,1.630773,1.752147,0,0.0,1.0,0.0
9,-0.25834,0.2937125,1,1.0,0.0,0.0


## **Now we have completed the major steps for data preprocessing in machine learning**