# Data Pre-Processing

Data preprocessing in Machine Learning is a crucial step that helps enhance the quality of data to promote the extraction of meaningful insights from the data. Data preprocessing in Machine Learning refers to the technique of preparing (cleaning and organizing) the raw data to make it suitable for a building and training Machine Learning models. In simple words, data preprocessing in Machine Learning is a data mining technique that transforms raw data into an understandable and readable format.

# Import all crucial libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Import the dataset

In [7]:
df=pd.read_csv("Data.csv")
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


"iloc" stands for "integer location" and is primarily used for accessing and retrieving data from pandas DataFrame objects using integer-based indexing

In [4]:
x=df.iloc[:,:-1].values
y=df.iloc[:,-1].values

In [5]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
print(y)

['No ' 'Yes' 'No ' 'No ' 'Yes' 'Yes' 'No ' 'Yes' 'No ' 'Yes']


# Identifying and Handling the Missing/NaN Values

In [8]:
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

# Solution 1: dropna

In [10]:
df1=df.copy()
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [11]:
#Summarize the shape of the raw data
print("Before removing the null values",df1.shape)

#drop rows with missing values
df1.dropna(inplace=True)

##Summarize the shape of the data with missing rows removed
print("After removing the null values",df1.shape)

Before removing the null values (10, 4)
After removing the null values (8, 4)


# Solution 2: fillna

In [12]:
df2=df.copy()
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [17]:
#Fill the missing values with mean column values

df2.fillna(df.mean(),inplace=True)
print(df2.isnull().sum())

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64


  df2.fillna(df.mean(),inplace=True)


In [18]:
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [16]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [20]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(x[:,1:3])
x[:,1:3]=imputer.transform(x[:,1:3])

In [21]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding the categorical data

# Solution 1: ColumnTransformer using Scikit-Learn

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')
x=np.array(ct.fit_transform(x))

In [24]:
df.head(5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [26]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


# Solution 2: pd.get_dummies() by using Pandas

In [28]:
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [29]:
pd.get_dummies(df2)

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44.0,72000.0,1,0,0,1,0
1,27.0,48000.0,0,0,1,0,1
2,30.0,54000.0,0,1,0,1,0
3,38.0,61000.0,0,0,1,1,0
4,40.0,63777.777778,0,1,0,0,1
5,35.0,58000.0,1,0,0,0,1
6,38.777778,52000.0,0,0,1,1,0
7,48.0,79000.0,1,0,0,0,1
8,50.0,83000.0,0,1,0,1,0
9,37.0,67000.0,1,0,0,0,1


# Solution 3: LabelEncoder

In [31]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
le.fit_transform(y)

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

# Splitting the Dataset

In [33]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=1)

In [34]:
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [35]:
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [36]:
print(y_test)

['No ' 'Yes']


In [37]:
print(y_train)

['No ' 'Yes' 'No ' 'No ' 'Yes' 'Yes' 'No ' 'Yes']


# Feature Scaling

# 1.MinMax Scaler

In [41]:
from sklearn.preprocessing import MinMaxScaler
mm=MinMaxScaler()
x_train[:,3:]=mm.fit_transform(x_train[:,3:])
x_test[:,3:]=mm.transform(x_test[:,3:])

In [42]:
print(x_train)

[[0.0 0.0 1.0 0.5120772946859904 0.11428571428571432]
 [0.0 1.0 0.0 0.5652173913043479 0.45079365079365075]
 [1.0 0.0 0.0 0.7391304347826089 0.6857142857142855]
 [0.0 0.0 1.0 0.4782608695652175 0.37142857142857144]
 [0.0 0.0 1.0 0.0 0.0]
 [1.0 0.0 0.0 0.9130434782608696 0.8857142857142857]
 [0.0 1.0 0.0 1.0 1.0]
 [1.0 0.0 0.0 0.34782608695652173 0.2857142857142856]]


In [44]:
print(x_test)

[[0.0 1.0 0.0 0.0 0.0]
 [1.0 0.0 0.0 1.0 1.0]]


# 2.Standard Scaler

In [45]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_train[:,3:]=sc.fit_transform(x_train[:,3:])
x_test[:,3:]=sc.fit_transform(x_test[:,3:])

In [46]:
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578537 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057581 -0.07013167641635436]
 [1.0 0.0 0.0 0.5667085065333245 0.6335624327104541]
 [0.0 0.0 1.0 -0.3045301939022482 -0.3078661727429788]
 [0.0 0.0 1.0 -1.9018011447007983 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237058 1.2326533634535486]
 [0.0 1.0 0.0 1.4379472069688963 1.5749910381638883]
 [1.0 0.0 0.0 -0.740149544120035 -0.5646194287757338]]


In [47]:
print(x_test)

[[0.0 1.0 0.0 -1.0 -1.0]
 [1.0 0.0 0.0 1.0 1.0]]
