# Data preprocessing 

Data preprocessing is a process of preparing the raw data and making it suitable for a machine learning model. It is the first and crucial step while creating a machine learning model

***Steps in data preprocessing***

Step 1 :- Importing the libraries

In [1]:
from tkinter import Label
import numpy as np
import matplotlib.pyplot as mpt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler 

step 2 :- importing the dataset


In [2]:
data_set = pd.read_csv('Data.csv')


In [3]:
# extracting independent and dependent variables

x=data_set.iloc[:,:-1].values
y=data_set.iloc[:,3].values

In [4]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [5]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


step 3 :- Handling the missing data (Replacing missing data with mean value)

In [6]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fitting imputer object to the independent variable x.
imputer = imputer.fit(x[:, 1:3])
# Replacing missing data with the calculated mean value
x[:, 1:3] = imputer.transform(x[:, 1:3])
# print(x)

step 4 :- Encoding categorical data

In [7]:
# Since machine learning model completely works on mathematics and numbers, 
# but if our dataset would have a categorical variable, then it may create 
# trouble while building the model. So it is necessary to encode these categorical 
# variables into numbers.

# For country variable

label_encoder_x=LabelEncoder()
x[:,0]=label_encoder_x.fit_transform(x[:,0])
print(x)

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 63777.77777777778]
 [0 35.0 58000.0]
 [2 38.77777777777778 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


In [8]:
# Dummy encoding using oneHotEncoder
ct=ColumnTransformer([('Country',OneHotEncoder(),[0])],remainder='passthrough')
x=np.array(ct.fit_transform(x),dtype=np.float)
print(x)


[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x=np.array(ct.fit_transform(x),dtype=np.float)


In [9]:
# for purchased variable
labelencoder_y=LabelEncoder()
y=labelencoder_y.fit_transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


step 5 :- splitting the data set into training and test data set

In [10]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)


step 6 :- feature Scaling

In [11]:
st_x=StandardScaler()
x_train=st_x.fit_transform(x_train)

x_test=st_x.fit_transform(x_test)

print(x_train)
print(x_test)

[[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]
[[ 0.  0.  0. -1. -1.]
 [ 0.  0.  0.  1.  1.]]
