# Data Preprocessing

### Importing packages

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


## Importing dataset

In [60]:
# Load the raw data from Data.csv (path is relative to the current working
# directory) into a DataFrame.
dataset = pd.read_csv('Data.csv')

In [61]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### For the independent variables we take all the rows and all the columns except the last one, as an array

In [62]:
# Feature matrix: every row, and every column except the last one.
x = dataset.iloc[:, :-1].values

### For a 2D array: [rows, columns] -> ':' means all, ':-1' means all except the last one (columns 1 to n-1)

In [63]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### Let's take the column at index 3 (the last column, Purchased) in y as the dependent variable

In [64]:
# Target vector: the last column, mirroring the ':-1' used for x so the two
# selections stay complementary if columns are added or removed.
# A scalar position (not the slice -1:) keeps y one-dimensional, which is the
# shape LabelEncoder expects later on.
y = dataset.iloc[:, -1].values

In [65]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Handling the missing data

### Import library

In [66]:
from sklearn.impute import SimpleImputer

In [67]:
# Imputer that fills NaN entries with the mean of the column they belong to.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

In [68]:
# Learn the column means from columns 1 and 2 of x (Age and Salary); the
# slice 1:3 leaves out the string Country column at index 0.
# NOTE(review): this fit runs on all rows, before the train/test split, so
# the imputed means include the rows that later become the test set.
imputer = imputer.fit(x[:,1:3])

In [69]:
# Overwrite the two numeric columns of x in place with their imputed values
# (missing entries replaced by the fitted column means).
x[:,1:3] = imputer.transform(x[:,1:3])

In [70]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)


## Encoding categorical data

### Import packages

In [71]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [72]:
# Encoder that maps each distinct label to an integer code.
labelencoder_x = LabelEncoder()

### Encode the country names into integers

In [73]:
# Replace the country strings in column 0 with integer codes, in place
# (France -> 0, Germany -> 1, Spain -> 2 in the output shown for this run).
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# values (y). The OneHotEncoder applied to this column afterwards accepts
# string categories directly, so this intermediate step looks unnecessary;
# confirm before removing.
x[:,0] = labelencoder_x.fit_transform(x[:,0])

In [74]:
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

#### Now this could be a problem: a model may assume that label 0 is less than 1 or 2, but they are just labels

### Encode the country labels as dummy (one-hot) variables



In [75]:
# One-hot encode column 0 (country); every other column is passed through
# unchanged.
transformer = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [76]:
# Apply the one-hot encoding. In the result the three country indicator
# columns come first, followed by the passed-through Age and Salary columns.
# x is rebound to the transformed array, so re-running this cell would
# one-hot encode the new column 0 again; re-run from the cell that builds x.
x = transformer.fit_transform(x)

In [77]:
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### As there are only two values in y, Yes and No, we can use LabelEncoder

In [78]:
# Separate encoder instance for the target labels.
labelencoder_y = LabelEncoder() 

In [79]:
# Encode the target strings as integers; in the displayed outputs 'No'
# becomes 0 and 'Yes' becomes 1.
y = labelencoder_y.fit_transform(y)

In [80]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Split the dataset into training set and test set

### Import Libraries

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
# Hold out 20% of the rows (2 of the 10 here) as the test set; the fixed
# random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [83]:
x_train

array([[0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [84]:
y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [85]:
x_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0]], dtype=object)

In [86]:
# Test-set labels: the last of the four split results to inspect
# (x_train, y_train and x_test are shown in the cells above).
y_test

array([0, 0])

## Feature Scaling

In [87]:
from sklearn.preprocessing import StandardScaler

In [88]:
# Scaler that standardizes each feature column (zero mean, unit variance).
sc_x = StandardScaler()

In [89]:
# Fit the scaler on the training rows only, then scale them.
# NOTE(review): all five columns are scaled, including the 0/1 one-hot country
# indicators (they no longer read as 0/1 in the output); confirm this is
# intended rather than scaling only Age and Salary.
x_train = sc_x.fit_transform(x_train)

In [90]:
# Scale the test rows with the statistics learned from the training set
# (transform only, no refit), so no test-set information enters the scaler.
x_test = sc_x.transform(x_test)

In [91]:
x_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [92]:
x_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])