# Data Preprocessing

## Import the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load the data

In [10]:
data = pd.read_csv('/content/Data.csv')
x = data.iloc[: , : -1].values
y = data.iloc[: , -1].values



## Taking Care of missing data

In [12]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan ,strategy = 'mean')
imputer.fit(x[: , 1:3])
x[: , 1:3] = imputer.transform(x[: , 1:3])
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding Categorical Data

### Encoding the Independent variable





In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder' , OneHotEncoder(), [0])] , remainder ='passthrough')
x = np.array(ct.fit_transform(x))      
x                

array([[0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

### Encoding the Dependent Variable

In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y) #doesnt have to be numpy array as it is a dependent variable
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into Training and Testing set

In [19]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size =0.2 , random_state =1)


array([[1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [20]:
x_train

array([[1.0, 0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [21]:
x_test

array([[1.0, 0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [22]:
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [23]:
y_test

array([0, 1])

## Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[: , 3:] = sc.fit_transform(x_train[: , 3:])
x_test[: , 3:] = sc.transform(x_test[: , 3:])     #apply the same scaler used on the training set thats why only use transform method

In [28]:
x_train

array([[1.0, 0.0, 0.0, 1.2909944487358058, -0.1915918438457856,
        -1.0781259408412427],
       [1.0, 0.0, 1.0, -0.7745966692414835, -0.014117293757057902,
        -0.07013167641635401],
       [0.0, 1.0, 0.0, -0.7745966692414835, 0.5667085065333239,
        0.6335624327104546],
       [1.0, 0.0, 0.0, 1.2909944487358058, -0.3045301939022488,
        -0.30786617274297895],
       [1.0, 0.0, 0.0, 1.2909944487358058, -1.901801144700799,
        -1.4204636155515822],
       [0.0, 1.0, 0.0, -0.7745966692414835, 1.1475343068237056,
        1.2326533634535488],
       [1.0, 0.0, 1.0, -0.7745966692414835, 1.4379472069688966,
        1.5749910381638883],
       [0.0, 1.0, 0.0, -0.7745966692414835, -0.7401495441200352,
        -0.5646194287757336]], dtype=object)

In [29]:
x_test

array([[1.0, 0.0, 1.0, -0.7745966692414835, -1.4661817944830127,
        -0.9069571034860731],
       [0.0, 1.0, 0.0, -0.7745966692414835, -0.44973664397484425,
        0.20564033932253029]], dtype=object)