# DATA PREPROCESSING

## Importing The Libraries & Dataset

In [2]:
# IMPORTING THE LIBRARIES

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns

In [5]:
# IMPORTING THE DATASET

data = pd.read_csv('Data.csv')  # GETTING OUR DATASET

X = data.iloc[:,0:3].values    # GETTING THE VALUES OF INDEPENDENT VARIABLES
y = data.iloc[:,3].values      # GETTING THE VALUES OF DEPENDENT VARIABLES

In [6]:
# GETTING OUR DEPENDENT VARIABLES

X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
# GETTING OUR INDEPENDENT VARIABLE

y 

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Handling The Missing Values In Dataset

In [10]:
# IMPORTING THE CLASS FOR FILLING THE MISSING VALUES BY MEAN IN DATASET

from sklearn.impute import SimpleImputer
impute = SimpleImputer(missing_values = np.nan, strategy = 'mean')

In [14]:
# IMPUTING THE MEAN VALUE IN AGE AND SALARY 

impute.fit(X[:,1:3])
X[:,1:3] = impute.transform(X[:,1:3])

In [15]:
# CHECKING THE X AGAIN

X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Handling The Categorical Columns In Dataset

In [23]:
# IMPORTING THE CLASSES FOR HANDLING THE CATEGORICAL COLUMNS IN DATASET

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(X)

In [24]:
# CHECKING THE X AGAIN

X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [26]:
# LABEL ENCODING THE DEPENDENT VARIABLE

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y = le.fit_transform(y)

In [27]:
# CHECKING THE y AGAIN

y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Dividing Data Into Train Test Split

In [29]:
# IMPORTING THE train_test_split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [30]:
# CHECKING X_train

X_train

array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object)

In [31]:
# CHECKING X_test

X_test

array([[0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [32]:
# CHECKING y_train

y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [33]:
# CHECKING y_test

y_test

array([0, 1])

## Feature Scaling

In [35]:
# IMPORTING THE STANDARD SCALER

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [36]:
# APPLYING THE STANDARD SCALER

X_train[:,3:] = scaler.fit_transform(X_train[:,3:])
X_test[:,3:] = scaler.fit_transform(X_test[:,3:])

In [37]:
# CHECKING X_train AFTER SCALING

X_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)

In [38]:
# CHECKING X_test AFTER SCALING

X_test

array([[0.0, 1.0, 0.0, -1.0, -1.0],
       [1.0, 0.0, 0.0, 1.0, 1.0]], dtype=object)