# Data Preprocessing Tools

## Importing the libraries

In [61]:
%config Completer.use_jedi = False
import numpy as np # allows to work with arrays
import matplotlib.pyplot as plt # plotting charts
import pandas as pd # import dataset and create feature matrix

## Importing the dataset

In [62]:
# Load the raw table from disk.
dataset = pd.read_csv("Data.csv")

# The last column is the dependent variable; everything before it is a feature.
last_col = dataset.shape[1] - 1
X = dataset.iloc[:, :last_col].values  # all rows, every column except the last
Y = dataset.iloc[:, last_col].values   # all rows, last column only

In [63]:
# Show the raw feature matrix as loaded (missing entries still appear as nan).
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [64]:
# Show the raw dependent-variable vector ('Yes' / 'No' labels).
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data
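* For a large dataset you can simply drop the records that contain missing values
* Otherwise, you can replace each missing value with the average of its entire column
* Here one Germany row has a `nan` salary and one Spain row has a `nan` age, so we replace both with the column mean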

In [65]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 (age, salary) are numeric; column 0 holds the country strings
# and cannot be averaged, so it is excluded from the slice.
# NOTE: the means are learned from every row, including rows that later land in the test split.
my_impute = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns each column's mean and fills the nan entries in a single call.
X[:, 1:3] = my_impute.fit_transform(X[:, 1:3])

In [66]:
# Confirm that the nan entries in age and salary were replaced by the column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

* If we encode the country names as plain numbers, the model would think there is a numerical order between the countries and treat that order as meaningful. We need to avoid this issue.
* One solution is one-hot encoding: represent each country with a binary vector, e.g. 100, 010, 001

In [67]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (country) and pass the remaining columns through unchanged.
# sparse_threshold=0 forces a dense result: by default ColumnTransformer returns a
# SciPy sparse matrix when the stacked output is less than 30% non-zero (e.g. with
# many categories), and np.array() on a sparse matrix yields a 0-d object array
# instead of a 2-D table.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# ColumnTransformer fits and transforms in one step; wrap the result in a NumPy array.
X = np.array(ct.fit_transform(X))

In [68]:
# Inspect X after one-hot encoding: three indicator columns now precede age and salary.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [69]:
from sklearn.preprocessing import LabelEncoder

# Map the class labels in Y to integer codes.
# The encoder already returns a NumPy array, so no extra np.array(...) wrapper is needed.
le = LabelEncoder()
le.fit(Y)
Y = le.transform(Y)

In [70]:
# Inspect the encoded labels ('No' -> 0, 'Yes' -> 1).
print(Y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set
* We have to do feature scaling AFTER splitting the dataset into training set and test set. That's because feature scaling computes the mean and standard deviation of each feature in order to perform the scaling; if we performed it before splitting, those statistics would include the values in the test set, leaking information the model is not supposed to see during training.

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, so the training set is the larger part.
# The fixed random_state makes the shuffle reproducible.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=1)
# The four pieces come back in this order: train/test features, then train/test labels.
X_train, X_test, Y_train, Y_test = split_parts

In [72]:
# Training features (8 of the 10 rows).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [73]:
# Test features (the 2 held-out rows).
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [74]:
# Labels for the training rows.
print(Y_train)

[0 1 0 0 1 1 0 1]


In [75]:
# Labels for the held-out test rows.
print(Y_test)

[0 1]


## Feature Scaling
It's a technique used to prevent some features from being dominated by other features during training (needed for some machine learning models, not all of them)

* See the slide for standardization and normalization.
* Normalization gives values between 0 and 1, while standardization gives values mostly between -3 and +3
* Normalization is better in some cases, but standardization works almost always
* Normalization: $$x_{norm} = \frac{x - \min(x)}{\max(x) - \min(x)}$$
* Standardization: $$x_{stand} = \frac{x - \text{mean}(x)}{\text{standard deviation}(x)}$$
* Video number 24 is very informative about feature scaling and splitting

In [79]:
from sklearn.preprocessing import StandardScaler

# NOTE: X_train and X_test are overwritten in place, so run this cell once per split;
# re-running it without re-running the split rescales already-scaled data.
sk = StandardScaler()
# Learn mean and standard deviation from the training rows only.
# Columns 3 onward are age and salary; the one-hot columns 0-2 are left untouched.
sk.fit(X_train[:, 3:])
X_train[:, 3:] = sk.transform(X_train[:, 3:])
# Important: reuse the training statistics for the test rows; do not fit on the test set.
X_test[:, 3:] = sk.transform(X_test[:, 3:])

In [80]:
# Inspect the training features after scaling columns 3 onward.
print(X_train)

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [81]:
# Inspect the test features after applying the scaler to columns 3 onward.
print(X_test)

[[0.0 1.0 0.0 -1.0000000000000002 -1.0000000000000002]
 [1.0 0.0 0.0 0.9999999999999999 0.9999999999999997]]
