# Data Preprocessing Tools

## Importing the libraries

In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [82]:
data = pd.read_csv("../Data.csv")
x = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [83]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [84]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

## Taking care of missing data

In [85]:
#Preenchendo o salário vazio com um valor médio

from sklearn.impute import SimpleImputer

#dizendo para o imputer quais são os valores nulos e a estrategia adotada

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

#para conectar o imputer ao dataset (so pode conectar valores numericos)
#Serve para varias colunas

imputer.fit(x[:,1:3])

#metodo transform faz a substituição dos valores sumidos pelas medias
#Ele retorna um resultado

x[:,1:3] = imputer.transform(x[:,1:3])

In [86]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding categorical data

### Encoding the Independent Variable

In [87]:
#OneHotEncoding transformara os países em categorias
# Se usarmos 0, 1, 2 para os países o modelo pode achar que há alguma relação entre os modelos
# Transformará os países em colunas

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])], remainder='passthrough')

#quando tem o metodo fit_transform() faz os dois ao mesmo tempo
#transformaremos o ct.fit_transform em um np.array pois isso facilita para modelos de machine learning

x = np.array(ct.fit_transform(x))

In [88]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [89]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)


In [90]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into the Training set and Test set

In [91]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 1)

## Feature Scaling

In [92]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()


In [93]:
x_train[:,3:] = sc.fit_transform(x_train[:,3:])


In [94]:
x_train

array([[0.0, 0.0, 1.0, -0.19159184384578545, -1.0781259408412425],
       [0.0, 1.0, 0.0, -0.014117293757057777, -0.07013167641635372],
       [1.0, 0.0, 0.0, 0.566708506533324, 0.633562432710455],
       [0.0, 0.0, 1.0, -0.30453019390224867, -0.30786617274297867],
       [0.0, 0.0, 1.0, -1.9018011447007988, -1.420463615551582],
       [1.0, 0.0, 0.0, 1.1475343068237058, 1.232653363453549],
       [0.0, 1.0, 0.0, 1.4379472069688968, 1.5749910381638885],
       [1.0, 0.0, 0.0, -0.7401495441200351, -0.5646194287757332]],
      dtype=object)

In [95]:
x_test[:,3:] = sc.fit_transform(x_test[:,3:])

In [96]:
x_test

array([[0.0, 1.0, 0.0, -1.0, -1.0],
       [1.0, 0.0, 0.0, 1.0, 1.0]], dtype=object)