# Data Preprocessing
1. Import Dataset
2. Take care of Missing Data
3. Encode Categorical Data
4. Split Dataset into Training and Test Sets
5. Feature Scaling

## 1. Import Dataset

In [None]:
import pandas as pd

In [None]:
# Load the CSV into a pandas DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv('../datasets/Preprocessing.csv')

In [None]:
# Bare expression at the end of the cell: the notebook renders the DataFrame.
dataset

Separate the independent variables (features) `X` from the dependent variable (target) `y`

In [None]:
# Features: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].values
# Target: the last column, i.e. exactly the one X leaves out. Indexing with -1
# instead of a fixed position keeps X and y consistent whatever the number
# of columns in the CSV.
y = dataset.iloc[:, -1].values

In [None]:
# Display the feature array extracted above.
X

In [None]:
# Display the target array extracted above.
y

## 2. Take care of Missing Data
https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

- The **mean** of a set of numbers is the sum of all the numbers divided by the cardinality (how many numbers the set contains).
- The **median** of a set of numbers is the middle number, when the set is organized in ascending or descending order (and, when the set has an even cardinality, the mean of the middle two numbers).

In [None]:
from sklearn.impute import SimpleImputer

# Fill the missing entries in columns 1 and 2 of X with each column's mean.
# NOTE(review): the imputer is fitted on every row, before the train/test
# split further down, so test rows contribute to the means -- confirm this
# is acceptable for the use case.
imputer = SimpleImputer(strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Show X after imputation. `sep` only separates multiple arguments, so with a
# single argument it is omitted without changing the output.
print(X)

## 3. Encode Categorical Data

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

Encoding the Independent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the values in column 0 of X with integer codes, writing the result
# back into X in place.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y), not for input features. Integer codes also suggest an ordering
# between categories that a model may pick up on. A one-hot encoding would
# avoid that, but it adds columns and would shift the 1:3 column slices used
# later for scaling -- revisit both together.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# Display X with the encoded first column.
X

Encoding the Dependent Variable

In [None]:
# Learn the distinct target labels first, then swap each label for its
# integer code. The fitted encoder stays available as `labelencoder_y`.
labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)
# Display the encoded target.
y

## 4. Split Dataset into Training and Test Sets
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Print each split under its own label. A single call with a newline
# separator emits the label/array pairs one per line, in the same order.
print(
    'X_train', X_train,
    'X_test', X_test,
    'y_train', y_train,
    'y_test', y_test,
    sep="\n",
)

## 5. Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize columns 1 and 2. The scaler is fitted on the training rows only
# and then applied unchanged to both sets, so the test rows are scaled with
# the training statistics rather than their own.
sc_X = StandardScaler()
sc_X.fit(X_train[:, 1:3])
X_train[:, 1:3] = sc_X.transform(X_train[:, 1:3])
X_test[:, 1:3] = sc_X.transform(X_test[:, 1:3])

In [None]:
# Display the training features after scaling columns 1 and 2.
X_train

In [None]:
# Display the test features after scaling columns 1 and 2.
X_test

In [None]:
# Display the training targets (not scaled).
y_train

In [None]:
# Display the test targets (not scaled).
y_test