# Data Preprocessing

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as  plt

### Import dataset

In [None]:
# Load the raw table from disk.
dataset = pd.read_csv("Data.csv")
# X: every column except the last (the independent variables / features).
# y: the last column only (the dependent variable / target).
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [None]:
# Inspect the feature matrix as loaded, before any preprocessing.
print(X)

In [None]:
# Inspect the target vector as loaded, before encoding.
print(y)

### Handle missing data

In [None]:
# SimpleImputer replaces missing entries (NaN) with a per-column statistic.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(
    missing_values=np.nan,
    strategy="mean",
)
# Learn the column means of columns 1 and 2 and fill their NaNs in one step.
# NOTE(review): the means are computed over every row, including rows that
# later end up in the test set; confirm this is acceptable for the use case.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Inspect X after the missing values in columns 1 and 2 have been filled.
print(X)

### Encoding categorical data

In [None]:
# ColumnTransformer applies an encoder to selected columns only;
# OneHotEncoder turns a categorical column into one 0/1 column per category.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass every other column through unchanged.
# sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse
# matrix, and with the default threshold a low-density result would stay
# sparse, in which case np.array() would wrap it in a 0-d object array
# instead of producing a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
# The encoded columns come first in the output, followed by the passthrough
# columns; np.array() guarantees a plain ndarray for the later steps.
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect X after column 0 has been one-hot encoded.
print(X)

### Encoding the dependent Variable

In [None]:
# LabelEncoder maps each distinct label in y to an integer code
# (0 and 1 when the target has two classes).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# No np.array() wrapper is applied here, unlike the ColumnTransformer output
# used for X: y is the dependent-variable vector.
y = le.fit_transform(y)

In [None]:
# Inspect the integer-encoded target vector.
print(y)

### Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 makes the shuffle
# reproducible from run to run.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Notebook display: the bare expression renders the training features.
X_train

In [None]:
# Notebook display: the bare expression renders the test features.
X_test

In [None]:
# Notebook display: the bare expression renders the training targets.
y_train

In [None]:
# Notebook display: the bare expression renders the test targets.
y_test

### Feature Scaling

In [None]:
# Feature scaling should be applied after the split to avoid information leakage.

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Learn mean and standard deviation from the training rows only, so that no
# statistic of the test rows influences the scaler.
# Columns 0-2 are left unscaled; only columns 3 onward are standardised.
sc.fit(X_train[:, 3:])
# Apply the same training-set statistics to both partitions.
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [None]:
# Inspect the training features after columns 3 onward have been scaled.
print(X_train)

In [None]:
# Inspect the test features after columns 3 onward have been scaled.
print(X_test)