<a href="https://colab.research.google.com/github/kv1792/MachineLearning/blob/main/data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [5]:
import numpy as np
# Numerical library
import matplotlib.pyplot as plt
# Visual plotting library
import pandas as pd
# Data and file processing library


## Importing the dataset

In [30]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

# Read the CSV file, then split it into two NumPy arrays:
#   X - the independent variables (the features used for prediction): every column except the last one
#   y - the dependent variable (the outcome we want to predict): the last column only
# to_numpy() is used instead of .values because it always returns a NumPy ndarray, whereas .values can
# return a pandas extension array for some column dtypes.

In [31]:
# Inspect the raw feature matrix; the nan entries are the missing values handled in the next section.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Inspect the raw target labels (still 'Yes' / 'No' strings at this point).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [35]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

# SimpleImputer is the scikit-learn class that fills in missing values.
# missing_values=np.nan tells it that missing entries are marked with NaN ("not a number").
# strategy='mean' replaces each missing entry with the mean of the non-missing values in the same column.

# fit_transform() first learns the per-column means of the two numeric columns (indices 1 and 2; the slice
# 1:3 excludes index 3) and then returns those columns with every NaN filled in.
# The result is written back into the same two columns of X.
# Note: the means are computed over all rows, because this cell runs before the train/test split further down.

In [36]:
# Confirm that the two nan entries have been replaced by their column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

# ColumnTransformer applies OneHotEncoder to column 0 only (the country column), replacing it with one
# 0/1 column per distinct country (France, Germany and Spain here).
# remainder='passthrough' keeps all the other columns unchanged; with the default setting they would be dropped.
# sparse_threshold=0 forces a dense result. OneHotEncoder produces a sparse matrix by default, and with the
# default threshold ColumnTransformer may return a sparse matrix for data with many categories, which
# np.array() would not convert into a regular 2-D array.
# fit_transform() learns the categories and encodes X in a single step.
# Models trained later expect a NumPy array, so the result is stored back into X as one.

In [39]:
# Inspect X after encoding: the first three columns are now the one-hot country indicators.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [41]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# The dependent variable y only contains the labels 'Yes' and 'No', so it can be encoded as 0/1 integers.
# LabelEncoder (from sklearn.preprocessing) does this: fit_transform() learns the distinct labels
# and returns the encoded values in one step.
# The result is already a NumPy array, so no extra np.array() cast is needed here.
# The encoded values are stored back into y, replacing the original strings.

In [42]:
# Inspect the encoded labels: 'No' became 0 and 'Yes' became 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [54]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Split the data into a training set and a test set.
# test_size=0.2 holds out 20% of the rows for testing and keeps the remaining 80% for training the model.
# random_state=1 fixes the seed of the random shuffle, so the same split is produced on every run.


In [55]:
# Training features: 8 of the 10 rows, in shuffled order.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [57]:
# Training labels, one per row of X_train and in the same order.
print(y_train)

[0 1 0 0 1 1 0 1]


In [56]:
# Test features: the 2 held-out rows.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [58]:
# Test labels for the 2 held-out rows.
print(y_test)

[0 1]


In [None]:
# We apply feature scaling after splitting the dataset. If we scaled before the split, the scaler's mean and
# standard deviation would be computed from all rows, including the ones that end up in the test set.
# That leaks information about the test data into training and makes the test-set evaluation unfairly optimistic.

## Feature Scaling

In [62]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train[:, 3:])

X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

# Standardization is a feature-scaling approach that works for almost all kinds of models.
# It rescales each feature to zero mean and unit variance: x_scaled = (x - mean) / std.
# The result is not bounded, but most values typically end up between about -3 and +3.
# Only columns 3 onwards (the numeric features) are scaled; the one-hot country columns 0-2 are left as they are.
# The scaler is fitted on the training rows only, and that same fitted scaler (the training mean and standard
# deviation) is then used to transform the test rows.
# This keeps the training and test data on the same scale without using any statistics from the test set.

In [63]:
# Training features after scaling: the last two columns are now standardized, the country columns are unchanged.
print(X_train)

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [64]:
# Test features after scaling with the scaler that was fitted on the training rows.
# NOTE(review): the saved output below shows exactly -1 / +1 in both scaled columns, which is what
# standardizing the two test rows on their own would give. With the scaler fitted on X_train (as in the
# scaling cell above) the first scaled column should be roughly -1.47 and -0.45 instead, so the saved
# output looks stale - re-run the notebook from top to bottom to refresh it.
print(X_test)

[[0.0 1.0 0.0 -1.0000000000000002 -1.0000000000000002]
 [1.0 0.0 0.0 0.9999999999999999 0.9999999999999997]]
