<a href="https://colab.research.google.com/github/jai-sundaram/ml_tutorial/blob/main/my_data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [1]:
##numpy helps us work with arrays
import numpy as np
#matplotlib helps us plot numerous charts, particularly its pyplot module, which allows us to plot nice charts
import matplotlib.pyplot as plt
#pandas helps import the dataset and create the matrix of features and the dependent variable vector
import pandas as pd


#to add the csv to the google colab notebook, click on the Files folder, click on the Upload button, and then upload the file

## Importing the dataset

In [2]:
#load the csv into a pandas data frame
#the path has to match the uploaded file: find the csv in the folder structure and click on copy path
dataset = pd.read_csv('/content/Data.csv')

#from the data frame we build two separate pieces:
#  x - the matrix of features (the independent variables used to make the prediction)
#  y - the dependent variable vector (the thing being predicted)
#in this case country, age, and salary are the features/independent variables
#and whether the customer purchased or not is the dependent variable
#most datasets follow this layout: features in the first columns, dependent variable in the last column

#iloc selects by position: the part before the comma picks rows, the part after it picks columns
#":" has no bounds, so it keeps every row
#":-1" keeps every column up to, but not including, the last one (the upper bound of a range is excluded)
feature_columns = dataset.iloc[:, :-1]

#the dependent variable is a single column, so a plain index is enough: -1 is the last column
target_column = dataset.iloc[:, -1]

#.values returns just the values, without the pandas row/column labels
x = feature_columns.values
y = target_column.values

In [3]:
#testing to make sure we correctly created the matrix of features
#each row should hold the country, age, and salary of one customer
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
#same check for the dependent variable vector: one Yes/No entry per row of x
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [5]:
#two common ways of dealing with missing entries:
#  - ignore them, which works if the dataset is large and only a few values are missing
#  - otherwise, replace each missing entry with the average of its column, which is what we do here

#scikit-learn's SimpleImputer class does the replacement
from sklearn.impute import SimpleImputer

#the mean can only be computed for numerical columns, so the country strings have to be left out
#slice(1, 3) is the same as writing 1:3 - it selects the age and salary columns (the upper bound is excluded)
age_salary_cols = slice(1, 3)

#missing_values is the thing being replaced (the empty nan entries)
#strategy is what it gets replaced with, in this case the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

#fit connects the imputer to the selected columns, transform applies the replacement
#fit_transform does both in one call and returns the updated columns, which are saved straight back into x
#note: the imputer sees every row of x here, including rows that later end up in the test set
x[:, age_salary_cols] = imputer.fit_transform(x[:, age_salary_cols])

In [None]:
#testing: the two nan entries (one age, one salary) should now be replaced by the mean of their column
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [6]:
#need to turn categorical variables (strings) into numbers
#One Hot Encoding helps us do this
#In this case, One Hot Encoding will help us turn the country column into three different columns
#there are three different countries/categories
#One Hot Encoding creates a binary vector for each country
#the encoded columns follow the sorted category names: France, Germany, Spain
#France would have the vector 100
#Spain would have the vector 001
#Germany would have the vector 010
#there is no numerical order between the three countries

#eventually we will have to encode the Purchased? column as well, because its values are also categorical
#before that though, let's encode the Country column, which is the categorical independent variable

### Encoding the Independent Variable

In [7]:
#we need to use two classes
#the ColumnTransformer class and the OneHotEncoder class
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#creating an object of the ColumnTransformer class
#transformers is a list of (name, transformer, column indexes) tuples
#  here: a step named 'encoder' that applies One Hot Encoding to column 0 (the country)
#remainder="passthrough" keeps the columns that are not being encoded (age and salary) instead of dropping them
#sparse_threshold=0 makes the transformer always return a dense array
#  without it, a result that is mostly zeros can come back as a scipy sparse matrix,
#  and np.array() would wrap that matrix in a 0-dimensional object array instead of giving a 2D matrix of features
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)

#fit_transform connects the transformer to the matrix and applies it in one call
#the encoded country columns come first, followed by the columns that were passed through
#np.array makes sure the result is a numpy array, which is the form expected for training
#save the resulting matrix to the current one, basically updating it
x = np.array(ct.fit_transform(x))


### Encoding the Dependent Variable

In [8]:
#the dependent variable is categorical too (Yes/No), so it also has to be turned into numbers
#the LabelEncoder class does this
from sklearn.preprocessing import LabelEncoder
#LabelEncoder takes no arguments
le = LabelEncoder()
#fit_transform learns the labels present in y and replaces each one with an integer, in one call
y = le.fit_transform(y)
#the Yes/Nos are now numerical values: No becomes 0 and Yes becomes 1
#no np.array() wrapper is used here: y is a single vector and the encoded result is saved back into y as-is

In [None]:
#testing: y should now hold 0s and 1s instead of No/Yes
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [9]:
#the dataset is split into two parts before any feature scaling happens:
#  the training set - existing observations the machine learning model learns from
#  the test set - new observations used to evaluate how well the model performs
#feature scaling puts all the features on the same scale so that one feature does not dominate another
#it is done after the split so that we dont accidentally touch the test set's data

#the split comes from the train_test_split function in scikit-learn's model_selection module
from sklearn.model_selection import train_test_split

#arguments:
#  x - the matrix of features
#  y - the dependent variable vector
#  test_size - the share of observations that goes to the test set
#    the training set should be the bigger part so the model has more chance to understand the correlations
#    recommended split: 80% training, 20% testing
#  random_state - fixes the random factors of the split
#    for teaching purposes we want the same split on every run, so just set it equal to 1
#
#the function returns 4 separate sets, a matrix of features and a dependent variable vector for each part:
#  x_train - matrix of features for the training set
#  x_test - matrix of features for the test set
#  y_train - dependent variable vector for the training set
#  y_test - dependent variable vector for the test set
#future ml models expect this format: x_train and y_train for training, x_test and y_test for testing
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
#testing: matrix of features for the training set (8 of the 10 rows)
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
#testing: matrix of features for the test set (the remaining 2 rows)
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
#testing: dependent variable vector for the training set, one entry per row of x_train
print(y_train)

[0 1 0 0 1 1 0 1]


In [None]:
#testing: dependent variable vector for the test set, one entry per row of x_test
print(y_test)

[0 1]


## Feature Scaling

In [12]:
#feature scaling is done so that some features are not dominated by the other features
#not every model needs it, so it does not have to be applied all the time
#there are two types of feature scaling:
#  standardization - (value - mean of the feature) / standard deviation of the feature
#    most values end up roughly between -3 and +3
#  normalization - (value - min of the feature) / (max of the feature - min of the feature)
#    all values end up between 0 and 1
#normalization is usually only recommended when the features follow a normal distribution
#standardization works well all the time, so it is the one that is recommended overall

#the scaling is applied to the two matrixes of features, x_train and x_test, separately
#the scaler is fitted on x_train only - fitting it on x_test is not allowed
#x_test is then scaled with the mean and standard deviation that were computed from x_train

#the StandardScaler class from scikit-learn performs the standardization
from sklearn.preprocessing import StandardScaler

#no parameters needed
sc = StandardScaler()

#the dummy variables (the columns created by encoding the country) are left alone
#they are either 0 or 1, so they already sit between -3 and 3 and there is nothing extra to be done
#standardizing them would only make things worse, because the interpretation of the variables would be lost
#slice(3, None) is the same as writing 3: - every column from index 3 onward, so only age and salary
scaled_cols = slice(3, None)

#fit computes the mean and standard deviation of the training columns, transform applies the formula
#fit_transform does both in one call
x_train[:, scaled_cols] = sc.fit_transform(x_train[:, scaled_cols])

#the test set is standardized with the same scaler that was fitted on the training set
#only transform is used here: fit_transform would compute a new mean and standard deviation from the test set
x_test[:, scaled_cols] = sc.transform(x_test[:, scaled_cols])


In [13]:
#testing: the age and salary columns of x_train should now be standardized, the dummy columns unchanged
print(x_train)

[[0.0 0.0 1.0 -0.1915918438457856 -1.0781259408412427]
 [0.0 1.0 0.0 -0.014117293757057902 -0.07013167641635401]
 [1.0 0.0 0.0 0.5667085065333239 0.6335624327104546]
 [0.0 0.0 1.0 -0.3045301939022488 -0.30786617274297895]
 [0.0 0.0 1.0 -1.901801144700799 -1.4204636155515822]
 [1.0 0.0 0.0 1.1475343068237056 1.2326533634535488]
 [0.0 1.0 0.0 1.4379472069688966 1.5749910381638883]
 [1.0 0.0 0.0 -0.7401495441200352 -0.5646194287757336]]


In [14]:
#testing: the age and salary columns of x_test should now be standardized with the training set's scaler
#NOTE(review): the saved output of this cell still shows the raw ages and salaries, so it looks like it was
#captured before the x_test transform ran - re-run the feature scaling cell and then this one to refresh it
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
