# Data Preparation

In [1]:
# Data Preparation and Imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# Loading the dataset
dataset = pd.read_csv("loans.csv")
# to view data
dataset

Unnamed: 0,City,Age,Salary,Approved
0,Apple Valley,25.0,65000.0,Yes
1,Maplewood,30.0,81000.0,No
2,Eagan,33.0,,Yes
3,Apple Valley,39.0,100000.0,No
4,Maplewood,28.0,91000.0,Yes
5,Eagan,,66000.0,No
6,Apple Valley,40.0,98000.0,Yes
7,Maplewood,34.0,86000.0,Yes
8,Eagan,25.0,70000.0,No
9,Maplewood,24.0,62000.0,Yes


In [2]:
# using head  or tail
dataset.tail()

Unnamed: 0,City,Age,Salary,Approved
5,Eagan,,66000.0,No
6,Apple Valley,40.0,98000.0,Yes
7,Maplewood,34.0,86000.0,Yes
8,Eagan,25.0,70000.0,No
9,Maplewood,24.0,62000.0,Yes


In [3]:
# Dividing X and y arrays
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,3].values

# Handling missing values based on data
- impute val if it is numeric column(mean)
- predict making it as a dependent variable
- drop entire row or column if value missing

Business rule customizations:

•One of the easiest ways to deal with missing data is to simply remove thecorresponding features (columns) or samples (rows) from the dataset entirely; rows with missing values can be easily dropped via the dropnamethod:

-- df.dropna(axis=0)

Similarly, we can drop columns that have at least one NaNin any row by setting the axis argument to 

-- df.dropna(axis=1)

•The dropnamethod supports several additional parameters that can come in handy:
#only drop rows where all columns are NaN
-- df.dropna(how='all’)

#Keep only the rows with at least 2 non-NaNvalues.
-- df.dropna(thresh=2)

#only drop rows where NaNappear in specific columns (here: 'C')
-- df.dropna(subset=['C'])

In [4]:
#Dealing with missing values Numeric - mean imputation
#From the scikit.imputelibrary we first import the SimpleImputerclass
from sklearn.preprocessing import Imputer
#Next we define an object of the SimpleImputerclass by looking at the docstring (use Shift+Tab)
imputer = Imputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:,[1,2]])
X[:,1:3]= imputer.transform(X[:,1:3])
print(X)

[['Apple Valley' 25.0 65000.0]
 ['Maplewood' 30.0 81000.0]
 ['Eagan' 33.0 79888.88888888889]
 ['Apple Valley' 39.0 100000.0]
 ['Maplewood' 28.0 91000.0]
 ['Eagan' 30.88888888888889 66000.0]
 ['Apple Valley' 40.0 98000.0]
 ['Maplewood' 34.0 86000.0]
 ['Eagan' 25.0 70000.0]
 ['Maplewood' 24.0 62000.0]]


### Dealing with categorical variables
• ML models are based on mathematical equations. 
• We need to somehow code the categorical variables as numbers.

In [5]:
#Dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
labelencoder_X= LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])
print(X)

# This won't work as the numbers implying order

[[0 25.0 65000.0]
 [2 30.0 81000.0]
 [1 33.0 79888.88888888889]
 [0 39.0 100000.0]
 [2 28.0 91000.0]
 [1 30.88888888888889 66000.0]
 [0 40.0 98000.0]
 [2 34.0 86000.0]
 [1 25.0 70000.0]
 [2 24.0 62000.0]]


In [6]:
#Dealing with categorical variables
##From the scikit.preprocessinglibrary we first import few classes
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X= LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])
onehotencoder= OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 2.50000000e+01
  6.50000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.00000000e+01
  8.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.30000000e+01
  7.98888889e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.90000000e+01
  1.00000000e+05]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.80000000e+01
  9.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.08888889e+01
  6.60000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.00000000e+01
  9.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.40000000e+01
  8.60000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 2.50000000e+01
  7.00000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.40000000e+01
  6.20000000e+04]]


In [7]:
#Dependent variable
labelencoder_y= LabelEncoder()
y = labelencoder_y.fit_transform(y)

## Training, testing split and third part called validation(Evaluation)

Test/validate and Choose model with less VE and less complexity. 

- Underfit – TE high + VE high 
- Bestfit – TE high or low + VE low 
- Overfit – TE low + VE high

In [8]:
#Splitting the data into Training Set and Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=0)

X_train.shape
X_test.shape
y_train.shape
y_test.shape

(2,)

# Normalization 
Different attributes are measured on different scales - normalize it to same scale

Two popular approaches:

- scale using min max to range[0,1]
- Transforms the data to have zero mean and unit variance (z-score)

In [9]:
#Normalizing the features
from sklearn.preprocessing import StandardScaler
sc_X= StandardScaler()
X_train= sc_X.fit_transform(X_train)  # fit stores fit values for that column
X_test= sc_X.transform(X_test) # uses fit values from sc_X to transform test data