# Data Preprocessing

### Load Dataset

In [1]:
from sklearn.datasets import load_boston  # Boston House Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# NOTE(review): `load_boston` is deprecated in newer scikit-learn releases (and reportedly
# removed from 1.2 onward) — confirm the installed version before re-running this notebook.
boston = load_boston()
type(boston)  # show the container type returned by the loader

sklearn.utils.Bunch

In [3]:
# Split the loaded bunch into the feature matrix (X) and the target values (Y)
X, Y = boston.data, boston.target

# Shape of the feature matrix, then shape of the target vector
print(X.shape, Y.shape, sep="\n")

(506, 13)
(506,)


In [4]:
# Preview the first rows of the feature matrix; the columns have no names yet
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
# Preview the first few target values
pd.DataFrame(Y).head()

Unnamed: 0,0
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [6]:
# List the names of the feature columns

print(boston.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [7]:
# Print the dataset's own description for a closer look at each attribute

print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

### Dealing with DataFrame

In [8]:
# Wrap the feature matrix in a DataFrame for easier inspection

df = pd.DataFrame(X)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [9]:
# The DataFrame has no column names yet, so assign the feature names manually

df.columns = boston.feature_names  # label the columns with the dataset's feature names
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


### Feature Normalization

In [11]:
# Per-feature mean and standard deviation
# (axis=0 reduces over the rows, leaving one value per column)

mean = X.mean(axis=0)
std = X.std(axis=0)


In [12]:
# Standardize each feature: subtract its mean, then divide by its standard deviation.
# NOTE: X is rebound to the normalized values, so this cell is not idempotent —
# re-running it alone would apply the transform to already-normalized data.
X = (X - mean)/std # X now holds the normalized features

# Preview the normalized features with their column names
pd.DataFrame(X, columns = boston.feature_names).head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


# Linear Regression on Multiple Features

For now, we will do the calculations with explicit Python loops rather than vectorized NumPy operations.

In [13]:
# Check the shapes of the features and targets before adding the bias column
X.shape, Y.shape

((506, 13), (506,))

In [14]:
# Bias feature: a single column of ones with one entry per row of X

Bias = np.ones((len(X), 1))


In [15]:
# Prepend the bias column to X (horizontal stack).
# X is reassigned from itself, so the stack is guarded: re-running this cell must
# not add a second bias column. The condition holds only while X still has exactly
# the original feature columns.

if X.shape[1] == boston.data.shape[1]:
    X = np.hstack((Bias, X))
print(X[:4, :4])

[[ 1.         -0.41978194  0.28482986 -1.2879095 ]
 [ 1.         -0.41733926 -0.48772236 -0.59338101]
 [ 1.         -0.41734159 -0.48772236 -0.59338101]
 [ 1.         -0.41675042 -0.48772236 -1.30687771]]


In [26]:
def Hypothesis(x, Theta):
    '''Return the linear prediction for one data row.

    x is any single row of the dataset and Theta holds one weight per entry
    of x. The result is the sum of Theta[k] * x[k] over every entry of x.
    '''
    feature_count = x.shape[0]  # number of entries (features) in the row

    return sum(Theta[k] * x[k] for k in range(feature_count))


In [27]:

def Error(X, Y, Theta):
    '''Return the mean squared error of Theta's predictions over the whole dataset.

    Each row X[i] is passed through Hypothesis and compared with Y[i]; the
    squared differences are summed and divided by the number of rows.
    '''
    n_samples = X.shape[0]  # number of training examples

    squared_total = sum(
        (Hypothesis(X[row], Theta) - Y[row]) ** 2 for row in range(n_samples)
    )

    return squared_total / n_samples


In [33]:

def Gradient(X, Theta, Y):
    '''Return the gradient of the cost with respect to every theta.

    For each feature j this is the average over all rows i of
    (Hypothesis(X[i], Theta) - Y[i]) * X[i][j].

    X     : 2-D array, one row per training example
    Theta : current parameter vector, one entry per column of X
    Y     : target value for every row of X

    Returns an array with one gradient component per column of X.
    '''

    m, n = X.shape

    grad = np.zeros((n,)) # One accumulator per theta

    for i in range(m): # Going through the whole dataset

        # The residual depends only on the row, so it is computed once per row
        # rather than once for every (feature, row) pair.
        residual = Hypothesis(X[i], Theta) - Y[i]

        for j in range(n): # Contribution of this row to each feature's gradient
            grad[j] += residual * X[i][j]

    return grad/m
    

In [34]:

def GradientDescent(X, Y, LearningRate = 0.1, max_iter = 300):
    '''Fit theta by batch gradient descent, starting from all zeros.

    Runs max_iter iterations. Each iteration first records the current error,
    then moves every theta against its gradient component scaled by
    LearningRate.

    Returns (error_list, theta): the error recorded at the start of every
    iteration and the final parameter vector.
    '''
    _, n_params = X.shape

    theta = np.zeros((n_params,))
    error_list = []

    for _ in range(max_iter):
        # Error of the current theta, measured before this iteration's update
        error_list.append(Error(X, Y, theta))

        # Gradient of the current theta over the whole dataset
        step = Gradient(X, theta, Y)

        for k in range(n_params):
            theta[k] -= step[k] * LearningRate

    return error_list, theta
        

In [35]:
# Fit theta using the defaults declared on GradientDescent (LearningRate = 0.1, max_iter = 300)
error_list, theta = GradientDescent(X, Y)