In [4]:
import pandas as pd
import numpy as np 
from tqdm import tqdm 

import math
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
def dataset_reader(filename):
    """
    @filename: ste, name of dataset to load
    @does: reads the dataset from directory and coverts to matrix
    @returnL numpy matrix
    """
    return np.array(pd.read_csv(filename, header = None),dtype = np.float)

In [10]:
df = pd.read_csv("housing.csv")

In [11]:
df

Unnamed: 0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
1,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
4,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
501,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
502,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
503,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [12]:
df.corr()

Unnamed: 0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
0.00632,1.0,-0.200283,0.406251,-0.056132,0.420934,-0.218978,0.352701,-0.379626,0.625395,0.582568,0.289393,-0.384838,0.455328,-0.388249
18.0,-0.200283,1.0,-0.534022,-0.04255,-0.516574,0.311835,-0.569524,0.664396,-0.311717,-0.314351,-0.391713,0.175319,-0.412894,0.360393
2.31,0.406251,-0.534022,1.0,0.06235,0.764556,-0.39133,0.645543,-0.708848,0.594167,0.720561,0.380955,-0.356506,0.602737,-0.484126
0.0,-0.056132,-0.04255,0.06235,1.0,0.091134,0.091497,0.086461,-0.099109,-0.007907,-0.035965,-0.12257,0.04904,-0.054576,0.175364
0.538,0.420934,-0.516574,0.764556,0.091134,1.0,-0.302127,0.731461,-0.76922,0.611758,0.668141,0.188918,-0.380006,0.591262,-0.427295
6.575,-0.218978,0.311835,-0.39133,0.091497,-0.302127,1.0,-0.240211,0.20517,-0.209277,-0.29168,-0.355116,0.127754,-0.613734,0.695365
65.2,0.352701,-0.569524,0.645543,0.086461,0.731461,-0.240211,1.0,-0.747872,0.456232,0.506527,0.261724,-0.273486,0.602782,-0.376932
4.09,-0.379626,0.664396,-0.708848,-0.099109,-0.76922,0.20517,-0.747872,1.0,-0.494797,-0.534492,-0.23256,0.291451,-0.497276,0.249895
1.0,0.625395,-0.311717,0.594167,-0.007907,0.611758,-0.209277,0.456232,-0.494797,1.0,0.910202,0.463322,-0.444065,0.487608,-0.38169
296.0,0.582568,-0.314351,0.720561,-0.035965,0.668141,-0.29168,0.506527,-0.534492,0.910202,1.0,0.4601,-0.441505,0.543435,-0.468543


In [13]:
df.columns

Index(['0.00632', '18', '2.31', '0', '0.538', '6.575', '65.2', '4.09', '1',
       '296', '15.3', '396.9', '4.98', '24'],
      dtype='object')

In [27]:
class LinearRegression:
    def __init__(self, X, y, learningRate, maxIterations, tolerance, gd = False):
        np.random.RandomState(10)
        self.X = X
        self.y = y
        self.learningRate = learningRate
        self.maxIterations = maxIterations
        self.tolerance = tolerance
        self.gd = gd
        
    def splitToTrainTest(self):
        
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=0)
        return X_train, X_test, y_train, y_test 
    
    def add_x0(self, X):
        # Adding column of 1 as y intercept to the matrix
        return np.column_stack([np.ones([X.shape[0], 1]) , X])
    
    def normalize(self, X):
        
        mean = np.mean(X , 0)
        std = np.std(X, 0)
        
        X_norm = (X - mean) / std        
        X_norm = self.add_x0(X_norm) # Adding 1s after normalizing 
        
        return X_norm, mean, std
    
    def normalizeTestData(self, X, train_mean, train_std):
        X_norm = (X - train_mean) / train_std
        X_norm = self.add_x0(X_norm)
        
        return X_norm
    
    
    def rank(self, X, eps = 1e-12):
        # eps is epsilon. THis is to check all the diagonal values in the matrix to make sure it is not zero 
        # Larger eps value, lower the rank
        u, s, vh = np.linalg.svd(X)
        return len([x for x in s if abs(x) > eps])
    
    def checkMatrix(self, X):
        x_rank = np.linalg.matrix_rank(X)
        
        # checking if its Full rank
        if x_rank == min(X.shape[0], X.shape[1]):
            print("Matrix is full rank")
            self.fullRank = True
        else:
            print("Matrix is not full rank")
            self.fullRank = False
            
        
    def checkInvertibility(self, X):
        # if n > d then its low rank
        if X.shape[0] > X.shape[1]:
            self.lowRank = True
            print("data is Low Rank")
        else: 
            self.lowRank = False
            print("data is not Low Rank")
            
    def closedFormSolution(self, X, y):
        # theta = (Xt . X)inv . Xt . y
        w = np.linalg.inv(X.T.dot(X).dot(X.T).dot(y))
        return w
    
    def gradientDescent(self, X, y):
        error_sequence = []
        
        last = float('inf')
        
        for i in tqdm(range(self.maxIterations)):            
            self.w = self.w - self.learningRate * self.costDerivatives(X, y)
            
            # setting current error using standard sqd error 
            
            cur = self.sse(X, y)
            dif = last - cur 
            last = cur
            
            error_sequence.append(cur)
            
            if dif < self.tolerance:
                print("Model Stopped")
                break
                
            # WE can add condition to check current w (thetha) with the previous value of w and if it is below tolerance then we can stop 
            # This solution is easier
            
    def predict(self, X):
        
        return X.dot(self.w)
    
    def sse(self, X, y):
        
        y_hat = self.predict(X)
        return ((y_hat - y) ** 2).sum()
        
    def costFunction(self, X, y):
        
        return self.sse(X, y) / 2
    
    def costDerivatives(self, X, y):
        # 
        y_hat = self.predict(X)        
        return (y_hat - y).dot(X)
    
    def runModel(self):
        
        self.X_train, self.X_test, self.y_train, self.y_test = self.splitToTrainTest()
        
        self.X_train, self.mean, self.std = self.normalize(self.X_train)
        self.X_test = self.normalizeTestData(self.X_test, self.mean, self.std)
        
        self.checkMatrix(self.X_train)
        self.checkInvertibility(self.X_train)
        
        if self.fullRank and not self.lowRank and self.X_train.shape[0] < 10000 and not self.gd: 
            
            print("Solving using closed form solution")
            self.w = self.closedFormSolution(self.X_train, self.y_train)
        
        else:
            
            print("Solving using gradient descent")
            self.w = np.ones(self.X_train.shape[1], dtype = np.float64) * 0
            self.gradientDescent(self.X_train, self.y_train)
            
            print(self.w)
            

In [28]:
regression = LinearRegression(df.values[:, 0:-1], df.values[:,-1], maxIterations=1000,
                    learningRate=0.00001, tolerance=0.0000001, gd=True)

In [29]:
regression.runModel()

100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 18456.46it/s]

Matrix is full rank
data is Low Rank
Solving using gradient descent
[ 2.22660997e+01 -6.91152511e-01  5.70453434e-01 -2.32485602e-01
  8.03819341e-01 -8.11656530e-01  3.17210361e+00  1.63935674e-02
 -1.66621383e+00  5.85293956e-01 -3.57727347e-01 -2.08078459e+00
  6.55888273e-01 -3.39105590e+00]





In [30]:
regression = LinearRegression(df.values[:, 0:-1], df.values[:,-1], maxIterations=100,
                    learningRate=0.00001, tolerance=0.0000001, gd=False)

In [31]:
regression.runModel()

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 12297.13it/s]

Matrix is full rank
data is Low Rank
Solving using gradient descent
[ 6.83115257 -0.50026466  0.47358678 -0.55284998  0.49126803 -0.43650848
  1.49935507 -0.36253545 -0.01459749 -0.33361321 -0.52540914 -1.04735738
  0.44381934 -1.40590858]





Understand the relationship between error rate and learning rate to get a smooth gradient