In [None]:
# Linear Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import math

In [3]:
df = pd.read_csv(r"C:\Users\keval\Desktop\Keval\Northeastern\ML\housing.csv")

In [4]:
df.describe()

Unnamed: 0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
count,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0,505.0
mean,3.620663,11.350495,11.154257,0.069307,0.554728,6.284059,68.581584,3.79446,9.566337,408.459406,18.461782,356.594376,12.668257,22.529901
std,8.608569,23.343704,6.855868,0.254227,0.11599,0.703195,28.176371,2.107761,8.707553,168.629992,2.16252,91.367787,7.13995,9.205991
min,0.00906,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.08221,0.0,5.19,0.0,0.449,5.885,45.0,2.1,4.0,279.0,17.4,375.33,7.01,17.0
50%,0.25915,0.0,9.69,0.0,0.538,6.208,77.7,3.1992,5.0,330.0,19.1,391.43,11.38,21.2
75%,3.6782,12.5,18.1,0.0,0.624,6.625,94.1,5.2119,24.0,666.0,20.2,396.21,16.96,25.0
max,88.976,100.0,27.74,1.0,0.871,8.78,100.0,12.127,24.0,711.0,22.0,396.9,37.97,50.0


In [1]:
class LinearRegression:
    def __init__(self, X, y, learningRate, tolerance, maxIteration, gd= False) -> None:
        self.X = X 
        self.y = y
        self.tolearnce = tolerance
        self.learningRate = learningRate
        self.maxIteration = maxIteration
        self.gd = gd
    
    def trainTestSplit(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y,
                                                           test_size = 0.3, random_state = 0)
        return X_train, X_test, y_train, y_test
    
    # This address the problem of linear regression where it always passes from centre
    # Here we are adding a column of ones as intercept
    
    def add_X0(self, X):
        return np.column_stack([np.ones([X.shape[0], 1]), X])
    
    def normalize(self, X):
        mean = np.mean(X, 0)
        sd = np.std(X, 0)
        X_norm = (X - mean)/sd
        
        X_norm = self.add_X0(X_norm)
        return X_norm, mean, sd
    
    def normalizeTestData(self, X, trainMean, trainSd) : 
        X_norm = (X - trainMean)/ trainSd
        X_norm = self.add_X0(X_norm)
        return X_norm
    
    
    # Checking the rank of matrix using single value decomposition
    def rank(self, X, eps = 1e-12):
        u, s, vh = np.linalg.svd(X)
        # mean, sd, v transpose
        return len([x for x in abs(x) > eps])
    
    def checkMatrix(self, X):
        X_rank = np.linalg.matrix_rank(X)
        
        if X_rank == min(X.shape[0], X.shape[1]):
            self.fullRank = True
            print("Data is full rank")
        else:
            self.fullRank = False
            print("Data is not Full Rank")

    def checkInvertibility(self, X):
        if X.shape[0] < X.shape[1]:
            self.lowRank = True
            print("Data is Low Rank")
        else:
            self.lowRank = False
            print("Data is not Low Rank")
            
    def closedFormSolution(self, X, y):
        w = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
        return w
    
    def gradientDescent(self, X, y):
        errorSequences = []
        
        last = float('inf')
        
        for t in tqdm(range(self.maxIteration)):
            self.w = self.w - self.learningRate * self.costDerivative(X, y)
            cur = self.sse(X,y)
            diff = last - cur
            last = cur
            
            errorSequences.append(cur)
            
            if dif < self.tolerance:
                print("The model stopped - no further improvment")
                break
                #count = 0. Increment coount to check error

        
    def sse(self, X, y):
        y_hat = self.predict(X)
        return ((y_hat - y)**2).sum()
    
    def predict(self, X):
        return X.dot(self.w)
    
    def costFunction(self, X, y):
        return self.sse(X, y)/2
    
    def CostDerivative(self, X, y):
        y_hat = self.predict(X)
        return(y_hat - y).dot(X)
    
    def fit(self):
        
        self.X_train, self.X_test, self.y_train, self.y_test = self.trainTestSplit()
        
        self.X_train, self.mean, self.sd = self.normalize(self.X_train)
        self.X_test = self.normalizeTestData(self.X_test, self.mean, self.sd)
        
        self.checkMatrix(self.X_train)
        self.checkInvertibility(self.X_train)
        
        if self.fullRank and not self.lowRank and not self.gd:
            print("Solving using closed form Solution")
            self.w = self.closedFormSolution(self.X_train, self.y_train)
            
        else:
            print("Solving using gradient descent")
            self.w = np.ones(self.X_train.shape[1], dtype = np.float64) * 0
            self.gradientDescent(self.X_train, self.y_train)
            
            print(self.w)
        
    
    
        

In [None]:
# closedFormSolution = Normal Equation

In [23]:
regression = LinearRegression(df.values[:,0:-1], df.values[:, -1], 
                             learningRate = 0.000001,
                             tolerance = 0.000005, maxIteration = 100000, gd = True )

In [24]:
regression.fit()

  0%|                                                                                       | 0/100000 [00:00<?, ?it/s]

Data is full rank
Data is not Low Rank
Solving using gradient descent





AttributeError: 'LinearRegression' object has no attribute 'costDerivative'

In [None]:
regression.predict(regression.X_test)

In [4]:
lambdas = np.arange(-10, 0.2, step=0.1)
print(lambdas)

[-1.00000000e+01 -9.90000000e+00 -9.80000000e+00 -9.70000000e+00
 -9.60000000e+00 -9.50000000e+00 -9.40000000e+00 -9.30000000e+00
 -9.20000000e+00 -9.10000000e+00 -9.00000000e+00 -8.90000000e+00
 -8.80000000e+00 -8.70000000e+00 -8.60000000e+00 -8.50000000e+00
 -8.40000000e+00 -8.30000000e+00 -8.20000000e+00 -8.10000000e+00
 -8.00000000e+00 -7.90000000e+00 -7.80000000e+00 -7.70000000e+00
 -7.60000000e+00 -7.50000000e+00 -7.40000000e+00 -7.30000000e+00
 -7.20000000e+00 -7.10000000e+00 -7.00000000e+00 -6.90000000e+00
 -6.80000000e+00 -6.70000000e+00 -6.60000000e+00 -6.50000000e+00
 -6.40000000e+00 -6.30000000e+00 -6.20000000e+00 -6.10000000e+00
 -6.00000000e+00 -5.90000000e+00 -5.80000000e+00 -5.70000000e+00
 -5.60000000e+00 -5.50000000e+00 -5.40000000e+00 -5.30000000e+00
 -5.20000000e+00 -5.10000000e+00 -5.00000000e+00 -4.90000000e+00
 -4.80000000e+00 -4.70000000e+00 -4.60000000e+00 -4.50000000e+00
 -4.40000000e+00 -4.30000000e+00 -4.20000000e+00 -4.10000000e+00
 -4.00000000e+00 -3.90000

In [14]:
df[1:1]

Unnamed: 0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
