In [124]:
import numpy as np
import pandas as pd

In [125]:
def calculate_Rsquare(y,prediction):
    return  1-(np.sum((y-prediction)**2)/np.sum((y-np.mean(y))**2))

In [126]:
# Define Linear Regression model as a class

class LinerarRegression:

    def __init__(self,x, y, inital_w, obj_fun, iter = 10000, step = 0.7):
        # Initializing no. of iterations and intial guess of weights
        self.iters = iter
        self.step = step
        self.w = inital_w
        self.x = x
        self.y = y
        self.obj = obj_fun
        

    def __sse(self,w):
        # A function to calculate sum of squared errors
        term1 = self.x.dot(w) - self.y
        term2 = self.x.dot(w) - self.y
        sse = np.dot(term1.T,term2)
        #print(sse[0][0])
        return sse[0][0]
    
    def __eucledian(self,w):
        # A function to calculate sum of eucledian distances
        d = self.x.dot(w) - self.y
        #print(d.shape,self.x.shape)
        sed = np.sum(d)**2
        #print("SED: ",sed)
        return sed/(np.linalg.norm(w))**2
    
    def __lineSearch(self,w,g,cost, max_iter):

        tolerance = 0.0001
        # Setting learning rate to maximum of 1
        lr = 1
        # Setting the step size to 0.7 as done in Algorithm 1 of lecture notes
        step = self.step
        iter = 0
        w_temp = w

        if self.obj == "SSE":
            error = self.__sse(self.w)
        elif self.obj == "EUC":
            error = self.__eucledian(self.w)
            
        while iter < max_iter:
            w_temp = w - (lr*g)
            
            if cost(w_temp) < error - tolerance:
                break

            lr = lr * step
            iter += 1

        if iter == max_iter:
            return  0
        
        return lr
            

    def fit(self):
        tolerance = 0.0001
        # initializing error to infinity
        self.error = float('inf')
        step = 0

        if self.obj == "SSE":
            obj = self.__sse
        elif self.obj == "EUC":
            obj = self.__eucledian

        while  abs(obj(self.w) - self.error) > tolerance  and step < self.iters:
            self.error = obj(self.w)
            if self.obj == "SSE":
                g = 2 * self.x.T.dot(self.x.dot(self.w) - self.y)
            elif self.obj == "EUC":
                g = (2/(np.linalg.norm(self.w))**2) * (self.x.dot(self.w) - self.y).T.dot(self.x)
                g = g.T / (np.linalg.norm(self.w))**2
                
            lr = self.__lineSearch(self.w, g, obj,2000000)
            self.w = self.w - (lr*g)
            step += 1

            print("g: ",g,"parameters: ",self.w, "no.of iterations: ", step, "Error: ", self.error)
        
    def ols(self):

        # Calculate optimim weights / Ordinary Least Squared
        first = self.x.T.dot(self.x)
        first = np.linalg.inv(first)
        self.w = first.dot(self.x.T.dot(self.y))
    
    def predict(self,x):
        y_pred = np.dot(x,self.w)
        return y_pred

# Get initial guess for parameter

def get_inital_w(x):
    w = np.random.rand(len(x[0]), 1)
    return w


Datset 1:

To predict city-cycle fuel consumption in miles per gallon, terms of 3 multivalued discrete and 5 continuous attributes.

url: https://archive.ics.uci.edu/dataset/9/auto+mpg

Variables

    1. mpg:           continuous                            Target

    2. cylinders:     multi-valued discrete                 Feature

    3. displacement:  continuous                            Feature

    4. horsepower:    continuous                            Feature

    5. weight:        continuous                            Feature

    6. acceleration:  continuous                            Feature

    7. model year:    multi-valued discrete                 Feature            

    8. origin:        multi-valued discrete                 Feature  
                           
    9. car name:      string (unique for each instance)     Feature                         

In [127]:
# Importing Dataset from UCI Machine Learning Repository

from ucimlrepo import fetch_ucirepo 
# fetch dataset 
auto_mpg = fetch_ucirepo(id=9) 
  
# data (as pandas dataframes) 
X = auto_mpg.data.features 
y = auto_mpg.data.targets 
  
# metadata 
print(auto_mpg.metadata) 
  
# variable information 
print(auto_mpg.variables)

# Handling the missing values -> Here we simply drop the rows that contain Nan values
empty_rows = X[X["horsepower"].isna()].index
empty_rows = empty_rows.tolist()

X = X.drop(empty_rows)
y = y.drop(empty_rows)

print(X.isna().sum(), y.isna().sum())

# converting pandas df to numpy array
X = X.to_numpy()
y = y.to_numpy()
# making first column of X as 1, to include bias
X = np.c_[np.ones((len(X),1)),X]


{'uci_id': 9, 'name': 'Auto MPG', 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg', 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv', 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption', 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 398, 'num_features': 7, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['mpg'], 'index_col': ['car_name'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1993, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5859H', 'creators': ['R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unknown values for th

In [128]:
inital_w = get_inital_w(X)

print("Inital guess for W: ",inital_w)

auto_mpg_reg_SSE = LinerarRegression(X,y,inital_w,"SSE")
auto_mpg_reg_SSE.fit()


auto_mpg_reg_SED = LinerarRegression(X,y,inital_w,"EUC")
auto_mpg_reg_SED.fit()


auto_mpg_reg_ML = LinerarRegression(X,y,inital_w,"OLS")
auto_mpg_reg_ML.ols()

prediction_SSE = auto_mpg_reg_SSE.predict(X)
prediction_SED = auto_mpg_reg_SED.predict(X)
prediction_ML = auto_mpg_reg_ML.predict(X)


Inital guess for W:  [[0.22295877]
 [0.44627215]
 [0.71556105]
 [0.03240118]
 [0.85653153]
 [0.94936553]
 [0.23323633]
 [0.95212719]]
g:  [[2.08168427e+06]
 [4.64592087e+08]
 [1.23294986e+07]
 [2.37841830e+08]
 [6.71514600e+09]
 [3.16392970e+07]
 [1.57457211e+08]
 [2.99301852e+06]] parameters:  [[ 0.22244057]
 [ 0.33062091]
 [ 0.71249186]
 [-0.02680496]
 [-0.81507455]
 [ 0.94148954]
 [ 0.19404039]
 [ 0.95138214]] no.of iterations:  1 Error:  3000419257.4251623
g:  [[-1.84547836e+06]
 [-4.08452708e+08]
 [-1.08774418e+07]
 [-2.09768459e+08]
 [-5.93020916e+09]
 [-2.80996757e+07]
 [-1.39647447e+08]
 [-2.66972712e+06]] parameters:  [[0.22289997]
 [0.43229733]
 [0.71519959]
 [0.02541285]
 [0.66113664]
 [0.94848441]
 [0.22880293]
 [0.95204671]] no.of iterations:  2 Error:  2339923927.09941
g:  [[1.62257044e+06]
 [3.62524526e+08]
 [9.61640370e+06]
 [1.85512186e+08]
 [5.23679689e+09]
 [2.46554162e+07]
 [1.22723554e+08]
 [2.33101290e+06]] parameters:  [[ 0.22249606]
 [ 0.34205385]
 [ 0.71280577]

In [129]:
print("R Squared Error for SSE minimization: ",calculate_Rsquare(y,prediction_SSE))
print("R Squared Error for SED minimization: ",calculate_Rsquare(y,prediction_SED))
print("R Squared Error for ML minimization: ",calculate_Rsquare(y,prediction_ML))

R Squared Error for SSE minimization:  0.7249361842711037
R Squared Error for SED minimization:  -7.5357403858444965
R Squared Error for ML minimization:  0.8214780764810599


Dataset 2

url: https://archive.ics.uci.edu/dataset/60/liver+disorders

The first 5 variables are all blood tests which are thought to be sensitive to liver disorders that might arise from excessive alcohol consumption. Each line in the dataset constitutes the record of a single male individual. The regression predicts the number of half-pint equivalents of alcoholic beverages drunk per day by an individual.

Variables:

    Variable Name	        Role	        Type	

1.  mcv:                    Feature	        Continuous		

2.  alkphos:	            Feature	        Continuous

3.  sgpt:	                Feature	        Continuous

4.  sgot:	                Feature	        Continuous		

5.  gammagt:	            Feature	        Continuous		

6.  drinks:	                Target	        Continuous		



In [130]:
# fetch dataset 
liver_disorders = fetch_ucirepo(id=60) 
  
# data (as pandas dataframes) 
X = liver_disorders.data.features 
y = liver_disorders.data.targets 
  
# metadata 
print(liver_disorders.metadata) 
  
# variable information 
print(liver_disorders.variables) 

# # Handling the missing values -> Here we simply drop the rows that contain Nan values
# empty_rows = X[X["horsepower"].isna()].index
# empty_rows = empty_rows.tolist()

#X = X.drop(["selector"],axis=1)
# y = y.drop(empty_rows)
#print(X)
print(X.isna().sum(), y.isna().sum())

# converting pandas df to numpy array
X = X.to_numpy()
y = y.to_numpy()
# making first column of X as 1, to include bias
X = np.c_[np.ones((len(X),1)),X]

{'uci_id': 60, 'name': 'Liver Disorders', 'repository_url': 'https://archive.ics.uci.edu/dataset/60/liver+disorders', 'data_url': 'https://archive.ics.uci.edu/static/public/60/data.csv', 'abstract': 'BUPA Medical Research Ltd. database donated by Richard S. Forsyth', 'area': 'Life Science', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 345, 'num_features': 5, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['drinks'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2016, 'last_updated': 'Fri Sep 15 2023', 'dataset_doi': '10.24432/C54G67', 'creators': [], 'intro_paper': None, 'additional_info': {'summary': 'The first 5 variables are all blood tests which are thought to be sensitive to liver disorders that might arise from excessive alcohol consumption. Each line in the dataset constitutes the record of a single male individual.\n\nImportant note: The 7th fiel

In [131]:
inital_w = get_inital_w(X)

print("Inital guess for W: ",inital_w)

drinks_per_day_reg_SSE = LinerarRegression(X,y,inital_w,"SSE")
drinks_per_day_reg_SSE.fit()


drinks_per_day_reg_SED = LinerarRegression(X,y,inital_w,"EUC")
drinks_per_day_reg_SED.fit()


drinks_per_day_reg_ML = LinerarRegression(X,y,inital_w,"OLS")
drinks_per_day_reg_ML.ols()

prediction_SSE = drinks_per_day_reg_SSE.predict(X)
prediction_SED = drinks_per_day_reg_SED.predict(X)
prediction_ML = drinks_per_day_reg_ML.predict(X)

Inital guess for W:  [[0.5450313 ]
 [0.83867063]
 [0.72165407]
 [0.15291129]
 [0.6902461 ]
 [0.42323559]]
g:  [[  111081.92918358]
 [10041735.50996961]
 [ 7969816.31131596]
 [ 3610526.93120829]
 [ 2873308.10459183]
 [ 4880170.43850673]] parameters:  [[ 0.52805054]
 [-0.69637941]
 [-0.49666789]
 [-0.39901916]
 [ 0.25101209]
 [-0.32278145]] no.of iterations:  1 Error:  9216886.6353615
g:  [[  -81915.57463622]
 [-7410051.90761486]
 [-5868535.49488195]
 [-2680508.46046569]
 [-2107066.05595106]
 [-3630243.44737052]] parameters:  [[0.54057273]
 [0.43637304]
 [0.40043756]
 [0.01074215]
 [0.57311297]
 [0.232163  ]] no.of iterations:  2 Error:  5025571.746210616
g:  [[  60544.25762539]
 [5471982.45798915]
 [4345931.34784455]
 [1964674.29729287]
 [1569823.09275694]
 [2656364.88706832]] parameters:  [[ 0.53131751]
 [-0.40011254]
 [-0.26391196]
 [-0.28959173]
 [ 0.33313881]
 [-0.17390755]] no.of iterations:  3 Error:  2741385.2011008863
g:  [[  -44636.47124944]
 [-4038964.2751992 ]
 [-3195819.6691

In [132]:
print("R Squared Error for SSE minimization: ",calculate_Rsquare(y,prediction_SSE))
print("R Squared Error for SED minimization: ",calculate_Rsquare(y,prediction_SED))
print("R Squared Error for ML minimization: ",calculate_Rsquare(y,prediction_ML))

R Squared Error for SSE minimization:  0.13920573441101325
R Squared Error for SED minimization:  -0.30107234594234966
R Squared Error for ML minimization:  0.18829428215833388


Dataset 3:

url: https://archive.ics.uci.edu/dataset/844/average+localization+error+(ale)+in+sensor+node+localization+process+in+wsns

This data contains 6 columns (107x6). The first four columns are features, namely anchor ratio, the transmission range of a sensor, node density (here no. of sensor nodes), and iteration count. The fifth column is ALE (predictand)

Variable Name	            Role	        Type

1. anchor_ratio:	        Feature	        Integer

2. trans_range:	            Feature	        Integer

3. node_density:	        Feature	        Integer

4. iterations:	            Feature	        Integer

5. ale:	                    Target	        Continuous

In [133]:
ALE = pd.read_csv("Average Localization Error.csv")
ALE = pd.DataFrame(ALE)
#print(concrete_strength.head())
# data (as pandas dataframes) 
X = ALE.drop(['ale'],axis=1)
y = ALE['ale']
  
X = X.to_numpy()
y = y.to_numpy().reshape((107,1))
X = np.c_[np.ones((len(X),1)),X]

In [134]:
inital_w = get_inital_w(X)

print("Inital guess for W: ",inital_w)

ALE_reg_SSE = LinerarRegression(X,y,inital_w,"SSE")
ALE_reg_SSE.fit()


ALE_reg_SED = LinerarRegression(X,y,inital_w,"EUC")
ALE_reg_SED.fit()


ALE_reg_ML = LinerarRegression(X,y,inital_w,"OLS")
ALE_reg_ML.ols()

prediction_SSE = ALE_reg_SSE.predict(X)
prediction_SED = ALE_reg_SED.predict(X)
prediction_ML = ALE_reg_ML.predict(X)

Inital guess for W:  [[0.91108422]
 [0.54671107]
 [0.39124748]
 [0.60265405]
 [0.65632206]]
g:  [[  31219.33417143]
 [ 639234.44685693]
 [ 549673.53623053]
 [5612044.97415113]
 [1563485.03583912]] parameters:  [[ 0.9042665 ]
 [ 0.40711413]
 [ 0.27120899]
 [-0.6229124 ]
 [ 0.31488595]] no.of iterations:  1 Error:  2486777.076907484
g:  [[  -15267.91168049]
 [ -301829.7207934 ]
 [ -266503.60114145]
 [-3121723.04130843]
 [ -673417.46973142]] parameters:  [[0.90760073]
 [0.47302814]
 [0.32940843]
 [0.05881397]
 [0.46194782]] no.of iterations:  2 Error:  768198.6127164221
g:  [[  10067.09902292]
 [ 210748.3610948 ]
 [ 178261.33762864]
 [1654169.01794708]
 [ 540277.59214967]] parameters:  [[ 0.90540226]
 [ 0.42700461]
 [ 0.29047946]
 [-0.30242584]
 [ 0.34396122]] no.of iterations:  3 Error:  252595.3131539119
g:  [[  -3847.97710997]
 [ -71060.55810427]
 [ -66063.26187194]
 [-953580.94203075]
 [-131478.18182038]] parameters:  [[ 0.90624259]
 [ 0.44252292]
 [ 0.30490645]
 [-0.09418146]
 [ 0.37

In [135]:
print("R Squared Error for SSE minimization: ",calculate_Rsquare(y,prediction_SSE))
print("R Squared Error for SED minimization: ",calculate_Rsquare(y,prediction_SED))
print("R Squared Error for ML minimization: ",calculate_Rsquare(y,prediction_ML))

R Squared Error for SSE minimization:  0.423097337388579
R Squared Error for SED minimization:  -489.9817420322711
R Squared Error for ML minimization:  0.6696511103289811


***
As seen in the above implementation:

The gradient descent algorithm's performace is not stable and is dependent on the inital guess of W vector. And when the dimensions of input grows they seem to be less accurate. Where as the maximum likelyhood implementation always performs better than gradient descent approaches.

Also ML implementation does not depend on the inital guess. The optimal W vector is found using the closed form and is always the same and stable.

This implies that the weights found using the ML implementation is already optimized and can be used for prediction for a stable, efficient and accurate predictions.

***

# Problem 2

here we assume y = [0,1],0 represents "-", 1 represents "+"

In [136]:
# a) To find the accuracy of the Naive Bayes Classifer

import random

def naive_bayes(x1,x2,x3,d):

    # Probablity of x1,x2,x3 are same hence only 3 variables is used
    # This p_x list will be different if P(x|y=k) have different values
    p_x = (0.5,0.5,0.5)
    p_y = 0.5
    p_y1 = 1
    p_y2 = 1

    for j in range(d):
        p_y1 = p_y1 * p_x[j]
    p_y1 = p_y1 * p_y
    for j in range(d):
        p_y2 = p_y2 * p_x[j]
    p_y2 = p_y2 * p_y
    
    if p_y1 > p_y2:
        return 1
    elif p_y2 > p_y1:
        return 0
    else:
        return random.choice([0,1])
        

D_test = [[-1,-1,-1], [-1,-1,1], [-1,1,-1], [-1,1,1],
          [1,-1,-1], [1,-1,1], [1,1,-1], [1,1,1],]
d = 3

y = [1,0,0,1,0,1,1,0]
y_hat = []

for x in D_test:
    y_hat.append(naive_bayes(x[0],x[1],x[2],d))

print(y_hat)

accuracy = 0

for i in range(len(y)):
    if y[i] == y_hat[i]:
        accuracy += 1
print("Accuracy of the model: ", accuracy/len(y))


[0, 0, 1, 1, 0, 0, 0, 0]
Accuracy of the model:  0.5


We can see from the above code, the accuracy of the model is not a constant and varies. This is because since the probablity of all features given y and the probablity of y is saame. That is the probablity of P(y = 0) and p(y=1) are the same everytime and the model returns a random varaiable from [0,1].

In [137]:
# b) To find the accuracy of the Naive Bayes Classifer with higher dimension

def naive_bayes(x,d):

    # Probablity dictionar, each row represents each frature j, the nested dicitonary respresents the probablity p(xj | y = k) => k = [0,1]
    p_x = [{1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:1,1:0},-1:{0:0,1:1}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},
           {1:{0:0.5,1:0.5},-1:{0:0.5,1:0.5}},]
    p_y = 0.5

    y_pred = []

    #print(p_y1 * p_x[0][x[0][0]][1])

    for instant in x:
        p_y1 = 1
        p_y2 = 1
        for j in range(d):
            p_y1 = p_y1 * p_x[j][instant[j]][1]
            p_y2 = p_y2 * p_x[j][instant[j]][0]

        if p_y1*p_y > p_y2*p_y:
            y_pred.append(1)
        elif p_y2*p_y > p_y1*p_y:
            y_pred.append(0)
        else:
            y_pred.append(random.choice([0,1]))
            
    
    return y_pred


In [138]:
D_modified = [[] for i in range(len(D_test))]

for i in range(len(D_modified)):
    D_modified[i] += D_test[i]
    D_modified[i].append(D_test[i][0]*D_test[i][1])
    D_modified[i].append(D_test[i][0]*D_test[i][2])
    D_modified[i].append(D_test[i][1]*D_test[i][2])
    D_modified[i].append(D_test[i][0]*D_test[i][1]*D_test[i][2])

    D_modified[i].append(D_test[i][0]*D_test[i][0]*D_test[i][1])
    D_modified[i].append(D_test[i][0]*D_test[i][1]*D_test[i][1])
    D_modified[i].append(D_test[i][0]*D_test[i][2]*D_test[i][2])
    D_modified[i].append(D_test[i][1]*D_test[i][1]*D_test[i][2])
    D_modified[i].append(D_test[i][1]*D_test[i][2]*D_test[i][2])

#print(D_modified)
y_hat = naive_bayes(D_modified,12)

print(y_hat)

accuracy = 0

for i in range(len(y)):
    if y[i] == y_hat[i]:
        accuracy += 1
print("Accuracy of the model: ", accuracy/len(y))


[1, 0, 0, 1, 0, 1, 1, 0]
Accuracy of the model:  1.0


***
After incresing the dimension we can see that the naive bayes model's accuracy has improved to 1, that it the model always accuratly predicts the outcome. 

This implyes that there is a corelation among the features. Specifically we can see that the prediction is fully dependent on the 7th feature, that it if x7 = 1, then y is always and vise versa. x7 is derived from multyplying x1,x2 and x3. 

From the dataset D we can identify that if x1 is -1, based on whether x2 and x3 are the same y is + and vice versa 

when x is 1, if x2 and x3 are same y is - and vice versa. This shows that all 3 features are co-related and by multiplying x1,x2,x3 we condense the information into a single dimension.
***