In [165]:
import pandas as pd
import numpy as np
import sys
import os
np.random.seed(0)

In [166]:
#Global variables
base_data_loc = "../data/regression"
data_name = "fish"
category_col_list = "Species"
target_col_name = "Weight"

In [167]:
def read_data(file_loc):
    df = pd.read_csv(file_loc)
    return df

def generate_file_location(name,base_data_loc=base_data_loc):
    return os.path.join(base_data_loc, name)

In [168]:
data_loc_dict = {'fish':generate_file_location("fish.csv"), 'insurance':generate_file_location("insurance.csv"),'real estate':generate_file_location("real_estate.csv"), 'physical activity':generate_file_location("physical_activity_obesity.csv")}

df_inp = read_data(data_loc_dict[data_name])
print(df_inp.shape)
print(df_inp.head())

(159, 7)
  Species  Weight  Length1  Length2  Length3   Height   Width
0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340


In [169]:

#dropping the category columns from input data for simplicity
df_inp.drop(category_col_list,axis=1,inplace=True)

print("DF after dropping categorical columns")
print(df_inp.head())

df_target = df_inp[target_col_name]
df_inp.drop([target_col_name],axis=1,inplace=True)
df_inp[target_col_name] = df_target
print(df_inp.head())


DF after dropping categorical columns
   Weight  Length1  Length2  Length3   Height   Width
0   242.0     23.2     25.4     30.0  11.5200  4.0200
1   290.0     24.0     26.3     31.2  12.4800  4.3056
2   340.0     23.9     26.5     31.1  12.3778  4.6961
3   363.0     26.3     29.0     33.5  12.7300  4.4555
4   430.0     26.5     29.0     34.0  12.4440  5.1340
   Length1  Length2  Length3   Height   Width  Weight
0     23.2     25.4     30.0  11.5200  4.0200   242.0
1     24.0     26.3     31.2  12.4800  4.3056   290.0
2     23.9     26.5     31.1  12.3778  4.6961   340.0
3     26.3     29.0     33.5  12.7300  4.4555   363.0
4     26.5     29.0     34.0  12.4440  5.1340   430.0


In [170]:
dev = []
val = []
np.random.seed(0)
inp_values = df_inp.values
for  i in range(len(inp_values)):
    if np.random.rand() < 0.75:
        dev.append(inp_values[i])
    else:
        val.append(inp_values[i])
dev = np.array(dev)
val = np.array(val)

print(dev[0:2])

print(f"dev data shape {dev.shape}")
print(f"val data shape {val.shape}")


[[ 23.2     25.4     30.      11.52     4.02   242.    ]
 [ 24.      26.3     31.2     12.48     4.3056 290.    ]]
dev data shape (124, 6)
val data shape (35, 6)


In [171]:
x_dev, y_dev = dev[:,0:-1], dev[:,-1]
x_val, y_val = val[:,0:-1], val[:,-1]

print(f"shape of dev data - independent variable - {x_dev.shape}, dependent variable - {y_dev.shape}")
print(f"shape of val data - independent variable - {x_val.shape}, dependent variable - {y_val.shape}")

print(x_val[0:2])
print(y_val[0:2])

shape of dev data - independent variable - (124, 5), dependent variable - (124,)
shape of val data - independent variable - (35, 5), dependent variable - (35,)
[[27.6    30.     35.     12.67    4.69  ]
 [27.6    30.     35.1    14.0049  4.8438]]
[390. 450.]


In [172]:
def forward_propagation(dev_x, weights, bias_term):
    
    y_prediction = dev_x.dot(weights) + bias_term
    return y_prediction


In [186]:
def compute_cost(dev_y, pred_y):
    
    return (1/2*dev_y.shape[0])*(np.sum(pred_y - dev_y) ** 2)

In [179]:
def backward_propagation(pred_y, dev_x, dev_y, weights, bias_term, learning_rate):
    
    dW = -(2/dev_x.shape[0]) * (dev_x.T).dot(dev_y - pred_y)
    db = -(2/x_dev.shape[0]) * np.sum((dev_y - pred_y))

    updated_weights = weights - learning_rate * dW
    updated_bias_term = bias_term - learning_rate * db

    return updated_weights, updated_bias_term

In [190]:
#initializing weights for all the columns in dev data and bias
np.random.seed(0)
rows, columns = x_dev.shape
weights = np.random.rand(columns)
bias_term = np.random.rand()



for i in range(5):
    y_pred = forward_propagation(x_dev, weights, bias_term)

    cost = compute_cost(y_dev, y_pred)
    print(f"weights {weights}")
    print(f"bias term {bias_term}")
    print(f"cost for iteration = {i+1} is {cost}")

    weights, bias_term = backward_propagation(y_pred, x_dev, y_dev, weights, bias_term, 0.001)
    
    print()

weights [0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
bias term 0.6458941130666561
cost for iteration = 1 is 91293977784.52332

weights [21.75445873 23.64781818 25.72307857  7.88114139  4.01169731]
bias term 1.2648127078891627
cost for iteration = 2 is 2846087857704.0107

weights [-75.68890252 -81.7842143  -89.70548362 -24.10313163 -12.11785241]
bias term -2.1908919389505863
cost for iteration = 3 is 55345073759685.92

weights [371.24784704 401.73844298 439.71378414 124.28717405  62.21382971]
bias term 13.047949528847418
cost for iteration = 4 is 1185678290652678.8

weights [-1679.47189507 -1816.90918866 -1989.48885863  -554.92797357
  -278.50338769]
bias term -57.48562598761338
cost for iteration = 5 is 2.4864169853359332e+16



In [214]:
class MultipleLinearRegression:

    def __init__(self, learning_rate, num_iterations, dev_x, dev_y) -> None:
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.rows, self.columns = dev_x.shape
        self.dev_x = dev_x
        self.dev_y = dev_y

    def forward_propagation(self, weights, bias_term):
        y_prediction = self.dev_x.dot(weights) + bias_term
        return y_prediction

    def compute_cost(self,dev_y, pred_y):
        return (1/2*self.rows)*(np.sum(dev_y - pred_y) ** 2)
    
    def backward_propagation(self, pred_y, dev_x, dev_y, weights, bias_term):
        dW = -(2* (dev_x.T).dot(dev_y - pred_y)) / self.rows
        db = -(2* np.sum(dev_y - pred_y)) / self.rows

        updated_weights = weights - self.learning_rate * dW
        updated_bias_term = bias_term - self.learning_rate * db

        return updated_weights, updated_bias_term

    

In [215]:
np.random.seed(42)
num_iterations = 10
learning_rate = 0.01

rows, columns = x_dev.shape
weights = np.random.rand(columns)
bias_term = np.random.rand()
lr = MultipleLinearRegression(learning_rate, num_iterations, x_dev, y_dev)

for i in range(10):
    y_pred = lr.forward_propagation(weights, bias_term)
    #print(y_pred[0:5])
    cost = lr.compute_cost(y_dev, y_pred)
    print(f"cost for iteration={i+1} is {cost}")
    weights, bias_term = lr.backward_propagation(y_pred, x_dev, y_dev, weights, bias_term)
    

cost for iteration=1 is 88449683195.49147
cost for iteration=2 is 373505702565864.56
cost for iteration=3 is 1.1185826774968252e+18
cost for iteration=4 is 3.369422906514777e+21
cost for iteration=5 is 1.0148385430411636e+25
cost for iteration=6 is 3.056604311454542e+28
cost for iteration=7 is 9.206222656645333e+31
cost for iteration=8 is 2.772833086065037e+35
cost for iteration=9 is 8.351528753801402e+38
cost for iteration=10 is 2.5154068189718112e+42
