In [121]:
import pandas as pd
import numpy as np
import sys
import os
np.random.seed(0)

In [122]:
#Global variables
base_data_loc = "../data/regression"
data_name = "fish"
category_col_list = "Species"
target_col_name = "Weight"

In [123]:
def read_data(file_loc):
    df = pd.read_csv(file_loc)
    return df

def generate_file_location(name,base_data_loc=base_data_loc):
    return os.path.join(base_data_loc, name)

In [124]:
data_loc_dict = {'fish':generate_file_location("fish.csv"), 'insurance':generate_file_location("insurance.csv"),'real estate':generate_file_location("real_estate.csv"), 'physical activity':generate_file_location("physical_activity_obesity.csv")}

df_inp = read_data(data_loc_dict[data_name])
print(df_inp.shape)
print(df_inp.head())

(159, 7)
  Species  Weight  Length1  Length2  Length3   Height   Width
0   Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1   Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2   Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3   Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4   Bream   430.0     26.5     29.0     34.0  12.4440  5.1340


In [125]:

#dropping the category columns from input data for simplicity
df_inp.drop(category_col_list,axis=1,inplace=True)

print("DF after dropping categorical columns")
print(df_inp.head())

df_target = df_inp[target_col_name]
df_inp.drop([target_col_name],axis=1,inplace=True)
df_inp[target_col_name] = df_target
print(df_inp.head())


DF after dropping categorical columns
   Weight  Length1  Length2  Length3   Height   Width
0   242.0     23.2     25.4     30.0  11.5200  4.0200
1   290.0     24.0     26.3     31.2  12.4800  4.3056
2   340.0     23.9     26.5     31.1  12.3778  4.6961
3   363.0     26.3     29.0     33.5  12.7300  4.4555
4   430.0     26.5     29.0     34.0  12.4440  5.1340
   Length1  Length2  Length3   Height   Width  Weight
0     23.2     25.4     30.0  11.5200  4.0200   242.0
1     24.0     26.3     31.2  12.4800  4.3056   290.0
2     23.9     26.5     31.1  12.3778  4.6961   340.0
3     26.3     29.0     33.5  12.7300  4.4555   363.0
4     26.5     29.0     34.0  12.4440  5.1340   430.0


In [126]:
df_inp[df_inp[target_col_name]==450]

Unnamed: 0,Length1,Length2,Length3,Height,Width,Weight
5,26.8,29.7,34.7,13.6024,4.9274,450.0
8,27.6,30.0,35.1,14.0049,4.8438,450.0


In [129]:
dev = []
val = []
np.random.seed(0)
inp_values = df_inp.values
for  i in range(len(inp_values)):
    if np.random.rand() < 0.75:
        dev.append(inp_values[i])
    else:
        val.append(inp_values[i])
dev = np.array(dev)
val = np.array(val)

print(dev[0:2])


[[ 23.2     25.4     30.      11.52     4.02   242.    ]
 [ 24.      26.3     31.2     12.48     4.3056 290.    ]]


In [130]:
print(f"dev data shape {dev.shape}")
print(f"val data shape {val.shape}")

dev data shape (124, 6)
val data shape (35, 6)


In [131]:
x_dev, y_dev = dev[:,0:-1], dev[:,-1]
x_val, y_val = val[:,0:-1], val[:,-1]

In [132]:
print(f"shape of dev data - independent variable - {x_dev.shape}, dependent variable - {y_dev.shape}")
print(f"shape of val data - independent variable - {x_val.shape}, dependent variable - {y_val.shape}")

print(x_val[0:2])
print(y_val[0:2])

shape of dev data - independent variable - (124, 5), dependent variable - (124,)
shape of val data - independent variable - (35, 5), dependent variable - (35,)


[[27.6    30.     35.     12.67    4.69  ]
 [27.6    30.     35.1    14.0049  4.8438]]
[390. 450.]


In [160]:
def forward_propagation(dev_x, weights, bias_term):
    
    y_prediction = dev_x.dot(weights) + bias_term
    return y_prediction


In [143]:
def compute_cost(dev_y, pred_y):
    
    return (1/2*dev_y.shape[0])*(np.sum(pred_y - dev_y) ** 2)

In [163]:
def backward_propagation(pred_y, dev_x, dev_y, weights, bias_term, learning_rate):
    
    dW = -(2/dev_x.shape[0]) * (dev_x.T).dot(dev_y - pred_y)
    db = -(2/x_dev.shape[0]) * np.sum((dev_y - pred_y))

    weights = weights - learning_rate * dW
    bias_term = bias_term - learning_rate * db

    return weights, bias_term

In [164]:
#initializing weights for all the columns in dev data and bias
np.random.seed(0)
rows, columns = x_dev.shape
weights = np.random.rand(columns)
bias_term = np.random.rand()
print(f"old weights {weights}")
print(f"old bias term {bias_term}")


for i in range(20):
    y_pred = forward_propagation(x_dev, weights, bias_term)

    cost = compute_cost(y_dev, y_pred)
    print(f"cost for iteration = {i+1} is {cost}")

    weights, bias_term = backward_propagation(y_pred, x_dev, y_dev, weights, bias_term, 0.01)

old weights [0.5488135  0.71518937 0.60276338 0.54488318 0.4236548 ]
old bias term 0.6458941130666561
cost for iteration = 1 is 91293977784.52332
cost for iteration = 2 is 383756058894841.2
cost for iteration = 3 is 1.1493621781598611e+18
cost for iteration = 4 is 3.46213319801594e+21
cost for iteration = 5 is 1.0427620370548997e+25
cost for iteration = 6 is 3.140707414753762e+28
cost for iteration = 7 is 9.459533787697945e+31
cost for iteration = 8 is 2.849128165104917e+35
cost for iteration = 9 is 8.581322804361942e+38
cost for iteration = 10 is 2.5846187607346908e+42
cost for iteration = 11 is 7.784643802172421e+45
cost for iteration = 12 is 2.3446660701896124e+49
cost for iteration = 13 is 7.061927456673415e+52
cost for iteration = 14 is 2.12699027965568e+56
cost for iteration = 15 is 6.406307169687158e+59
cost for iteration = 16 is 1.9295232303096745e+63
cost for iteration = 17 is 5.8115538292031235e+66
cost for iteration = 18 is 1.7503887685407627e+70
cost for iteration = 19 is 5