In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [27]:
# Load the comma-separated insurance records into a DataFrame.
insurance_data = pd.read_csv("insurance.txt", sep=",")

In [28]:
# Show the loaded frame as this cell's output.
insurance_data

Unnamed: 0,age,bmi,children,charges
0,19,27.900,0,16884.92400
1,18,33.770,1,1725.55230
2,28,33.000,3,4449.46200
3,33,22.705,0,21984.47061
4,32,28.880,0,3866.85520
...,...,...,...,...
1333,50,30.970,3,10600.54830
1334,18,31.920,0,2205.98080
1335,18,36.850,0,1629.83350
1336,21,25.800,0,2007.94500


In [29]:
# Standardise every column: subtract the column mean, then divide by the column standard deviation.
norm_insu_data = insurance_data.sub(insurance_data.mean()).div(insurance_data.std())
# Prepend a constant column x0 = 1 so the regression can fit a zeroth-order (intercept) term.
norm_insu_data.insert(0, 'x0', 1)

In [30]:
# Show the normalised data frame, which now carries the constant x0 column in first position.

norm_insu_data

Unnamed: 0,x0,age,bmi,children,charges
0,1,-1.438227,-0.453151,-0.908274,0.298472
1,1,-1.509401,0.509431,-0.078738,-0.953333
2,1,-0.797655,0.383164,1.580335,-0.728402
3,1,-0.441782,-1.305043,-0.908274,0.719574
4,1,-0.512957,-0.292447,-0.908274,-0.776512
...,...,...,...,...,...
1333,1,0.768185,0.050278,1.580335,-0.220468
1334,1,-1.509401,0.206062,-0.908274,-0.913661
1335,1,-1.509401,1.014499,-0.908274,-0.961237
1336,1,-1.295877,-0.797515,-0.908274,-0.930014


In [31]:
# Create a list of 20 shuffled copies of the normalised data frame.
# Each copy is shuffled with its own fixed random_state, so a fresh kernel
# (Restart & Run All) reproduces exactly the same 20 row orderings; without
# a seed, DataFrame.sample draws from whatever state the global RNG is in.

dataset_list = [
    norm_insu_data.sample(frac=1, random_state=seed)
    for seed in range(20)
]


In [32]:
# Split every shuffled dataset into a 70% training part and the remaining testing part.
# The list sizes follow dataset_list instead of a hard-coded count, and every split
# uses a fixed random_state so the same rows land in training/testing on each re-run.

n_datasets = len(dataset_list)
training_data_list = [None]*n_datasets
testing_data_list = [None]*n_datasets

for i in range(n_datasets):
    # Any fixed integer works as a seed; adding i gives each dataset its own split.
    training_data_list[i] = dataset_list[i].sample(frac=0.7, random_state=1000 + i)
    # Whatever was not drawn for training (matched by index label) becomes the test set.
    testing_data_list[i] = dataset_list[i].drop(training_data_list[i].index)

In [33]:
# Normal equation method to find regression model
# For every train/test split, fit theta from the normal equations
# (X^T X) theta = X^T y and record the RMSE on both parts.

indep_attrs = ["x0", "age", "bmi", "children"]
dep_attr = ["charges"]

testing_err_norm = []
training_err_norm = []
for i in range(len(training_data_list)):

    #Calculating the weights
    X_train = training_data_list[i][indep_attrs].to_numpy()
    Y_train = training_data_list[i][dep_attr].to_numpy()
    XT_train = X_train.transpose()

    X_test = testing_data_list[i][indep_attrs].to_numpy()
    Y_test = testing_data_list[i][dep_attr].to_numpy()

    # Solve the linear system directly rather than forming the explicit inverse of
    # X^T X: same solution, but fewer floating-point operations and better accuracy.
    theta = np.linalg.solve(np.matmul(XT_train, X_train), np.matmul(XT_train, Y_train))

    #Calculating RMSE for training data
    diff_train = np.subtract(np.matmul(X_train, theta), Y_train)
    sumsq_train = np.sum(np.square(diff_train))
    rmse_train = np.sqrt(sumsq_train/(Y_train.shape[0]))
    training_err_norm.append(rmse_train)

    #Calculating RMSE for testing data
    diff_test = np.subtract(np.matmul(X_test, theta), Y_test)
    sumsq_test = np.sum(np.square(diff_test))
    rmse_test = np.sqrt(sumsq_test/(Y_test.shape[0]))
    testing_err_norm.append(rmse_test)

In [34]:
#GRADIENT DESCENT
def gradient_descent(x, y, theta, iterations, alpha, x_test, y_test, precis):
    """Run batch gradient descent on a linear model, tracking train and test cost.

    The cost is the halved mean squared error, ``1/(2*count) * error.T @ error``;
    the reported RMSE values are ``sqrt(2 * cost)``.

    Args:
        x: training inputs, multiplied as ``np.dot(x, theta)``.
        y: training targets; presumably a 2-D column array, since the cost is
            indexed as ``[0][0]`` -- confirm against callers.
        theta: initial weights; stored unchanged as the first entry of the
            returned weight history.
        iterations: maximum number of update steps.
        alpha: learning rate.
        x_test: held-out inputs, used only for evaluation.
        y_test: held-out targets, used only for evaluation.
        precis: stop once two consecutive training costs differ by at most this.

    Returns:
        A 5-tuple ``(past_thetas, past_costs, past_costs_test,
        min_rmse_train, min_rmse_test)``:
        the weight history (initial weights plus one entry per step), the
        training cost measured *before* each step, the test cost measured
        *after* each step, and the smallest training / test RMSE seen.
        Both minima stay ``inf`` if no iteration runs.

    Note:
        Entry ``k`` of the training costs and entry ``k`` of the test costs
        belong to different weights (before vs. after update ``k``).
    """
    m = y.shape[0]
    n = y_test.shape[0]

    def half_mse(features, targets, weights, count):
        """Return the residuals and the (still array-shaped) halved MSE."""
        residuals = np.dot(features, weights) - targets
        return residuals, 1/(2*count) * np.dot(residuals.T, residuals)

    best_rmse_train = float('inf')
    best_rmse_test = float('inf')
    weight_history = [theta]
    train_costs = []
    test_costs = []
    previous_cost = None  # training cost of the preceding iteration, if any

    for _ in range(iterations):
        # Training cost of the current weights (before this step's update).
        error, cost = half_mse(x, y, theta, m)
        current_cost = cost[0][0]
        best_rmse_train = min(best_rmse_train, np.sqrt((2*(current_cost))))
        train_costs.append(current_cost)

        # One gradient step on the training data.
        theta = theta - (alpha * (1/m) * np.dot(x.T, error))
        weight_history.append(theta)

        # Test cost of the freshly updated weights.
        _, cost_test = half_mse(x_test, y_test, theta, n)
        current_cost_test = cost_test[0][0]
        best_rmse_test = min(best_rmse_test, np.sqrt((2*current_cost_test)))
        test_costs.append(current_cost_test)

        # Converged: the training cost barely moved since the previous iteration.
        if previous_cost is not None and abs(previous_cost - current_cost) <= precis:
            break
        previous_cost = current_cost

    return weight_history, train_costs, test_costs, best_rmse_train, best_rmse_test


In [35]:
# Run gradient descent once per train/test split and keep the best RMSE of each run.
# Seeding first makes the random initial weights drawn below repeatable.
np.random.seed(123)
precision = 0.000001
min_rmse_train_grad_desc = [None]*20
min_rmse_test_grad_desc = [None]*20
for i in range(20):
    alpha, iterations = 0.01, 2000

    # Feature matrix and target column of the i-th training / testing split.
    X_train, Y_train = (training_data_list[i][cols].to_numpy() for cols in (indep_attrs, dep_attr))
    X_test, Y_test = (testing_data_list[i][cols].to_numpy() for cols in (indep_attrs, dep_attr))

    m, n = Y_train.shape[0], Y_test.shape[0]

    # One random starting weight per feature column, as a column vector.
    theta = np.random.rand(X_train.shape[1],1)
    (past_thetas, past_costs, past_costs_test,
     min_rmse_train_grad_desc[i], min_rmse_test_grad_desc[i]) = gradient_descent(
        X_train, Y_train, theta, iterations, alpha, X_test, Y_test, precision)
    