#### Import Statements

In [None]:
from sklearn import datasets
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
import sklearn.metrics as sm
import numpy as np
import matplotlib.pyplot as plt
import operator

Load Data

In [None]:
# Load dataset
diab = load_diabetes()
x = diab.data
y = diab.target

y = np.expand_dims(y,1)

#### Display Feature Names

In [None]:
print(diab.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


#### Shuffle Dataset

In [None]:
# Create new dataset, target appended in last column
data = np.append(x, y, 1)

# Shuffle dataset
np.random.shuffle(data)

#### Train Dataset

In [None]:
# Number of samples
total_sample = len(data)

# Train/Dev/Test Splits
train_x = x[:int(total_sample*0.70)] # 70%
train_y = y[:int(total_sample*0.70)]

dev_x = x[int(total_sample*0.70):int(total_sample*0.85)] # 15%
dev_y = y[int(total_sample*0.70):int(total_sample*0.85)]

test_x = x[int(total_sample*0.85):] # 15%
test_y = y[int(total_sample*0.85):]

#### Multivariate Regression Model (From Scratch)


In [None]:
# Shape x for multivariate reg
x = np.vstack((np.ones((x.shape[0], )), x.T)).T
test_x = np.vstack((np.ones((test_x.shape[0], )), test_x.T)).T

# Multivariate Linear Regression Function
def linear_reg(x, y, lr, iteration):
  m = y.size
  theta = np.zeros((x.shape[1], 1))
  cost_list = []

  for i in range(iteration):
    y_pred = np.dot(x, theta)
    cost = compute_cost(y_pred, y)
    theta = gradient(theta, x, y, y_pred, lr)
    cost_list.append(cost)

    if(i%(iteration/10) == 0):
      print("Cost is: ", cost)

  return theta, cost_list, y_pred

  # Cost Function
def compute_cost(y_pred, y_true):
  m = len(y_true)
  J = (1/(2*m))*np.sum(np.square(y_pred - y_true))
  return J

def gradient(theta, x, y_true, y_pred, lr):
  m = len(y_true)
  n_theta = (1/m)*np.dot(x.T, y_pred-y_true)
  temp_theta = theta - (lr*n_theta)
  return temp_theta

In [None]:
# Training Data
iteration = 1000
lr = .5
theta, cost_list, y_pred = linear_reg(train_x, train_y, lr, iteration)

Cost is:  14194.504854368934
Cost is:  13751.726141582902
Cost is:  13531.102034083733
Cost is:  13408.402749580442
Cost is:  13331.892666748174
Cost is:  13279.362655635061
Cost is:  13240.773017874793
Cost is:  13211.210618402949
Cost is:  13188.006888770382
Cost is:  13169.539775941426


In [None]:
# Dev Data
iteration = 1000
lr = .5
theta, cost_list, y_pred = linear_reg(dev_x, dev_y, lr, iteration)

Cost is:  16693.295454545456
Cost is:  14160.459172563867
Cost is:  13153.726191164891
Cost is:  12729.45268521753
Cost is:  12533.094881995625
Cost is:  12430.063052675354
Cost is:  12368.247649324621
Cost is:  12326.728139852608
Cost is:  12296.567757848416
Cost is:  12273.572457116883


In [None]:
# Test Data
iteration = 1000
lr = .5
theta, cost_list, y_pred = linear_reg(test_x, test_y, lr, iteration)

Cost is:  13994.044776119403
Cost is:  2227.973160093177
Cost is:  1845.3955191256596
Cost is:  1637.780392465229
Cost is:  1515.132266360214
Cost is:  1436.541074481347
Cost is:  1382.5663927937655
Cost is:  1343.4354271032341
Cost is:  1313.9029795193082
Cost is:  1290.9524642564018


In [None]:
# Accuracy Rating
print("R2 Accuracy score is: ", round(sm.r2_score(test_y, y_pred), 2))

print("Mean squared error:{}" .format(mse(test_y, y_pred)))

R2 Accuracy score is:  0.57
Mean squared error:2545.786641683694


#### Multivariate Regression Model (Using SciKit Learn)

In [None]:
# # Create model
# model = LinearRegression()

# # Fit model
# model.fit(train_x, train_y)

# # Predict y value
# y_pred = model.predict(test_x)
# # print(y_pred)

# # Prints the Accuracy Rating 
# print("R2 Accuracy score is: ", round(sm.r2_score(test_y, y_pred), 2))
# print(mse(train_y, y_pred), mae(train_y, y_pred))

# # Mean Squared Error
# print("Mean squared error:{}" .format(mse(test_y, y_pred)))