In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# import data and feature scale
def load_data(filename):
    data = pd.read_csv(filename, header = None)
    data = np.array(data)
    X = data[:, 0:-2]
    y = data[:, -1]
    X = preprocessing.MinMaxScaler().fit_transform(X)
    return X, y

In [3]:
def grad_descent(f, initial, stepsize, threshold, X, y):
    diff=np.Inf
    w = initial
    cost, grad = f(X, y, initial)
    while diff > threshold:     
        w = w - stepsize * grad
        cost_old = cost
        cost, grad = f(X, y, w)
        diff = np.abs(cost-cost_old)
    return w

In [4]:
def sgd(f, initial, stepsize, epoch, X, y):
    w = initial
    # better shuffle the training data
    for e in range(epoch):
        for i in range(X.shape[0]):
            _, grad = f(X[i,:], y[i],w)
            w = w - stepsize * grad
    return w

Cost function:
$J(w)=\sum_i(y_i-x_i^Tw)^2$

Gradient: $\frac{\partial{J(w)}}{\partial{w_d}}=\sum_i-x_{id}2(y_i-x_i^Tw)$

In [5]:
def linear_regression(X, y, w):
    mse=np.mean((y-np.matmul(X, w))**2) #cost
    if type(y)==np.float64:
        grad = -2*(y-np.matmul(X, w))*np.transpose(X)
    else:
        grad = -2*np.matmul(np.transpose(X),(y-np.matmul(X, w)))
    return mse, grad

Test gradient descent and SGD

In [6]:
X_train, y_train = load_data('housing.csv')
X_ones = np.ones(X_train.shape[0])
X_train = np.concatenate((X_train, X_ones.reshape(-1,1)),axis=1)
w_init=np.ones(X_train.shape[1])

Before optimization:

In [7]:
linear_regression(X_train, y_train, w_init)

(376.34201972841527,
 array([  -278.55372446,  -2789.59612075,  -5319.31457794,  -1522.78509049,
         -4801.7897896 ,  -9778.54512351, -10286.00350521,  -4686.86840578,
         -4612.93530863,  -5462.17438534,  -9385.93292082, -16069.65366058,
        -17014.63369512]))

In [8]:
w = grad_descent(linear_regression, w_init, 1e-05, 0.01, X_train, y_train)

In [9]:
w_s = sgd(linear_regression, w_init, 1e-05, 10, X_train, y_train)

After optimization:

In [10]:
linear_regression(X_train, y_train, w)

(47.25729739425991,
 array([  69.3474786 ,  -44.5582515 ,  108.37777764, -133.75470706,
          48.28505656, -517.53069456,   76.10757324,  169.31693703,
        -107.82297455,   76.85189591,  363.48232898,   19.80782208,
        -119.34647434]))

In [11]:
linear_regression(X_train, y_train, w_s)

(239.87223598837005,
 array([   -58.74821862,  -2272.90248268,  -3294.21812426,  -1165.17857245,
         -2995.53995528,  -7201.8031896 ,  -6831.02157286,  -3539.82572194,
         -2637.42289223,  -3250.80579171,  -6233.50833065, -11599.94832859,
        -12059.43084812]))

In [12]:
w

array([-0.5454811 ,  5.14497303, -1.62133363,  3.66291856, -0.65539623,
       11.47738098,  0.95847502,  3.5283659 , -1.43904418, -2.44205211,
       -1.61602772,  9.37452843,  9.11817795])

In [13]:
w_s

array([1.01538063, 1.25156747, 1.42171281, 1.13306052, 1.38173332,
       1.83974541, 1.84230371, 1.40800228, 1.35090483, 1.42403581,
       1.76832054, 2.36802048, 2.43552477])