In [1]:
from sklearn.datasets import load_boston #import dataset
import numpy as np #data processing
from random import randint #generate random numbers

In [2]:
# Training hyperparameters, read as globals by the training cells below.
learning_rate = 2e-6  # step size applied to each gradient update
batch_size = 100  # rows sampled from the training set per update
n_epochs = 7000  # total number of gradient updates to run

In [3]:
def preprocess(boston_data, split_ratio):
    """Randomly split a dataset into training and testing subsets.

    Training rows are drawn without replacement using ``random.randint``;
    the rows that are never drawn form the testing set, in their original
    order. The number of feature columns is taken from the data itself.

    Args:
        boston_data (class object): dataset object exposing a 2-D ``data``
            array of features and a ``target`` array of outputs (one entry
            per row of ``data``)
        split_ratio (float): fraction of the rows used for training. The
            fraction used for testing is 1 - split_ratio

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``. ``x_train`` has shape
            (n_training, n_features) and ``y_train`` is a float column vector
            of shape (n_training, 1). ``x_test`` and ``y_test`` are the
            left-over rows of ``data`` and ``target`` in their original
            layout (``y_test`` is NOT reshaped into a column)

    Raises:
        ValueError: if ``split_ratio`` asks for fewer than zero or more than
            all of the available rows
    """
    x = np.asarray(boston_data.data)
    y = np.asarray(boston_data.target)
    n_data = x.shape[0]
    n_training = int(split_ratio * n_data)
    if not 0 <= n_training <= n_data:
        raise ValueError(
            'split_ratio={} selects {} training rows out of {}'.format(
                split_ratio, n_training, n_data))

    # Draw row indices from a shrinking pool instead of repeatedly copying
    # the arrays with np.delete. Popping position randint(0, len - 1) picks
    # exactly the rows the delete-based loop would pick for the same seed.
    remaining = list(range(n_data))
    picked = []
    for _ in range(n_training):
        picked.append(remaining.pop(randint(0, len(remaining) - 1)))

    # Explicit integer dtype keeps indexing valid when a list is empty.
    train_idx = np.array(picked, dtype=int)
    test_idx = np.array(remaining, dtype=int)

    x_train = x[train_idx].astype(float)
    y_train = y[train_idx].astype(float).reshape(n_training, 1)
    return x_train, x[test_idx], y_train, y[test_idx]

In [4]:
def init_weights_and_bias(training_size, n_features=13):
    """Create the starting parameters of the linear regression model.

    Args:
        training_size (int): number of training samples. Currently unused;
            kept so existing callers do not break
        n_features (int): number of input features, i.e. rows of the weight
            column vector. Defaults to 13, the feature count the rest of
            this notebook was written for

    Returns:
        tuple: ``(weights, bias)`` where ``weights`` is an array of shape
            (n_features, 1) filled by ``np.random.rand`` and ``bias`` is the
            scalar float 0.1
    """
    weights = np.random.rand(n_features, 1)
    bias = 0.1
    return weights, bias

In [5]:
def mean_square_error(predicted, actual):
    """Cost function used to evaluate how accurate the model is during training

    The squared residuals are summed over the first (sample) axis and divided
    by the number of samples.

    Args:
        predicted (numpy array of floats): predicted house prices during training
        actual (numpy array of floats): real-world prices of the inputs from the
            dataset; must have the same shape as ``predicted``

    Returns:
        numpy array or numpy float: the mean squared error. For column-vector
            inputs of shape (n, 1) this is an array of shape (1,), not a bare
            float; for 1-D inputs it is a scalar

    Raises:
        ZeroDivisionError: if ``predicted`` contains no samples
    """
    n_samples = predicted.shape[0]
    if n_samples == 0:
        # Same exception type the unguarded division raised, but explicit
        # instead of a NaN leaking out of the vectorized division.
        raise ZeroDivisionError('mean_square_error needs at least one sample')
    squared_residuals = (actual - predicted) ** 2
    # Reduce over the sample axis only, so the return shape is unchanged.
    return np.sum(squared_residuals, axis=0) / n_samples


def gradient_descent(inputs, predictions, actuals, weights, biases, lr=None):
    """Apply one gradient descent step of the mean-squared-error cost.

    Called once per minibatch, this is the update step of stochastic
    (minibatch) gradient descent.

    Args:
        inputs (numpy array of floats): array of features used during the
            training epoch, shape (n_samples, n_features)
        predictions (numpy array of floats): predicted house prices during
            training, shape (n_samples, 1)
        actuals (numpy array of floats): real-world prices of the inputs from
            the dataset, shape (n_samples, 1)
        weights (numpy array of floats): the weights used for training the
            model. Updated IN PLACE as well as returned
        biases (float or numpy array of floats): the bias used for training
            the model
        lr (float): step size. When omitted, the notebook-level
            ``learning_rate`` is used

    Returns:
        tuple: contains the updated weights and biases. The bias comes back
            as an array of shape (1,) even when a plain float was passed in,
            because the bias gradient is reduced over the sample axis only
    """
    if lr is None:
        lr = learning_rate  # notebook-level hyperparameter

    n_samples = float(inputs.shape[0])
    residuals = actuals - predictions

    # d(MSE)/dw = -2/n * sum_i(residual_i * x_i); d(MSE)/db = -2/n * sum_i(residual_i)
    weights_gradient = (-2.0 / n_samples) * np.sum(residuals * inputs, axis=0)
    # Match whatever layout the weights have instead of assuming 13 features.
    weights_gradient = weights_gradient.reshape(weights.shape)
    bias_gradient = (-2.0 / n_samples) * np.sum(residuals, axis=0)

    weights -= lr * weights_gradient
    biases -= lr * bias_gradient
    return weights, biases

In [6]:
def get_batch(inputs, outputs, size=None):
    """Randomly select distinct samples from the dataset for training

    Rows are drawn without replacement, so a batch never contains the same
    row twice. ``inputs`` and ``outputs`` themselves are left untouched.

    Args:
        inputs (numpy array of floats): array of all features from dataset,
            shape (n_samples, n_features)
        outputs (numpy array of floats): array of all house prices from
            dataset, one entry per row of ``inputs``
        size (int): number of rows to draw. When omitted, the notebook-level
            ``batch_size`` is used

    Returns:
        tuple: ``(batch_data, batch_outputs)`` with shapes (size, n_features)
            and (size, 1); row i of one corresponds to row i of the other

    Raises:
        ValueError: if ``size`` is negative or larger than the number of
            available rows
    """
    if size is None:
        size = batch_size  # notebook-level hyperparameter

    inputs = np.asarray(inputs, dtype=float)
    outputs = np.asarray(outputs, dtype=float)
    n_samples = inputs.shape[0]
    if not 0 <= size <= n_samples:
        raise ValueError(
            'cannot draw {} distinct rows from {} samples'.format(size, n_samples))

    # Partial Fisher-Yates shuffle: after the loop the first `size` entries
    # of `pool` are a uniform random selection of distinct row indices.
    # The previous np.delete calls discarded their result (np.delete returns
    # a copy), so they never removed a row and duplicates were possible.
    pool = list(range(n_samples))
    for slot in range(size):
        swap_with = randint(slot, n_samples - 1)
        pool[slot], pool[swap_with] = pool[swap_with], pool[slot]
    chosen = np.array(pool[:size], dtype=int)

    batch_data = inputs[chosen]
    batch_outputs = outputs[chosen].reshape(size, 1)
    return batch_data, batch_outputs

In [7]:
if __name__ == '__main__':
    # Preprocess the data and train the linear regression model.
    #
    # This is a comment rather than a string literal: a string placed inside
    # an `if` body is only a discarded expression, not a docstring.
    #
    # NOTE(review): load_boston was removed from scikit-learn in release 1.2;
    # confirm the environment pins an older version before re-running.
    boston = load_boston()
    # 67% of the rows train the model; the rest are held out for testing.
    x_train, x_test, y_train, y_test = preprocess(boston, 0.67)
    n_training = x_train.shape[0]
    n_testing = x_test.shape[0]
    weights, bias = init_weights_and_bias(n_training)
    for i in range(n_epochs):
        # One minibatch update: sample, predict, measure, step.
        batch_inputs, batch_outputs = get_batch(x_train, y_train)
        prediction = np.matmul(batch_inputs, weights) + bias
        # Error of the current minibatch only, measured before the update,
        # so successive readings are noisy.
        error = mean_square_error(prediction, batch_outputs)
        weights, bias = gradient_descent(batch_inputs, prediction, batch_outputs, weights, bias)
        if i % 100 == 0:
            print('current error: {}'.format(error))

current error: [196474.29156605]
current error: [334.75307549]
current error: [238.41543718]
current error: [223.89343172]
current error: [148.93078561]
current error: [130.63266577]
current error: [135.33136829]
current error: [87.55071806]
current error: [129.94282663]
current error: [93.82541332]
current error: [51.25328484]
current error: [79.94536364]
current error: [88.07887506]
current error: [66.09070635]
current error: [69.05320526]
current error: [56.23409563]
current error: [66.96081367]
current error: [69.72252681]
current error: [45.99015439]
current error: [45.40322439]
current error: [68.99047429]
current error: [75.00448495]
current error: [55.46142398]
current error: [67.18959587]
current error: [56.3897235]
current error: [63.75708621]
current error: [53.2693406]
current error: [61.23297307]
current error: [59.9463512]
current error: [68.50031158]
current error: [69.41209782]
current error: [66.08751746]
current error: [60.1362359]
current error: [70.56221053]
current

In [8]:
     # Print every held-out target next to the model's prediction for that row.
     # NOTE(review): the whole cell is indented; presumably it only ran because
     # IPython strips a cell's leading indentation - consider dedenting it.
     for i in range(n_testing):
         features = x_test[i]
         actual = y_test[i]
         # matmul of one feature row with the weight column gives a length-1
         # array, hence the [0] when printing.
         predicted = np.matmul(features, weights) + bias
         print('actual: {0}, predicted: {1}'.format(actual, predicted[0]))

actual: 24.0, predicted: 20.58808541184379
actual: 34.7, predicted: 25.81193568726838
actual: 27.1, predicted: 19.156572054712836
actual: 16.5, predicted: 13.264338820727751
actual: 20.4, predicted: 23.215251604859997
actual: 23.1, predicted: 22.40895332895333
actual: 20.2, predicted: 16.500309097792442
actual: 19.6, predicted: 21.094405315684252
actual: 15.2, predicted: 19.087800835446412
actual: 15.6, predicted: 20.25635802682467
actual: 21.0, predicted: 22.372464710065994
actual: 12.7, predicted: 15.849022965092496
actual: 13.2, predicted: 9.227868023798486
actual: 21.0, predicted: 21.768573050486232
actual: 30.8, predicted: 33.032777520305224
actual: 21.2, predicted: 22.41148263301293
actual: 18.9, predicted: 21.68211887476067
actual: 24.7, predicted: 30.635848986495144
actual: 25.0, predicted: 29.20411722952175
actual: 22.8, predicted: 23.31102118267285
actual: 20.0, predicted: 19.29589614725286
actual: 20.8, predicted: 19.365593476417942
actual: 28.0, predicted: 26.79522321227154