In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Folder holding raw_data.csv and every CSV derived from it. The value is
# prepended verbatim to file names (e.g. '{}raw_data.csv'.format(DATA_DIR)),
# so the trailing slash is required.
DATA_DIR = 'community-crime/'
# Factor applied to the gradient in performRegression's weight update.
STEP_SIZE = 1e-5

def prepareData(seed=None):
    ''' Prepare our data by cleaning and splitting it.

    Reads raw_data.csv (no header row) from DATA_DIR, drops the first five
    columns, treats every '?' as a missing value, converts the rest to
    float64 and fills missing values with the column mean. The cleaned table
    is written to cleaned_data.csv, followed by five train/test CSV pairs.

    Each pair comes from an independent random mask that sends a row to the
    training file with probability 0.8, so the five test files are not
    disjoint and their sizes vary from split to split.

    Parameters:
        seed: optional integer. When given, the five splits are reproducible;
            when None (the default) the global NumPy random state is used,
            exactly as before.
    '''
    # clean data
    raw = pd.read_csv('{}raw_data.csv'.format(DATA_DIR), header=None)
    df = raw.drop([0, 1, 2, 3, 4], axis=1)
    # np.nan, not np.NaN: the capitalised alias no longer exists in NumPy 2.0.
    df = df.replace('?', np.nan).astype(np.float64)
    df = df.fillna(df.mean())
    df.to_csv('{}cleaned_data.csv'.format(DATA_DIR), index=False, header=False)
    # split data
    rng = np.random if seed is None else np.random.RandomState(seed)
    for i in range(1, 6):
        msk = rng.rand(len(df)) < 0.8
        train = df[msk]
        test = df[~msk]
        # NOTE: the '−' in these file names is a non-ASCII minus sign, not '-'.
        # The readers in this notebook use the same character, so keep it.
        train.to_csv('{}CandC−train{}.csv'.format(DATA_DIR, i), index=False, header=False)
        test.to_csv('{}CandC−test{}.csv'.format(DATA_DIR, i), index=False, header=False)

def _splitFeaturesAndTarget(data):
    ''' Split one table into (inputs with a leading column of ones, target row) without modifying it. '''
    n_cols = data.shape[1]
    # The last column is the target; it is returned as a single row (1 x rows).
    target = pd.DataFrame([data[n_cols - 1]])
    # Copy the remaining columns so the caller's frame is left untouched, then
    # relabel them 1..n_cols-1 to make room for the ones column at label 0.
    features = data.iloc[:, :n_cols - 1].copy()
    features.columns = range(1, n_cols)
    features.insert(0, 0, np.ones(features.shape[0]))
    return features, target


def createComputationMatrices(train_data, test_data):
    ''' Creates the initial matrices needed for the output computation.

    Both arguments are expected to be DataFrames whose columns are labelled
    0..n-1 (as produced by pd.read_csv(..., header=None)), with the target in
    the last column. Neither argument is modified.

    Returns a 5-tuple:
        train input  - training rows: a column of ones followed by every
                       column except the last, columns labelled 0..n-1
        train output - 1 x (training rows) DataFrame holding the last column
        test input   - same layout as train input, built from test_data
        test output  - same layout as train output, built from test_data
        w            - 1 x n DataFrame of random integers drawn from 0..9
    '''
    train_input, train_output = _splitFeaturesAndTarget(train_data)
    test_input, test_output = _splitFeaturesAndTarget(test_data)
    # create w vector
    w = pd.DataFrame(np.random.randint(low=0, high=10, size=(1, train_input.shape[1])))
    return train_input, train_output, test_input, test_output, w

def findWeightedMatrix(input_values, output_values, w, lambd):
    ''' Find the weighted matrix.

    Computes pinv(X^T X + lambd * I) X^T Y^T, where X is input_values and Y is
    output_values. The penalty lambd is applied to every column of X.

    Parameters:
        input_values:  (rows x n) inputs, DataFrame or array-like.
        output_values: (1 x rows) targets, DataFrame or array-like.
        w:             ignored; kept only so existing callers keep working.
        lambd:         penalty added to the diagonal of X^T X.

    Returns:
        A NumPy array of weights, shaped (n x 1) for a (1 x rows) target.
    '''
    # Plain float arrays keep the products positional: NumPy functions applied
    # to DataFrames may align on labels or return a DataFrame depending on the
    # installed pandas/NumPy versions.
    X = np.asarray(input_values, dtype=np.float64)
    Y = np.asarray(output_values, dtype=np.float64)
    penalised_gram = np.matmul(X.T, X) + lambd * np.identity(X.shape[1])
    # pinv rather than inv: with lambd == 0 the matrix may be singular.
    return np.matmul(np.linalg.pinv(penalised_gram), np.matmul(X.T, Y.T))

def calculateMSE(w, test_data_input, test_data_output):
    ''' Calculates the mean squared error.

    Parameters:
        w:                (1 x n) weights, DataFrame or array-like.
        test_data_input:  (rows x n) inputs, DataFrame or array-like.
        test_data_output: (1 x rows) targets, DataFrame or array-like.

    Returns:
        The mean of the squared differences between w . input^T and the targets.
    '''
    # Convert to plain arrays first: NumPy ufuncs applied to DataFrames may
    # align on index/column labels, and a prediction row and a target row that
    # carry different labels would then not be compared element by element.
    weights = np.asarray(w, dtype=np.float64)
    inputs = np.asarray(test_data_input, dtype=np.float64)
    targets = np.asarray(test_data_output, dtype=np.float64)
    predicted_output = np.matmul(weights, inputs.T)
    squared_error = np.power(predicted_output - targets, 2)
    return np.sum(squared_error) / squared_error.size

def performRegression(num_iterations=10000):
    ''' Performs the linear regression.

    For each of the five train/test CSV pairs in DATA_DIR, starts from the
    random weights returned by createComputationMatrices, runs
    num_iterations gradient-descent updates with step STEP_SIZE on the
    training split and scores the result on the test split with calculateMSE.

    Parameters:
        num_iterations: number of weight updates per split (default 10000).

    Returns:
        (mse_values, w_values): the test MSE of each split, and the learned
        weights of each split as a plain list.
    '''
    mse_values, w_values = [], []
    for i in range(1, 6):
        train_data = pd.read_csv('{}CandC−train{}.csv'.format(DATA_DIR, i), header=None)
        test_data = pd.read_csv('{}CandC−test{}.csv'.format(DATA_DIR, i), header=None)
        train_data_input, train_data_output, test_data_input, test_data_output, w = createComputationMatrices(train_data, test_data)
        # Work on plain float arrays: NumPy ufuncs applied to DataFrames may
        # align on labels (depending on the pandas version) instead of working
        # positionally, and array arithmetic is much cheaper inside the loop.
        train_inputs = np.asarray(train_data_input, dtype=np.float64)
        train_targets = np.asarray(train_data_output, dtype=np.float64)
        test_inputs = np.asarray(test_data_input, dtype=np.float64)
        test_targets = np.asarray(test_data_output, dtype=np.float64)
        w = np.asarray(w, dtype=np.float64)
        for _ in range(num_iterations):
            residual = np.matmul(w, train_inputs.T) - train_targets
            # The targets form a single row, so dividing by their shape[0]
            # (as this step used to) divides by 1: the gradient is the sum
            # over training rows, not the mean, and STEP_SIZE applies to that
            # scale. Averaging here would change every result.
            gradient = np.matmul(residual, train_inputs)
            w = w - STEP_SIZE * gradient
        mse_values.append(calculateMSE(w, test_inputs, test_targets))
        w_values.append(w.tolist()[0])
    return mse_values, w_values

def performRidgeRegression():
    ''' Compute optimal parameters for lambda and its corresponding mse value.

    For every candidate lambda, fits findWeightedMatrix on each of the five
    training splits in DATA_DIR, scores it on the matching test split with
    calculateMSE, and averages both the MSE and the weights over the splits.

    Returns:
        (mse_values, w_values, lambd_values, best_mse, best_lambda) where
        mse_values[i] and w_values[i] are the split-averaged MSE and weights
        for lambd_values[i], and best_mse / best_lambda describe the candidate
        with the lowest average MSE (the first one on ties).
    '''
    lambd_values = [0.0, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]

    # The splits do not depend on lambda, so read and prepare each one once
    # instead of once per candidate. Plain float arrays keep all later matrix
    # arithmetic positional regardless of how pandas treats NumPy functions.
    splits = []
    for j in range(1, 6):
        train_data = pd.read_csv('{}CandC−train{}.csv'.format(DATA_DIR, j), header=None)
        test_data = pd.read_csv('{}CandC−test{}.csv'.format(DATA_DIR, j), header=None)
        train_data_input, train_data_output, test_data_input, test_data_output, w = createComputationMatrices(train_data, test_data)
        splits.append((
            np.asarray(train_data_input, dtype=np.float64),
            np.asarray(train_data_output, dtype=np.float64),
            np.asarray(test_data_input, dtype=np.float64),
            np.asarray(test_data_output, dtype=np.float64),
            w,
        ))

    best_lambda = 0.0
    # Infinity rather than an arbitrary large number, so the first candidate
    # always becomes the initial best whatever the scale of the errors.
    best_mse = float('inf')
    mse_values = []
    w_values = []
    for lambd in lambd_values:
        split_mses = []
        split_weights = []
        for train_inputs, train_targets, test_inputs, test_targets, initial_w in splits:
            w = np.asarray(findWeightedMatrix(train_inputs, train_targets, initial_w, lambd), dtype=np.float64)
            # calculateMSE expects the weights as a single row.
            split_mses.append(calculateMSE(w.reshape(1, -1), test_inputs, test_targets))
            split_weights.append(w.reshape(-1))
        average_mse = sum(split_mses) / len(split_mses)
        mse_values.append(average_mse)
        # calculate average parameters
        w_values.append(np.mean(split_weights, axis=0).tolist())
        if average_mse < best_mse:
            best_mse = average_mse
            best_lambda = lambd
    return mse_values, w_values, lambd_values, best_mse, best_lambda

# Part 1
print('Part 1 - Preparing Data \n')
# Write the files first, so their location is only announced once they exist.
prepareData()
print('Prepared data can be found in {} folder \n'.format(DATA_DIR))

Part 1 - Preparing Data 

Prepared data can be found in community-crime/ folder 



In [6]:
# Part 2
print('Part 2 - Linear Regression \n')
mse_value, w_values = performRegression()
# Take the number of splits from the results instead of hard-coding 5, so the
# average and the loop stay correct if the number of splits ever changes.
num_splits = len(mse_value)
print('{}-fold cross-validation error: '.format(num_splits), sum(mse_value)/num_splits, '\n')
for set_number, parameters in enumerate(w_values, start=1):
    print('Set {} parameters: '.format(set_number), parameters, '\n')

Part 2 - Linear Regression 

5-fold cross-validation error:  0.580086222335709 

Set 1 parameters:  [-1.409412703708712, 3.1224998542774487, 2.8341485880382424, -1.6121083372412937, -2.7980580026563064, 0.3375569232813019, -0.04290256742433749, 0.16625858351886794, 3.6177432238855194, -2.897714352054146, 4.285119828642319, -3.657235227135549, 0.17890730956902873, 1.2832501191047818, -2.147185945711142, 0.21145537851514784, 0.8144386519012913, -1.9519802634606307, -1.4151357479896685, 0.6459780824752297, -1.9225483634440437, 1.6401398473960231, -0.38274548772166245, 0.07939875773738048, 0.16840163906162212, 0.21195695001109832, 0.20946647833947427, -0.4762202096214533, -1.0640350811682153, -1.364821481461803, 2.0922890010549513, 1.371333478232676, 1.5663328626406121, 0.8073909564146545, 2.4003025457181257, -0.6156236588543672, -1.683768818575126, 0.25905914106218253, 2.2017322077073964, 1.044252128917803, 1.6504224669069107, 1.6687149679970348, -0.2542036081467246, -1.9592850449796886, 

In [10]:
# Part 3
print('Part 3 - Ridge Regression\n')
mse_values, w_values, lambd_values, best_mse, best_lambd = performRidgeRegression()
# One report per candidate lambda: its averaged error, then its averaged weights.
for lambd, average_mse, parameters in zip(lambd_values, mse_values, w_values):
    print('Lambda Value: ', lambd, ', Average MSE: ', average_mse, '\n')
    print('Parameters: ', parameters, '\n')

Part 3 - Ridge Regression

Lambda Value:  0.0 , Average MSE:  1.487605761792865 

Parameters:  [1.7000659238579687, 0.2320676478634061, -0.05009897188505308, 0.18278840236048385, -0.042640023460950485, -0.03810163736685572, 0.05564904105449377, 0.15670789888612421, -0.24850141274897108, -0.1739459605159624, 0.03915866273981219, -0.32147887556090626, 0.050110005526534364, -0.1698394605610499, -0.16304484940903693, 0.03741492145094509, -0.19164155488316875, 0.1345974526508622, 0.026845252571255966, -0.10478798978941284, 0.28329234704940065, 0.0869352423567588, -0.3297874525099759, -0.030820555190710707, -0.036743068890778885, 0.027686300253530888, 0.044482567655122596, 0.029953329293532283, 0.07963970297083804, -0.18704219704583083, -0.10766008022080956, 0.05294386253355125, 0.05595431533638191, 0.004110964877503243, 0.24451331898424558, -0.057539783471023245, -0.004832075849321415, 0.06472324950152294, 0.08461662334174935, 0.4114321471464888, 0.252317699455054, 0.11505306265348789, -0.4