In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

DATA_DIR = 'community-crime/'
STEP_SIZE = 1e-5

def prepareData(seed=None):
    '''Clean the raw data set and write five random train/test splits to DATA_DIR.

    Cleaning: reads ``raw_data.csv`` (no header row), drops the columns
    labelled 0-4, turns every ``'?'`` entry into NaN, casts everything to
    float64 and fills the remaining NaNs with the mean of their column.
    The cleaned table is written to ``cleaned_data.csv``.

    Splitting: for i = 1..5 each row is independently assigned to the
    training file with probability 0.8 and to the test file otherwise.
    The five splits are drawn independently of each other, so they are
    repeated random hold-outs rather than disjoint folds.

    Parameters
    ----------
    seed : int or None, optional
        When given, the split masks are drawn from a private
        ``np.random.RandomState(seed)`` so the files are reproducible.
        When None (default) NumPy's global generator is used, exactly as
        before.

    Returns
    -------
    None. The function only writes CSV files.
    '''
    # clean data
    df = pd.read_csv('{}raw_data.csv'.format(DATA_DIR), header=None)
    df = df.drop([0, 1, 2, 3, 4], axis=1)
    # np.nan (lower case) is used because the np.NaN alias no longer exists in NumPy 2.x.
    df = df.replace('?', np.nan).astype(np.float64)
    df = df.fillna(df.mean())
    df.to_csv('{}cleaned_data.csv'.format(DATA_DIR), index=False, header=False)

    # split data
    rng = np.random if seed is None else np.random.RandomState(seed)
    for i in range(1, 6):
        msk = rng.rand(len(df)) < 0.8
        train = df[msk]
        test = df[~msk]
        # NOTE: the separator in these file names is the non-ASCII minus sign
        # that the reading code also uses; keep both sides spelled identically.
        train.to_csv('{}CandC−train{}.csv'.format(DATA_DIR, i), index=False, header=False)
        test.to_csv('{}CandC−test{}.csv'.format(DATA_DIR, i), index=False, header=False)

def _buildDesignMatrix(data):
    '''Split one data set into (design matrix, target row) without modifying ``data``.'''
    n_cols = data.shape[1]
    # Target: the last column, stored as a single-row frame of shape (1, n_samples).
    target = pd.DataFrame([data.iloc[:, n_cols - 1]])
    # Features: every column except the last, relabelled 1..n_cols-1 so that
    # label 0 is free for the constant (bias) column inserted in front.
    design = data.iloc[:, :n_cols - 1].copy()
    design.columns = range(1, n_cols)
    design.insert(0, 0, np.ones(design.shape[0]))
    return design, target


def createComputationMatrices(train_data, test_data):
    '''Create the matrices needed for the regression computations.

    For both data sets the last column is taken as the output and the
    remaining columns as inputs, with a leading column of ones added so the
    first weight acts as the intercept. The caller's frames are left
    unmodified; new frames are returned.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Samples in rows; the last column is the value to predict.

    Returns
    -------
    tuple
        ``(train_input, train_output, test_input, test_output, w)`` where
        the inputs have shape (n_samples, n_columns), the outputs have shape
        (1, n_samples), and ``w`` is a (1, n_columns) frame of random
        integers in [0, 10) drawn from NumPy's global generator.
    '''
    train_input, train_data_output = _buildDesignMatrix(train_data)
    test_input, test_data_output = _buildDesignMatrix(test_data)
    # create w vector
    w = pd.DataFrame(np.random.randint(low=0, high=10, size=(1, train_input.shape[1])))
    return train_input, train_data_output, test_input, test_data_output, w

def findWeightedMatrix(input_values, output_values, w, lambd):
    '''Closed-form ridge solution ``pinv(X^T X + lambd * I) X^T y``.

    All operands are converted to plain NumPy arrays first, so the
    computation is purely positional and the result is always an
    ``ndarray`` no matter how pandas chooses to dispatch ``np.matmul``.

    Parameters
    ----------
    input_values : array-like, shape (n_samples, n_columns)
        Input matrix X.
    output_values : array-like, shape (1, n_samples)
        Outputs as a single row; transposed internally.
    w : any
        Ignored. Kept only so existing callers keep working.
    lambd : float
        Regularisation strength added to every diagonal entry of X^T X
        (including the entry of the first column).

    Returns
    -------
    numpy.ndarray, shape (n_columns, 1)
        The fitted weights as a column vector.
    '''
    X = np.asarray(input_values, dtype=np.float64)
    y = np.asarray(output_values, dtype=np.float64)
    regularised_gram = np.matmul(X.T, X) + lambd * np.identity(X.shape[1])
    XtY = np.matmul(X.T, y.T)
    # pinv rather than inv: with lambd == 0 the Gram matrix may be singular.
    return np.matmul(np.linalg.pinv(regularised_gram), XtY)

def calculateMSE(w, test_data_input, test_data_output):
    '''Calculate the mean squared error of a linear model on a data set.

    The operands are converted to plain NumPy arrays before any arithmetic.
    pandas aligns DataFrame operands on their labels inside NumPy ufuncs, and
    the prediction row and the target row do not share an index label, so
    doing the subtraction on DataFrames can silently produce NaNs instead of
    the element-wise residual.

    Parameters
    ----------
    w : array-like, shape (1, n_columns)
        Weights as a row vector.
    test_data_input : array-like, shape (n_samples, n_columns)
        Input matrix.
    test_data_output : array-like, shape (1, n_samples)
        Expected outputs as a single row.

    Returns
    -------
    numpy.float64
        Sum of squared residuals divided by the number of residuals.
    '''
    weights = np.asarray(w, dtype=np.float64)
    inputs = np.asarray(test_data_input, dtype=np.float64)
    expected = np.asarray(test_data_output, dtype=np.float64)
    predicted_output = np.matmul(weights, inputs.T)
    squared_error = np.square(predicted_output - expected)
    return np.sum(squared_error) / squared_error.size

def performRegression(n_iterations=10000):
    '''Fit a linear model by gradient descent on each of the five splits.

    For every split i = 1..5 the train/test CSVs are read from DATA_DIR,
    turned into matrices with ``createComputationMatrices``, and the randomly
    initialised weights are updated ``n_iterations`` times with step
    ``STEP_SIZE``. The fitted weights are then scored on the test split with
    ``calculateMSE``.

    The update loop runs on plain NumPy arrays so that the arithmetic is
    positional; doing it on DataFrames lets pandas align operands by label,
    which does not match the row/column layout used here.

    Parameters
    ----------
    n_iterations : int, optional
        Number of gradient steps per split (default 10000).

    Returns
    -------
    (mse_values, w_values)
        ``mse_values`` holds one test MSE per split; ``w_values`` holds the
        matching weight vectors as flat lists of floats.
    '''
    mse_values, w_values = [], []
    for i in range(1, 6):
        train_data = pd.read_csv('{}CandC−train{}.csv'.format(DATA_DIR, i), header=None)
        test_data = pd.read_csv('{}CandC−test{}.csv'.format(DATA_DIR, i), header=None)
        train_data_input, train_data_output, test_data_input, test_data_output, w = createComputationMatrices(train_data, test_data)

        X = np.asarray(train_data_input, dtype=np.float64)
        y = np.asarray(train_data_output, dtype=np.float64)
        weights = np.asarray(w, dtype=np.float64)
        X_t = X.T  # loop-invariant, hoisted out of the update loop

        # NOTE(review): createComputationMatrices returns the output as a single
        # row, so shape[0] is 1 and the gradient below is a sum over samples,
        # not a mean. Left as-is because STEP_SIZE was chosen against it.
        scale = train_data_output.shape[0]
        for _ in range(n_iterations):
            residual = np.matmul(weights, X_t) - y
            gradient = np.matmul(residual, X) / scale
            weights = weights - STEP_SIZE * gradient

        # Hand calculateMSE the same kind of object it received before: a
        # one-row DataFrame with default labels.
        mse = calculateMSE(pd.DataFrame(weights), test_data_input, test_data_output)
        mse_values.append(mse)
        w_values.append(weights.tolist()[0])
    return mse_values, w_values

def performRidgeRegression(lambd_values=None):
    '''Evaluate ridge regression over a grid of lambda values.

    The five train/test splits are read from DATA_DIR once and reused for
    every lambda. For each lambda the closed-form weights are fitted on every
    split with ``findWeightedMatrix`` and scored with ``calculateMSE``; the
    MSEs and the weight vectors are then averaged across the splits.

    Parameters
    ----------
    lambd_values : list of float or None, optional
        Regularisation strengths to try. Defaults to
        ``[0.0, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]``.

    Returns
    -------
    (mse_values, w_values, lambd_values, best_mse, best_lambda)
        ``mse_values[i]`` and ``w_values[i]`` are the split-averaged MSE and
        weights (flat list) for ``lambd_values[i]``; ``best_mse`` /
        ``best_lambda`` identify the entry with the lowest average MSE.
    '''
    if lambd_values is None:
        lambd_values = [0.0, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]

    # The splits do not depend on lambda, so build their matrices a single time
    # instead of re-reading the CSV files for every lambda.
    splits = []
    for j in range(1, 6):
        train_data = pd.read_csv('{}CandC−train{}.csv'.format(DATA_DIR, j), header=None)
        test_data = pd.read_csv('{}CandC−test{}.csv'.format(DATA_DIR, j), header=None)
        splits.append(createComputationMatrices(train_data, test_data))

    best_lambda = 0.0
    # Start from +inf so the first finite average always becomes the incumbent,
    # whatever the scale of the errors is.
    best_mse = float('inf')
    mse_values = []
    w_values = []
    for lambd in lambd_values:
        split_mse = []
        split_w = []
        for train_data_input, train_data_output, test_data_input, test_data_output, w in splits:
            # np.asarray keeps the indexing below valid even if the solver hands
            # back a DataFrame instead of an ndarray.
            fitted = np.asarray(findWeightedMatrix(train_data_input, train_data_output, w, lambd))
            split_mse.append(calculateMSE(fitted.T, test_data_input, test_data_output))
            split_w.append(fitted.reshape(-1))

        average_mse = sum(split_mse) / len(split_mse)
        mse_values.append(average_mse)
        # calculate average parameters (element-wise mean over the splits)
        w_values.append(np.mean(split_w, axis=0).tolist())

        if average_mse < best_mse:
            best_mse = average_mse
            best_lambda = lambd
    return mse_values, w_values, lambd_values, best_mse, best_lambda

# Part 1
# The train/test splits are drawn from NumPy's global random generator.
# Seeding it here makes the files written to DATA_DIR, and every number
# computed from them further down, reproducible on Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

print('Part 1 - Preparing Data \n')
prepareData()
# Announce the location only after the files have actually been written.
print('Prepared data can be found in {} folder \n'.format(DATA_DIR))

Part 1 - Preparing Data 

Prepared data can be found in community-crime/ folder 



In [2]:
# Part 2
# Gradient-descent linear regression: report the error averaged over the
# five splits, then the fitted weight vector of each split.
print('Part 2 - Linear Regression \n')
mse_value, w_values = performRegression()
cv_error = sum(mse_value)/5
print('5-fold cross-validation error: ', cv_error, '\n')
for set_number in range(1, 6):
    print('Set {} parameters: '.format(set_number), w_values[set_number - 1], '\n')

Part 2 - Linear Regression 

5-fold cross-validation error:  0.6337173601828235 

Set 1 parameters:  [-2.702187761679374, -2.4626742952908085, 0.3062492487633089, 1.8819205098203526, 1.8983546979442705, 0.2563482685388005, -0.7017216431087758, -0.20325833559375295, 0.3107992089332969, -0.6834044252325349, 0.2119122357549303, -1.2831047341671034, 0.07974410157570062, -1.1377665472050158, -1.939108999607916, 0.07874528300515334, -2.6596340408269867, -2.7875617677542524, 1.1528151348411162, 2.5688826760640278, 2.837829856038558, 2.9591400658790628, -1.3671735978695956, 0.3171121877493808, -0.04635204915304271, -0.1325365096224043, 0.031059177835283575, -0.5958216039100273, -0.06700877767657087, 2.356954250271612, 2.787795273135069, -2.6856658998412213, 2.052473135491227, -1.391479972337442, 1.5677966222499495, -0.009429168629175057, -0.6805059557874789, 0.7741392439849714, -1.5755167151251535, 0.01749055588790268, 1.0141387859814939, 2.63408461915242, -1.8005383417507044, -3.2899340415761

In [5]:
# Part 3
# Ridge regression: for every lambda tried, show the average MSE and the
# averaged weights, then the lambda with the lowest average MSE.
print('Part 3 - Ridge Regression\n')
ridge_results = performRidgeRegression()
mse_values, w_values, lambd_values, best_mse, best_lambd = ridge_results
for idx, avg_mse in enumerate(mse_values):
    print('Lambda Value: ', lambd_values[idx], ', Average MSE: ', avg_mse, '\n')
    print('Parameters: ', w_values[idx], '\n')
print('Best Lambda Value: ', best_lambd)

Part 3 - Ridge Regression

Lambda Value:  0.0 , Average MSE:  0.41616949165723593 

Parameters:  [1.5286371989987146, 0.07525991808252534, -0.03810852691084939, 0.17524113118169005, -0.05984925905719316, -0.03341682414401607, 0.0449277464010282, 0.12131680107694484, -0.23456676708784646, -0.13465649891489778, 0.04439124652761012, -0.2389369380070853, 0.048296180204264946, -0.2134996470624076, -0.15909509536154204, 0.04759080302046811, -0.18596605651208717, 0.1374042261667491, 0.013365095053209996, -0.10568805164749492, 0.35765258215057366, 0.1365798945136733, -0.4138735053250365, -0.03063957196508474, -0.037417459070917036, 0.023756777629308515, 0.039190124301627, 0.03385172554606371, 0.13859141631306732, -0.18841091003006546, -0.11820013810005611, 0.07927998058888143, 0.032455189023446795, 0.015659494124084662, 0.25223023306622067, -0.06559614052908808, -0.014398422795214083, 0.07213993486234678, 0.14012883975805288, 0.5060189664319504, 0.22443145360211142, 0.24080299389481566, -0.669