In [1]:
from functools import partial

import numpy as np
import pandas as pd
import seaborn as sns

from tabulate import tabulate

from sklearn import linear_model, svm, tree
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.neural_network import MLPRegressor

In [2]:
# Read the prepared CSV and replace every missing value with 0 in one step.
df = pd.read_csv('../merged_data/ready.csv').fillna(0)

In [3]:
# Restrict the frame to the columns used by the rest of the notebook, in a
# fixed order: the predictor columns first, the crime columns last.
# (Group labels below are inferred from the "No Response(...)" column names.)
df = df[[
    # citizenship
    'Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen', 'Refugee',
    # occupancy status -- presumably of dwellings; confirm against the data source
    'No Longer In Use', 'Occupied', 'Unoccupied',
    # employment / schooling status
    'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
    'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
    'No Response(Employment)', 'Permanently Unable to Work',
    'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
    # marital status
    'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
    'Separated/Divorced', 'Widowed',
    # transportation
    'Bicycle', 'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
    'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
    # school
    'Catholic', 'No Response(School)', 'Public',
    # crime columns (the regression targets further down)
    'Assault', 'Break and Enter', 'Homicide', 'Robbery', 'Sexual Assaults',
    'Theft From Vehicle', 'Theft Of Vehicle', 'Theft Over $5000',
]]

In [4]:
def train_test_spliter(df, random_seed=0, test_size=0.20, n_splits=1):
    """Randomly split the rows of ``df`` into a training and a test frame.

    The split is drawn with ``ShuffleSplit``.  When ``n_splits`` is greater
    than one, every shuffle is generated but only the last one is returned.
    The returned values are exactly those of ``df``; no scaling is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are divided.  Rows are picked by position, so any
        index (not only the default 0..n-1 range) is handled correctly.
    random_seed : int, optional
        Forwarded to ``ShuffleSplit`` as ``random_state``.
    test_size : float or int, optional
        Forwarded to ``ShuffleSplit`` as ``test_size``.
    n_splits : int, optional
        Forwarded to ``ShuffleSplit`` as ``n_splits``.

    Returns
    -------
    (traindf, testdf) : tuple of pandas.DataFrame
        Copies of the selected rows, keeping their original index labels.

    Raises
    ------
    ValueError
        If the splitter produces no split at all.
    """
    # NOTE(review): this function used to build a max-abs scaled copy of `df`
    # that was never used, so callers have always received unscaled values.
    # If scaling is wanted, fit it on the training rows only to avoid leakage.
    splitter = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                            random_state=random_seed)
    splits = list(splitter.split(df))
    if not splits:
        raise ValueError(
            'no train/test split was produced (n_splits={0!r})'.format(n_splits))
    train_index, test_index = splits[-1]

    # ShuffleSplit yields integer positions, not index labels, so select with
    # .iloc; label-based reindexing would give all-NaN rows for any frame
    # whose index is not the default RangeIndex.
    traindf = df.iloc[train_index].copy()
    testdf = df.iloc[test_index].copy()
    return traindf, testdf


# Predictor and target column names, each written down exactly once so the
# train and test selections below cannot drift out of sync.
FEATURE_COLUMNS = [
    'Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
    'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
    'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
    'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
    'No Response(Employment)', 'Permanently Unable to Work',
    'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
    'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
    'Separated/Divorced', 'Widowed', 'Bicycle',
    'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
    'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
    'Catholic', 'No Response(School)', 'Public',
]
TARGET_COLUMNS = [
    'Assault', 'Break and Enter', 'Homicide',
    'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
    'Theft Over $5000',
]

traindf, testdf = train_test_spliter(df)

# Plain column indexing raises KeyError on a misspelled or missing name,
# whereas pd.DataFrame(data=..., columns=...) would quietly add an all-NaN
# column for it.
X_train = traindf[FEATURE_COLUMNS]
y_train = traindf[TARGET_COLUMNS]
X_test = testdf[FEATURE_COLUMNS]
y_test = testdf[TARGET_COLUMNS]

In [5]:
def apply_method(X_train, y_train, X_test, y_test, method, regressor, crime_type):
    """Fit one regressor on one target column and score it on the test data.

    ``regressor`` is called with no arguments to build the model.  The model
    is fitted on ``X_train`` against ``y_train[crime_type]``, asked to predict
    for ``X_test``, and scored with ``mean_squared_error`` against
    ``y_test[crime_type]``.

    Returns
    -------
    tuple
        ``(method, crime_type, error)``; ``method`` is only a label and is
        passed through unchanged.
    """
    model = regressor()
    model.fit(X_train, y_train[crime_type])
    predictions = model.predict(X_test)
    return method, crime_type, mean_squared_error(y_test[crime_type], predictions)

In [6]:
# Seed handed to every estimator that draws random numbers, so that a fresh
# "Restart & Run All" reproduces the same error table.
RANDOM_SEED = 0

# Display name -> callable that builds the regressor.  Estimators that accept
# ``random_state`` are wrapped with ``partial`` to pin it; LinearRegression
# and SVR take no such argument and are used as-is.
methods = {'Linear Regression': linear_model.LinearRegression,
           'Support Vector Regression': svm.SVR,
           'Regression Tree': partial(tree.DecisionTreeRegressor, random_state=RANDOM_SEED),
           'Random Forest': partial(RandomForestRegressor, random_state=RANDOM_SEED),
           'Gradient Boosting': partial(GradientBoostingRegressor, random_state=RANDOM_SEED),
           'Neural Network Multi-layer Perceptron': partial(MLPRegressor, random_state=RANDOM_SEED)}
c_type = ['Assault', 'Break and Enter', 'Homicide', 'Robbery', 'Sexual Assaults',
          'Theft From Vehicle', 'Theft Of Vehicle', 'Theft Over $5000']

# One (method, crime type, error) row for every crime type / method pair.
result = []
for ct in c_type:
    for name, build_regressor in methods.items():
        result.append(apply_method(X_train, y_train, X_test, y_test, name, build_regressor, ct))

In [7]:
# Show the collected (method, crime type, error) rows as a plain-text table.
table_headers = ['Method', 'Crime Type', 'Error']
print(tabulate(result, headers=table_headers))

Method                                 Crime Type                Error
-------------------------------------  ------------------  -----------
Neural Network Multi-layer Perceptron  Assault             20.3441
Linear Regression                      Assault              2.86723
Support Vector Regression              Assault              2.533
Random Forest                          Assault              1.86371
Regression Tree                        Assault              1.86471
Gradient Boosting                      Assault              1.91863
Neural Network Multi-layer Perceptron  Break and Enter     15.316
Linear Regression                      Break and Enter      1.79165
Support Vector Regression              Break and Enter      1.88209
Random Forest                          Break and Enter      1.74259
Regression Tree                        Break and Enter      1.73619
Gradient Boosting                      Break and Enter      1.70911
Neural Network Multi-layer Perceptron  Homicide