In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from tabulate import tabulate

from sklearn import linear_model, svm, tree
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [2]:
df = pd.read_csv('../merged_data/ready.csv')
df = df.fillna(0)

In [3]:
df.columns
df = df[['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public', 'Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000']]

In [4]:
def train_test_spliter(df, random_seed=0, test_size=0.20, n_splits=1):
    # trainm = preprocessing.maxabs_scale(df, axis=0, copy=True)
    col = df.columns
    df1 = pd.DataFrame(data=df, columns=col)
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_seed)
    for train_index, test_index in rs.split(df):
        pass
    traindf = pd.DataFrame(data=df, index=train_index)
    testdf = pd.DataFrame(data=df, index=test_index)
    return traindf, testdf


traindf, testdf = train_test_spliter(df)

X_train = pd.DataFrame(data=traindf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_train = preprocessing.maxabs_scale(X_train, axis=0, copy=True)

y_train = pd.DataFrame(data=traindf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])
X_test = pd.DataFrame(data=testdf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_test = preprocessing.maxabs_scale(X_test, axis=0, copy=True)

y_test = pd.DataFrame(data=testdf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])

In [5]:
def apply_method(X_train, y_train, X_test, y_test, method, regressor, crime_type):
    reg = regressor()
    reg.fit(X_train, y_train[crime_type])
    pred = reg.predict(X_test)
    result = mean_squared_error(y_test[crime_type], pred)
    return method, crime_type, np.sqrt(result)

In [6]:
methods = {'Linear Regression': linear_model.LinearRegression, 
           'Support Vector Regression': svm.SVR, 
           'Regression Tree': tree.DecisionTreeRegressor, 
           'Random Forest': RandomForestRegressor, 
           'Gradient Boosting': GradientBoostingRegressor}
c_type = ['Assault', 'Break and Enter', 'Homicide', 'Robbery', 'Sexual Assaults', 
          'Theft From Vehicle', 'Theft Of Vehicle', 'Theft Over $5000']

result = []
for ct in c_type:
    print(ct)
    for name in methods:
        result.append(apply_method(X_train, y_train, X_test, y_test, name, methods[name], ct))

Assault
Break and Enter
Homicide
Robbery
Sexual Assaults
Theft From Vehicle
Theft Of Vehicle
Theft Over $5000


In [7]:
print(tabulate(result, headers=['Method', 'Crime Type', 'Error']))

Method                     Crime Type              Error
-------------------------  ------------------  ---------
Gradient Boosting          Assault             1.38515
Regression Tree            Assault             1.36554
Linear Regression          Assault             1.69315
Random Forest              Assault             1.37402
Support Vector Regression  Assault             1.97045
Gradient Boosting          Break and Enter     1.30733
Regression Tree            Break and Enter     1.31765
Linear Regression          Break and Enter     1.33849
Random Forest              Break and Enter     1.32313
Support Vector Regression  Break and Enter     1.39633
Gradient Boosting          Homicide            0.0674277
Regression Tree            Homicide            0.0682506
Linear Regression          Homicide            0.0660757
Random Forest              Homicide            0.0684531
Support Vector Regression  Homicide            0.116096
Gradient Boosting          Robbery             0.506

In [8]:
final = pd.DataFrame(data=result)
final.columns = ['Method', 'Crime Type', 'RMSE']
final

Unnamed: 0,Method,Crime Type,RMSE
0,Gradient Boosting,Assault,1.385146
1,Regression Tree,Assault,1.365542
2,Linear Regression,Assault,1.693151
3,Random Forest,Assault,1.374019
4,Support Vector Regression,Assault,1.970452
5,Gradient Boosting,Break and Enter,1.307327
6,Regression Tree,Break and Enter,1.317646
7,Linear Regression,Break and Enter,1.338489
8,Random Forest,Break and Enter,1.323127
9,Support Vector Regression,Break and Enter,1.396328


In [9]:
gf = final.groupby(by='Crime Type')

In [10]:
gf.mean()

Unnamed: 0_level_0,RMSE
Crime Type,Unnamed: 1_level_1
Assault,1.557662
Break and Enter,1.336583
Homicide,0.077261
Robbery,0.530839
Sexual Assaults,0.438913
Theft From Vehicle,1.915866
Theft Of Vehicle,1.102575
Theft Over $5000,0.278992
