In [24]:
import pandas as pd
import seaborn as sns
import numpy as np

from tabulate import tabulate

from sklearn import linear_model, svm, tree
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [25]:
df = pd.read_csv('../merged_data/ready.csv')
df = df.fillna(0)

In [26]:
df.columns
df = df[['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public', 'Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000']]

In [27]:
def train_test_spliter(df, random_seed=0, test_size=0.20, n_splits=1):
    # trainm = preprocessing.maxabs_scale(df, axis=0, copy=True)
    col = df.columns
    df1 = pd.DataFrame(data=df, columns=col)
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_seed)
    for train_index, test_index in rs.split(df):
        pass
    traindf = pd.DataFrame(data=df, index=train_index)
    testdf = pd.DataFrame(data=df, index=test_index)
    return traindf, testdf


traindf, testdf = train_test_spliter(df)

X_train = pd.DataFrame(data=traindf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_train = preprocessing.maxabs_scale(X_train, axis=0, copy=True)

y_train = pd.DataFrame(data=traindf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])
X_test = pd.DataFrame(data=testdf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_test = preprocessing.maxabs_scale(X_test, axis=0, copy=True)

y_test = pd.DataFrame(data=testdf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])

In [28]:
def apply_method(X_train, y_train, X_test, y_test, method, regressor, crime_type):
    reg = regressor()
    reg.fit(X_train, y_train[crime_type])
    pred = reg.predict(X_test)
    result = mean_squared_error(y_test[crime_type], pred)
    return method, crime_type, np.sqrt(result)

In [29]:
methods = {'Linear Regression': linear_model.LinearRegression, 
           'Support Vector Regression': svm.SVR, 
           'Regression Tree': tree.DecisionTreeRegressor, 
           'Random Forest': RandomForestRegressor, 
           'Gradient Boosting': GradientBoostingRegressor}
c_type = ['Assault', 'Break and Enter', 'Homicide', 'Robbery', 'Sexual Assaults', 
          'Theft From Vehicle', 'Theft Of Vehicle', 'Theft Over $5000']

result = []
for ct in c_type:
    print(ct)
    for name in methods:
        result.append(apply_method(X_train, y_train, X_test, y_test, name, methods[name], ct))

Assault
Break and Enter
Homicide
Robbery
Sexual Assaults
Theft From Vehicle
Theft Of Vehicle
Theft Over $5000


In [30]:
print(tabulate(result, headers=['Method', 'Crime Type', 'Error']))

Method                     Crime Type              Error
-------------------------  ------------------  ---------
Support Vector Regression  Assault             1.97045
Regression Tree            Assault             1.36554
Random Forest              Assault             1.37151
Linear Regression          Assault             1.69315
Gradient Boosting          Assault             1.38515
Support Vector Regression  Break and Enter     1.39633
Regression Tree            Break and Enter     1.31765
Random Forest              Break and Enter     1.3141
Linear Regression          Break and Enter     1.33849
Gradient Boosting          Break and Enter     1.30733
Support Vector Regression  Homicide            0.116096
Regression Tree            Homicide            0.0682506
Random Forest              Homicide            0.068492
Linear Regression          Homicide            0.0660757
Gradient Boosting          Homicide            0.0674277
Support Vector Regression  Robbery             0.58602

In [31]:
final = pd.DataFrame(data=result)
final.columns = ['Method', 'Crime Type', 'RMSE']
final

Unnamed: 0,Method,Crime Type,RMSE
0,Support Vector Regression,Assault,1.970452
1,Regression Tree,Assault,1.365542
2,Random Forest,Assault,1.371509
3,Linear Regression,Assault,1.693151
4,Gradient Boosting,Assault,1.385146
5,Support Vector Regression,Break and Enter,1.396328
6,Regression Tree,Break and Enter,1.317646
7,Random Forest,Break and Enter,1.3141
8,Linear Regression,Break and Enter,1.338489
9,Gradient Boosting,Break and Enter,1.307327


In [32]:
gf = final.groupby(by='Crime Type')

In [33]:
gf.mean()

Unnamed: 0_level_0,RMSE
Crime Type,Unnamed: 1_level_1
Assault,1.55716
Break and Enter,1.334778
Homicide,0.077268
Robbery,0.53011
Sexual Assaults,0.438721
Theft From Vehicle,1.914691
Theft Of Vehicle,1.102926
Theft Over $5000,0.278338
