In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from tabulate import tabulate

from sklearn import linear_model, svm, tree
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor

In [2]:
df = pd.read_csv('../merged_data/ready.csv')
df = df.fillna(0)

In [3]:
df.columns
df = df[['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public', 'Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000']]

In [4]:
def train_test_spliter(df, random_seed=0, test_size=0.20, n_splits=1):
    # trainm = preprocessing.maxabs_scale(df, axis=0, copy=True)
    col = df.columns
    df1 = pd.DataFrame(data=df, columns=col)
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_seed)
    for train_index, test_index in rs.split(df):
        pass
    traindf = pd.DataFrame(data=df, index=train_index)
    testdf = pd.DataFrame(data=df, index=test_index)
    return traindf, testdf


traindf, testdf = train_test_spliter(df)

X_train = pd.DataFrame(data=traindf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_train = preprocessing.maxabs_scale(X_train, axis=0, copy=True)

y_train = pd.DataFrame(data=traindf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])
X_test = pd.DataFrame(data=testdf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_test = preprocessing.maxabs_scale(X_test, axis=0, copy=True)

y_test = pd.DataFrame(data=testdf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])

In [5]:
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
linear_result = reg.predict(X_test)
linear_result

array([[  2.75972493e-01,   3.63248189e-01,   6.04786389e-04, ...,
          5.19153888e-01,   2.99171625e-01,   6.51274750e-02],
       [  8.75780264e-01,   5.71679490e-01,   8.81496964e-04, ...,
          1.34451825e+00,   6.68084623e-01,   2.46806914e-02],
       [  2.75972493e-01,   3.63248189e-01,   6.04786389e-04, ...,
          5.19153888e-01,   2.99171625e-01,   6.51274750e-02],
       ..., 
       [  2.45088560e+00,   1.33445664e+00,   8.61866085e-03, ...,
          2.16530139e+00,   1.11014401e+00,   7.27190480e-02],
       [  1.44842898e+00,   1.78036178e+00,   1.63637896e-02, ...,
          2.42116587e+00,   1.44029417e+00,   6.05693583e-02],
       [  2.55351978e+00,   1.21376702e+00,   7.36240344e-03, ...,
          1.78493302e+00,   1.08432143e+00,   3.03916923e-02]])

In [6]:
max_depth = 30

# Random forest with multioutput regressor
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth,
                                                          random_state=0))
regr_multirf.fit(X_train, y_train)
mo_rf_result = regr_multirf.predict(X_test)
mo_rf_result

array([[  2.38299912e-01,   4.22340693e-01,   7.43289206e-04, ...,
          6.01756912e-01,   3.70631341e-01,   7.43517491e-02],
       [  6.43424320e-01,   6.04844309e-01,   0.00000000e+00, ...,
          1.29759148e+00,   1.18874603e-01,   6.63540843e-02],
       [  2.38299912e-01,   4.22340693e-01,   7.43289206e-04, ...,
          6.01756912e-01,   3.70631341e-01,   7.43517491e-02],
       ..., 
       [  9.80072183e-01,   5.23478776e-01,   0.00000000e+00, ...,
          1.67352726e+00,   1.11632191e+00,   0.00000000e+00],
       [  8.83127706e-01,   7.44898565e-01,   0.00000000e+00, ...,
          1.31312452e+00,   1.26709617e+00,   3.50222816e-02],
       [  1.21934278e+00,   6.64552857e-01,   0.00000000e+00, ...,
          1.31954768e+00,   1.55810841e-01,   7.00220423e-02]])

In [7]:
# Just random forest
regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=2)
regr_rf.fit(X_train, y_train)
y_rf = regr_rf.predict(X_test)
y_rf

array([[  2.41442322e-01,   4.21503476e-01,   7.58254153e-04, ...,
          6.00259988e-01,   3.72760487e-01,   7.39819741e-02],
       [  6.60340933e-01,   5.57940072e-01,   0.00000000e+00, ...,
          1.10763895e+00,   1.54429687e-01,   1.16819377e-01],
       [  2.41442322e-01,   4.21503476e-01,   7.58254153e-04, ...,
          6.00259988e-01,   3.72760487e-01,   7.39819741e-02],
       ..., 
       [  1.18897422e+00,   5.84820686e-01,   0.00000000e+00, ...,
          1.49640430e+00,   1.07140355e+00,   0.00000000e+00],
       [  9.62448107e-01,   7.97034493e-01,   0.00000000e+00, ...,
          1.13186508e+00,   1.31536477e+00,   3.80097680e-02],
       [  1.68639196e+00,   7.12649739e-01,   0.00000000e+00, ...,
          1.12166338e+00,   1.50475302e-01,   3.78006880e-02]])

In [8]:
crimes = ['Assault', 'Break and Enter', 'Homicide', 'Robbery', 'Sexual Assaults',
       'Theft From Vehicle', 'Theft Of Vehicle', 'Theft Over $5000']

In [9]:
result1 = []
for i in range(len(crimes)):
    result1.append([np.sqrt(mean_squared_error(y_test[crimes[i]], linear_result[:,i])), crimes[i], 'Linear Regression'])
    
result1

[[1.6931604960752744, 'Assault', 'Linear Regression'],
 [1.3384826875062119, 'Break and Enter', 'Linear Regression'],
 [0.06607571655220576, 'Homicide', 'Linear Regression'],
 [0.54111440161810143, 'Robbery', 'Linear Regression'],
 [0.43139676138785576, 'Sexual Assaults', 'Linear Regression'],
 [1.9536388941698706, 'Theft From Vehicle', 'Linear Regression'],
 [1.1055175972417137, 'Theft Of Vehicle', 'Linear Regression'],
 [0.27690759204934789, 'Theft Over $5000', 'Linear Regression']]

In [10]:
result2 = []
for i in range(len(crimes)):
    result2.append([np.sqrt(mean_squared_error(y_test[crimes[i]], mo_rf_result[:,i])), crimes[i], 'Multi Random Forest'])

result2

[[1.3750695312113566, 'Assault', 'Multi Random Forest'],
 [1.3218228413417059, 'Break and Enter', 'Multi Random Forest'],
 [0.068569793875729257, 'Homicide', 'Multi Random Forest'],
 [0.51176069204282626, 'Robbery', 'Multi Random Forest'],
 [0.43615848699521853, 'Sexual Assaults', 'Multi Random Forest'],
 [1.867185107457938, 'Theft From Vehicle', 'Multi Random Forest'],
 [1.0949743872334345, 'Theft Of Vehicle', 'Multi Random Forest'],
 [0.28117286233197142, 'Theft Over $5000', 'Multi Random Forest']]

In [11]:
result3 = []
for i in range(len(crimes)):
    result3.append([np.sqrt(mean_squared_error(y_test[crimes[i]], mo_rf_result[:,i])), crimes[i], 'Multi Random Forest'])
    
result3

[[1.3750695312113566, 'Assault', 'Multi Random Forest'],
 [1.3218228413417059, 'Break and Enter', 'Multi Random Forest'],
 [0.068569793875729257, 'Homicide', 'Multi Random Forest'],
 [0.51176069204282626, 'Robbery', 'Multi Random Forest'],
 [0.43615848699521853, 'Sexual Assaults', 'Multi Random Forest'],
 [1.867185107457938, 'Theft From Vehicle', 'Multi Random Forest'],
 [1.0949743872334345, 'Theft Of Vehicle', 'Multi Random Forest'],
 [0.28117286233197142, 'Theft Over $5000', 'Multi Random Forest']]

In [12]:
result2.extend(result3)
# result1.extend(result3)
result2

[[1.3750695312113566, 'Assault', 'Multi Random Forest'],
 [1.3218228413417059, 'Break and Enter', 'Multi Random Forest'],
 [0.068569793875729257, 'Homicide', 'Multi Random Forest'],
 [0.51176069204282626, 'Robbery', 'Multi Random Forest'],
 [0.43615848699521853, 'Sexual Assaults', 'Multi Random Forest'],
 [1.867185107457938, 'Theft From Vehicle', 'Multi Random Forest'],
 [1.0949743872334345, 'Theft Of Vehicle', 'Multi Random Forest'],
 [0.28117286233197142, 'Theft Over $5000', 'Multi Random Forest'],
 [1.3750695312113566, 'Assault', 'Multi Random Forest'],
 [1.3218228413417059, 'Break and Enter', 'Multi Random Forest'],
 [0.068569793875729257, 'Homicide', 'Multi Random Forest'],
 [0.51176069204282626, 'Robbery', 'Multi Random Forest'],
 [0.43615848699521853, 'Sexual Assaults', 'Multi Random Forest'],
 [1.867185107457938, 'Theft From Vehicle', 'Multi Random Forest'],
 [1.0949743872334345, 'Theft Of Vehicle', 'Multi Random Forest'],
 [0.28117286233197142, 'Theft Over $5000', 'Multi Rando

In [13]:
final = pd.DataFrame(data=result2)
final.columns = ['RMSE', 'Crime Type', 'Method']
final

Unnamed: 0,RMSE,Crime Type,Method
0,1.37507,Assault,Multi Random Forest
1,1.321823,Break and Enter,Multi Random Forest
2,0.06857,Homicide,Multi Random Forest
3,0.511761,Robbery,Multi Random Forest
4,0.436158,Sexual Assaults,Multi Random Forest
5,1.867185,Theft From Vehicle,Multi Random Forest
6,1.094974,Theft Of Vehicle,Multi Random Forest
7,0.281173,Theft Over $5000,Multi Random Forest
8,1.37507,Assault,Multi Random Forest
9,1.321823,Break and Enter,Multi Random Forest


In [14]:
gf = final.groupby(by='Crime Type')

In [15]:
gf.mean()

Unnamed: 0_level_0,RMSE
Crime Type,Unnamed: 1_level_1
Assault,1.37507
Break and Enter,1.321823
Homicide,0.06857
Robbery,0.511761
Sexual Assaults,0.436158
Theft From Vehicle,1.867185
Theft Of Vehicle,1.094974
Theft Over $5000,0.281173
