In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.multioutput import MultiOutputRegressor

In [2]:
df = pd.read_csv('../merged_data/ready.csv')
df = df.fillna(0)

In [3]:
df.columns
df = df[['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public', 'Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000']]

In [4]:
def train_test_spliter(df, random_seed=0, test_size=0.20, n_splits=1):
    # trainm = preprocessing.maxabs_scale(df, axis=0, copy=True)
    col = df.columns
    df1 = pd.DataFrame(data=df, columns=col)
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_seed)
    for train_index, test_index in rs.split(df):
        pass
    traindf = pd.DataFrame(data=df, index=train_index)
    testdf = pd.DataFrame(data=df, index=test_index)
    return traindf, testdf


traindf, testdf = train_test_spliter(df)

X_train = pd.DataFrame(data=traindf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_train = preprocessing.maxabs_scale(X_train, axis=0, copy=True)

y_train = pd.DataFrame(data=traindf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])
X_test = pd.DataFrame(data=testdf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_test = preprocessing.maxabs_scale(X_test, axis=0, copy=True)

y_test = pd.DataFrame(data=testdf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])

In [5]:
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
linear_result = reg.predict(X_test)
linear_result_train = reg.predict(X_train)
linear_result

array([[  2.75972493e-01,   3.63248189e-01,   6.04786389e-04, ...,
          5.19153888e-01,   2.99171625e-01,   6.51274750e-02],
       [  8.75780264e-01,   5.71679490e-01,   8.81496964e-04, ...,
          1.34451825e+00,   6.68084623e-01,   2.46806914e-02],
       [  2.75972493e-01,   3.63248189e-01,   6.04786389e-04, ...,
          5.19153888e-01,   2.99171625e-01,   6.51274750e-02],
       ..., 
       [  2.45088560e+00,   1.33445664e+00,   8.61866085e-03, ...,
          2.16530139e+00,   1.11014401e+00,   7.27190480e-02],
       [  1.44842898e+00,   1.78036178e+00,   1.63637896e-02, ...,
          2.42116587e+00,   1.44029417e+00,   6.05693583e-02],
       [  2.55351978e+00,   1.21376702e+00,   7.36240344e-03, ...,
          1.78493302e+00,   1.08432143e+00,   3.03916923e-02]])

In [6]:
max_depth = 20

# Random forest with multioutput regressor
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=4,
                                                          random_state=2))
regr_multirf.fit(X_train, y_train)
mo_rf_result = regr_multirf.predict(X_test)
mo_rf_result_train = regr_multirf.predict(X_train)
mo_rf_result

array([[  2.44027535e-01,   4.05982200e-01,   1.24215584e-03, ...,
          5.57186848e-01,   3.62225197e-01,   7.43668074e-02],
       [  1.14187873e+00,   6.82415619e-01,   1.97420508e-03, ...,
          1.60977657e+00,   7.42296875e-01,   3.84326503e-02],
       [  2.44027535e-01,   4.05982200e-01,   1.24215584e-03, ...,
          5.57186848e-01,   3.62225197e-01,   7.43668074e-02],
       ..., 
       [  1.14187873e+00,   9.89590494e-01,   2.46667733e-03, ...,
          2.34907787e+00,   8.57687479e-01,   4.29328681e-02],
       [  1.14187873e+00,   1.34701634e+00,   4.81519949e-03, ...,
          2.36881441e+00,   1.22606102e+00,   4.90408470e-02],
       [  1.97228942e+00,   1.13033846e+00,   1.23366549e-02, ...,
          1.68077129e+00,   1.40915761e+00,   4.22832615e-02]])

In [7]:
# Just random forest
regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=0, max_features='log2')
regr_rf.fit(X_train, y_train)
y_rf = regr_rf.predict(X_test)
y_rf_train = regr_rf.predict(X_train)
y_rf

array([[  2.38299912e-01,   4.22340693e-01,   7.43289206e-04, ...,
          6.01756912e-01,   3.70631341e-01,   7.43517491e-02],
       [  6.35430353e-01,   6.14949890e-01,   0.00000000e+00, ...,
          1.30060807e+00,   1.20684558e-01,   6.89181869e-02],
       [  2.38299912e-01,   4.22340693e-01,   7.43289206e-04, ...,
          6.01756912e-01,   3.70631341e-01,   7.43517491e-02],
       ..., 
       [  9.52030205e-01,   5.27356158e-01,   0.00000000e+00, ...,
          1.70431986e+00,   1.08471462e+00,   1.01077051e-02],
       [  8.83127706e-01,   7.44898565e-01,   0.00000000e+00, ...,
          1.31312452e+00,   1.26709617e+00,   3.50222816e-02],
       [  1.24169225e+00,   6.45315437e-01,   0.00000000e+00, ...,
          1.35422073e+00,   2.31376554e-01,   5.38216795e-02]])

In [8]:
crimes = ['Assault', 'Break and Enter', 'Homicide', 'Robbery', 'Sexual Assaults',
       'Theft From Vehicle', 'Theft Of Vehicle', 'Theft Over $5000']

In [9]:
result1 = []
result1_train = []
for i in range(len(crimes)):
    result1.append([np.sqrt(mean_squared_error(y_test[crimes[i]], linear_result[:,i])), crimes[i], 'Test: Linear Regression'])
    result1_train.append([np.sqrt(mean_squared_error(y_train[crimes[i]], linear_result_train[:,i])), crimes[i], 'Train: Linear Regression'])
result1

[[1.6931604960752744, 'Assault', 'Test: Linear Regression'],
 [1.3384826875062119, 'Break and Enter', 'Test: Linear Regression'],
 [0.06607571655220576, 'Homicide', 'Test: Linear Regression'],
 [0.54111440161810143, 'Robbery', 'Test: Linear Regression'],
 [0.43139676138785576, 'Sexual Assaults', 'Test: Linear Regression'],
 [1.9536388941698706, 'Theft From Vehicle', 'Test: Linear Regression'],
 [1.1055175972417137, 'Theft Of Vehicle', 'Test: Linear Regression'],
 [0.27690759204934789, 'Theft Over $5000', 'Test: Linear Regression']]

In [10]:
result2 = []
result2_train = []
for i in range(len(crimes)):
    result2.append([np.sqrt(mean_squared_error(y_test[crimes[i]], mo_rf_result[:,i])), crimes[i], 'Test: Multi Random Forest with Multioutput Regressor'])
    result2_train.append([np.sqrt(mean_squared_error(y_train[crimes[i]], mo_rf_result_train[:,i])), crimes[i], 'Train: Multi Random Forest with Multioutput Regressor'])
result2

[[1.4855457449408931,
  'Assault',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [1.3331433783677902,
  'Break and Enter',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.067034624731854217,
  'Homicide',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.51440571413021385,
  'Robbery',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.43029289816663757,
  'Sexual Assaults',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [1.9108550136455225,
  'Theft From Vehicle',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [1.1014073974695837,
  'Theft Of Vehicle',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.27660733777378127,
  'Theft Over $5000',
  'Test: Multi Random Forest with Multioutput Regressor']]

In [11]:
result3 = []
result3_train = []
for i in range(len(crimes)):
    result3.append([np.sqrt(mean_squared_error(y_test[crimes[i]], y_rf[:,i])), crimes[i], 'Test: Multi Random Forest'])
    result3_train.append([np.sqrt(mean_squared_error(y_train[crimes[i]], y_rf_train[:,i])), crimes[i], 'Train: Multi Random Forest'])
result3

[[1.3749494517636933, 'Assault', 'Test: Multi Random Forest'],
 [1.3215195307697483, 'Break and Enter', 'Test: Multi Random Forest'],
 [0.068557466888134563, 'Homicide', 'Test: Multi Random Forest'],
 [0.51160384759726607, 'Robbery', 'Test: Multi Random Forest'],
 [0.43601613970991726, 'Sexual Assaults', 'Test: Multi Random Forest'],
 [1.8669324985554929, 'Theft From Vehicle', 'Test: Multi Random Forest'],
 [1.0948030110257101, 'Theft Of Vehicle', 'Test: Multi Random Forest'],
 [0.28107193305949651, 'Theft Over $5000', 'Test: Multi Random Forest']]

In [12]:
result2.extend(result3)
result2_train.extend(result3_train)
# result1.extend(result3)
result2

[[1.4855457449408931,
  'Assault',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [1.3331433783677902,
  'Break and Enter',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.067034624731854217,
  'Homicide',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.51440571413021385,
  'Robbery',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.43029289816663757,
  'Sexual Assaults',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [1.9108550136455225,
  'Theft From Vehicle',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [1.1014073974695837,
  'Theft Of Vehicle',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [0.27660733777378127,
  'Theft Over $5000',
  'Test: Multi Random Forest with Multioutput Regressor'],
 [1.3749494517636933, 'Assault', 'Test: Multi Random Forest'],
 [1.3215195307697483, 'Break and Enter', 'Test: Multi Random Forest'],
 [0.068557466888134563, 'Homicide', 'Test: Multi Random Forest'

In [13]:
final = pd.DataFrame(data=result2)
final.columns = ['RMSE', 'Crime Type', 'Method']
final.sort_values(by=['Crime Type'], inplace=True)
final

Unnamed: 0,RMSE,Crime Type,Method
0,1.485546,Assault,Test: Multi Random Forest with Multioutput Reg...
8,1.374949,Assault,Test: Multi Random Forest
1,1.333143,Break and Enter,Test: Multi Random Forest with Multioutput Reg...
9,1.32152,Break and Enter,Test: Multi Random Forest
2,0.067035,Homicide,Test: Multi Random Forest with Multioutput Reg...
10,0.068557,Homicide,Test: Multi Random Forest
3,0.514406,Robbery,Test: Multi Random Forest with Multioutput Reg...
11,0.511604,Robbery,Test: Multi Random Forest
4,0.430293,Sexual Assaults,Test: Multi Random Forest with Multioutput Reg...
12,0.436016,Sexual Assaults,Test: Multi Random Forest


In [14]:
# Linear Regression
final1 = [x[0] for x in result1]
final1

[1.6931604960752744,
 1.3384826875062119,
 0.06607571655220576,
 0.54111440161810143,
 0.43139676138785576,
 1.9536388941698706,
 1.1055175972417137,
 0.27690759204934789]

In [15]:
# Linear Regression train
final1_train = [x[0] for x in result1_train]
final1_train

[1.7879716351256758,
 1.3994737736305458,
 0.068929110845428077,
 0.55429691253627422,
 0.40639756285775003,
 1.9035134804485057,
 1.1489845830377428,
 0.29773143656868267]

In [16]:
# Multitask Random Forest (with Multioutput Regressor)
final2 = [x[0] for x in result2]
final2[:8]

[1.4855457449408931,
 1.3331433783677902,
 0.067034624731854217,
 0.51440571413021385,
 0.43029289816663757,
 1.9108550136455225,
 1.1014073974695837,
 0.27660733777378127]

In [17]:
# Multitask Random Forest (with Multioutput Regressor) train
final2_train = [x[0] for x in result2_train]
final2_train[:8]

[1.5621251064209223,
 1.3692983258273925,
 0.068124536150459103,
 0.53287771373628612,
 0.39259163078858234,
 1.8680287034845902,
 1.1350660127129353,
 0.29480147269065327]

In [18]:
# Multitask Random Forest
final3 = [x[0] for x in result3]
final3

[1.3749494517636933,
 1.3215195307697483,
 0.068557466888134563,
 0.51160384759726607,
 0.43601613970991726,
 1.8669324985554929,
 1.0948030110257101,
 0.28107193305949651]

In [19]:
# Multitask Random Forest train
final3_train = [x[0] for x in result3_train]
final3_train

[1.4105436695502018,
 1.3110160636988262,
 0.067460996812193433,
 0.50515335515248738,
 0.38141847762168168,
 1.7689129601816131,
 1.0838735953132739,
 0.29083826598015994]