In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.multioutput import MultiOutputRegressor

In [2]:
df = pd.read_csv('../merged_data/ready.csv')
df = df.fillna(0)

In [3]:
df.columns
df = df[['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public', 'Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000']]

In [4]:
def train_test_spliter(df, random_seed=0, test_size=0.20, n_splits=1):
    # trainm = preprocessing.maxabs_scale(df, axis=0, copy=True)
    col = df.columns
    df1 = pd.DataFrame(data=df, columns=col)
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_seed)
    for train_index, test_index in rs.split(df):
        pass
    traindf = pd.DataFrame(data=df, index=train_index)
    testdf = pd.DataFrame(data=df, index=test_index)
    return traindf, testdf


traindf, testdf = train_test_spliter(df)

X_train = pd.DataFrame(data=traindf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_train = preprocessing.maxabs_scale(X_train, axis=0, copy=True)

y_train = pd.DataFrame(data=traindf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])
X_test = pd.DataFrame(data=testdf, columns=['Canadian Citizen', 'No Response(Citizen)', 'Non-Canadian Citizen',
       'Refugee', 'No Longer In Use', 'Occupied', 'Unoccupied',
       'Employed 0-30 Hours', 'Employed 30+ Hours', 'Gr.10 - Gr.12',
       'Gr.7 - Gr.9', 'Homemaker', 'Kindergarten - Gr.6',
       'No Response(Employment)', 'Permanently Unable to Work',
       'Post Secondary Student', 'Preschool', 'Retired', 'Unemployed',
       'Common Law', 'Married', 'Never Married', 'No Response(Marital)',
       'Separated/Divorced', 'Widowed', 'Bicycle',
       'Car/Truck/Van (as Driver)', 'Car/Truck/Van (as Passenger)',
       'No Response(Transportation)', 'Other', 'Public Transit', 'Walk',
       'Catholic', 'No Response(School)', 'Public'])
X_test = preprocessing.maxabs_scale(X_test, axis=0, copy=True)

y_test = pd.DataFrame(data=testdf, columns=['Assault', 'Break and Enter', 'Homicide',
       'Robbery', 'Sexual Assaults', 'Theft From Vehicle', 'Theft Of Vehicle',
       'Theft Over $5000'])

In [5]:
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
linear_result = reg.predict(X_test)
linear_result

array([[  2.75972493e-01,   3.63248189e-01,   6.04786389e-04, ...,
          5.19153888e-01,   2.99171625e-01,   6.51274750e-02],
       [  8.75780264e-01,   5.71679490e-01,   8.81496964e-04, ...,
          1.34451825e+00,   6.68084623e-01,   2.46806914e-02],
       [  2.75972493e-01,   3.63248189e-01,   6.04786389e-04, ...,
          5.19153888e-01,   2.99171625e-01,   6.51274750e-02],
       ..., 
       [  2.45088560e+00,   1.33445664e+00,   8.61866085e-03, ...,
          2.16530139e+00,   1.11014401e+00,   7.27190480e-02],
       [  1.44842898e+00,   1.78036178e+00,   1.63637896e-02, ...,
          2.42116587e+00,   1.44029417e+00,   6.05693583e-02],
       [  2.55351978e+00,   1.21376702e+00,   7.36240344e-03, ...,
          1.78493302e+00,   1.08432143e+00,   3.03916923e-02]])

In [6]:
max_depth = 12

# Random forest with multioutput regressor
regr_multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth,
                                                          random_state=2))
regr_multirf.fit(X_train, y_train)
mo_rf_result = regr_multirf.predict(X_test)
mo_rf_result

array([[  2.41437084e-01,   4.22024327e-01,   7.51598453e-04, ...,
          6.00742877e-01,   3.72825375e-01,   7.39840861e-02],
       [  6.96705706e-01,   5.76990006e-01,   2.19780220e-05, ...,
          1.17890319e+00,   2.02084398e-01,   6.67248524e-02],
       [  2.41437084e-01,   4.22024327e-01,   7.51598453e-04, ...,
          6.00742877e-01,   3.72825375e-01,   7.39840861e-02],
       ..., 
       [  1.15334204e+00,   6.41255287e-01,   1.77777778e-04, ...,
          1.48240317e+00,   1.00876521e+00,   1.75959508e-02],
       [  1.09123066e+00,   8.19320113e-01,   2.19780220e-05, ...,
          1.24414864e+00,   1.26124672e+00,   3.38772315e-02],
       [  1.70288922e+00,   7.13111101e-01,   5.83090379e-04, ...,
          1.27160137e+00,   2.36036920e-01,   4.35247253e-02]])

In [7]:
# Just random forest
regr_rf = RandomForestRegressor(max_depth=max_depth, random_state=2)
regr_rf.fit(X_train, y_train)
y_rf = regr_rf.predict(X_test)
y_rf

array([[  2.41568307e-01,   4.21405097e-01,   7.56805855e-04, ...,
          6.00331115e-01,   3.72798732e-01,   7.40563712e-02],
       [  6.39481037e-01,   6.46047387e-01,   2.59893238e-03, ...,
          1.19918256e+00,   5.10671818e-01,   3.87174632e-02],
       [  2.41568307e-01,   4.21405097e-01,   7.56805855e-04, ...,
          6.00331115e-01,   3.72798732e-01,   7.40563712e-02],
       ..., 
       [  1.09123763e+00,   6.83317137e-01,   4.46944215e-03, ...,
          1.49262588e+00,   9.36387477e-01,   8.57222601e-03],
       [  1.17132908e+00,   8.94383026e-01,   2.35610315e-03, ...,
          1.63071908e+00,   1.10231629e+00,   4.91743026e-02],
       [  1.44236828e+00,   7.92838260e-01,   7.00751199e-04, ...,
          1.22696912e+00,   4.39653141e-01,   3.17773256e-02]])

In [8]:
crimes = ['Assault', 'Break and Enter', 'Homicide', 'Robbery', 'Sexual Assaults',
       'Theft From Vehicle', 'Theft Of Vehicle', 'Theft Over $5000']

In [9]:
result1 = []
for i in range(len(crimes)):
    result1.append([np.sqrt(mean_squared_error(y_test[crimes[i]], linear_result[:,i])), crimes[i], 'Linear Regression'])
    
result1

[[1.6931604960752744, 'Assault', 'Linear Regression'],
 [1.3384826875062119, 'Break and Enter', 'Linear Regression'],
 [0.06607571655220576, 'Homicide', 'Linear Regression'],
 [0.54111440161810143, 'Robbery', 'Linear Regression'],
 [0.43139676138785576, 'Sexual Assaults', 'Linear Regression'],
 [1.9536388941698706, 'Theft From Vehicle', 'Linear Regression'],
 [1.1055175972417137, 'Theft Of Vehicle', 'Linear Regression'],
 [0.27690759204934789, 'Theft Over $5000', 'Linear Regression']]

In [10]:
result2 = []
for i in range(len(crimes)):
    result2.append([np.sqrt(mean_squared_error(y_test[crimes[i]], mo_rf_result[:,i])), crimes[i], 'Multi Random Forest (with Multioutput Regressor)'])

result2

[[1.3725428445838139,
  'Assault',
  'Multi Random Forest (with Multioutput Regressor)'],
 [1.3205749627932026,
  'Break and Enter',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.068467776053308116,
  'Homicide',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.50850471996035596,
  'Robbery',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.4334847631668296,
  'Sexual Assaults',
  'Multi Random Forest (with Multioutput Regressor)'],
 [1.8575913465908769,
  'Theft From Vehicle',
  'Multi Random Forest (with Multioutput Regressor)'],
 [1.0899324120453286,
  'Theft Of Vehicle',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.27814700465064562,
  'Theft Over $5000',
  'Multi Random Forest (with Multioutput Regressor)']]

In [11]:
result3 = []
for i in range(len(crimes)):
    result3.append([np.sqrt(mean_squared_error(y_test[crimes[i]], y_rf[:,i])), crimes[i], 'Multi Random Forest'])
    
result3

[[1.3722736112950127, 'Assault', 'Multi Random Forest'],
 [1.3166119071268452, 'Break and Enter', 'Multi Random Forest'],
 [0.067863070698364447, 'Homicide', 'Multi Random Forest'],
 [0.5063804179509549, 'Robbery', 'Multi Random Forest'],
 [0.43201057428284545, 'Sexual Assaults', 'Multi Random Forest'],
 [1.8515625287723361, 'Theft From Vehicle', 'Multi Random Forest'],
 [1.0831639940216897, 'Theft Of Vehicle', 'Multi Random Forest'],
 [0.27764217530543728, 'Theft Over $5000', 'Multi Random Forest']]

In [12]:
result2.extend(result3)
# result1.extend(result3)
result2

[[1.3725428445838139,
  'Assault',
  'Multi Random Forest (with Multioutput Regressor)'],
 [1.3205749627932026,
  'Break and Enter',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.068467776053308116,
  'Homicide',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.50850471996035596,
  'Robbery',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.4334847631668296,
  'Sexual Assaults',
  'Multi Random Forest (with Multioutput Regressor)'],
 [1.8575913465908769,
  'Theft From Vehicle',
  'Multi Random Forest (with Multioutput Regressor)'],
 [1.0899324120453286,
  'Theft Of Vehicle',
  'Multi Random Forest (with Multioutput Regressor)'],
 [0.27814700465064562,
  'Theft Over $5000',
  'Multi Random Forest (with Multioutput Regressor)'],
 [1.3722736112950127, 'Assault', 'Multi Random Forest'],
 [1.3166119071268452, 'Break and Enter', 'Multi Random Forest'],
 [0.067863070698364447, 'Homicide', 'Multi Random Forest'],
 [0.5063804179509549, 'Robbery', 'Multi Random F

In [13]:
final = pd.DataFrame(data=result2)
final.columns = ['RMSE', 'Crime Type', 'Method']
final

Unnamed: 0,RMSE,Crime Type,Method
0,1.372543,Assault,Multi Random Forest (with Multioutput Regressor)
1,1.320575,Break and Enter,Multi Random Forest (with Multioutput Regressor)
2,0.068468,Homicide,Multi Random Forest (with Multioutput Regressor)
3,0.508505,Robbery,Multi Random Forest (with Multioutput Regressor)
4,0.433485,Sexual Assaults,Multi Random Forest (with Multioutput Regressor)
5,1.857591,Theft From Vehicle,Multi Random Forest (with Multioutput Regressor)
6,1.089932,Theft Of Vehicle,Multi Random Forest (with Multioutput Regressor)
7,0.278147,Theft Over $5000,Multi Random Forest (with Multioutput Regressor)
8,1.372274,Assault,Multi Random Forest
9,1.316612,Break and Enter,Multi Random Forest
