In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
%matplotlib inline
pd.options.display.float_format = "{:.8f}".format

import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internalgelsd")

import math
from IPython.display import display
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

sns.set_style("white")


In [34]:
df = pd.read_csv("df_crime")

In [35]:
dftest = pd.read_csv("dftest_crime")

### Validating the Regression Model

In [36]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,City,Population,Violent Crime,Murder and Manslaughter,Rape - revised,Rape - legacy,Robbery,Aggravated Assault,...,Larceny,Motor Vehicle Theft,Arson,MurderCat,Pop Squared,RobberyCat,ArsonCat,CarTheftCat,Predicted,Residual
0,0,0,Adams Village,1861.0,0,0.0,,0,0.0,0,...,10,0.0,0.0,0.0,3463321.0,0.0,0,0,-18.80897778,-30.80897778
1,1,1,Addison Town and Village,2577.0,3,0.0,,0,0.0,3,...,20,1.0,0.0,0.0,6640929.0,0.0,0,1,-7.36603204,-31.36603204
2,2,2,Akron Village,2846.0,3,0.0,,0,0.0,3,...,15,0.0,0.0,0.0,8099716.0,0.0,0,0,6.6846667,-9.3153333
3,3,4,Albion Village,6388.0,23,0.0,,3,4.0,16,...,165,5.0,,0.0,40806544.0,1.0,0,1,80.03813105,-142.96186895
4,4,5,Alfred Village,4089.0,5,0.0,,0,3.0,2,...,36,0.0,,0.0,16719921.0,1.0,0,0,27.62418104,-18.37581896


In [37]:
dftest.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,City,Population,Violent Crime,Murder and Manslaughter,Rape - revised,Rape - legacy,Robbery,Aggravated Assault,PropertyCrime,Burglary,Larceny,Motor Vehicle Theft,Arson,MurderCat,RobberyCat,ArsonCat,CarTheftCat
0,0,0,Adams Village,1851.0,0.0,0.0,,0.0,0.0,0.0,11.0,1.0,10.0,0.0,0.0,0.0,0.0,0,0
1,1,1,Addison Town and Village,2568.0,2.0,0.0,,0.0,1.0,1.0,49.0,1.0,47.0,1.0,0.0,0.0,1.0,0,1
2,2,2,Afton Village4,820.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
3,3,3,Akron Village,2842.0,1.0,0.0,,0.0,0.0,1.0,17.0,0.0,17.0,0.0,0.0,0.0,0.0,0,0
4,4,4,Albany4,98595.0,802.0,8.0,54.0,,237.0,503.0,3888.0,683.0,3083.0,122.0,12.0,1.0,2.0,1,2


In [38]:
linear_formula = "PropertyCrime ~ Population+MurderCat+RobberyCat+CarTheftCat+ArsonCat"

#lm = smf.ols(formula=linear_formula, data=df).fit()

In [39]:
lm = smf.ols(formula=linear_formula, data=df).fit()

In [40]:
lm.params


Intercept     -61.89739300
Population      0.01775231
MurderCat      -4.35042795
RobberyCat     88.37490712
CarTheftCat    19.63899835
ArsonCat       98.88962120
dtype: float64

In [41]:
lm.pvalues
#MurderCat and CarTheftCat are both above a value of .05, so they are probably not having a significant impact on the model

Intercept     0.00004386
Population    0.00000000
MurderCat     0.89342920
RobberyCat    0.00000349
CarTheftCat   0.30366764
ArsonCat      0.00682042
dtype: float64

In [42]:
lm.rsquared

0.7974572096661988

In [43]:
linear_formula = "PropertyCrime ~ Population+RobberyCat+ArsonCat"

In [44]:
lm = smf.ols(formula=linear_formula, data=df).fit()

In [45]:
lm.params

Intercept    -53.92756985
Population     0.01801319
RobberyCat    95.80392363
ArsonCat     101.94625217
dtype: float64

In [46]:
lm.rsquared
#removing those two parameters did not affect the rsquared value

0.7967955533644241

In [47]:
dftest.dtypes

Unnamed: 0                   int64
Unnamed: 0.1                 int64
City                        object
Population                 float64
Violent Crime              float64
Murder and Manslaughter    float64
Rape - revised             float64
Rape - legacy              float64
Robbery                    float64
Aggravated Assault         float64
PropertyCrime              float64
Burglary                   float64
Larceny                    float64
Motor Vehicle Theft        float64
Arson                      float64
MurderCat                  float64
RobberyCat                 float64
ArsonCat                     int64
CarTheftCat                  int64
dtype: object

In [64]:
model1 = "PropertyCrime ~ Population+RobberyCat+ArsonCat"

lm = smf.ols(formula=model1, data=dftest).fit()

In [65]:
lm.params

Intercept    -88.62122513
Population     0.01602282
RobberyCat   200.57681150
ArsonCat     242.86279495
dtype: float64

In [66]:
lm.pvalues

Intercept    0.03443675
Population   0.00000000
RobberyCat   0.00002710
ArsonCat     0.00850394
dtype: float64

In [50]:
lm.rsquared

0.9933762819548052

In [51]:
#testing a model with more continuous features (as opposed to so many caterical features)

model2 = "PropertyCrime ~ Population+Robbery+Arson"

lm = smf.ols(formula=model2, data=df).fit()

In [52]:
lm.rsquared

#the rsquared with the continuous features is much higher

0.8297644875716378

In [53]:
#trying the continuous features on the test data

lm = smf.ols(formula=model2, data=dftest).fit()

lm.rsquared

#it made the test data rsquared number go down a bit (is it better than .99 because that could be overfitting?)

0.878377925494709

In [54]:
lm.params



Intercept     9.25451302
Population    0.01495999
Robbery       1.90778167
Arson        36.32246690
dtype: float64

In [55]:
lm.pvalues

Intercept    0.57688065
Population   0.00000000
Robbery      0.00313864
Arson        0.00000000
dtype: float64

In [58]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,City,Population,Violent Crime,Murder and Manslaughter,Rape - revised,Rape - legacy,Robbery,Aggravated Assault,...,Larceny,Motor Vehicle Theft,Arson,MurderCat,Pop Squared,RobberyCat,ArsonCat,CarTheftCat,Predicted,Residual
0,0,0,Adams Village,1861.0,0,0.0,,0,0.0,0,...,10,0.0,0.0,0.0,3463321.0,0.0,0,0,-18.80897778,-30.80897778
1,1,1,Addison Town and Village,2577.0,3,0.0,,0,0.0,3,...,20,1.0,0.0,0.0,6640929.0,0.0,0,1,-7.36603204,-31.36603204
2,2,2,Akron Village,2846.0,3,0.0,,0,0.0,3,...,15,0.0,0.0,0.0,8099716.0,0.0,0,0,6.6846667,-9.3153333
3,3,4,Albion Village,6388.0,23,0.0,,3,4.0,16,...,165,5.0,,0.0,40806544.0,1.0,0,1,80.03813105,-142.96186895
4,4,5,Alfred Village,4089.0,5,0.0,,0,3.0,2,...,36,0.0,,0.0,16719921.0,1.0,0,0,27.62418104,-18.37581896


In [59]:
#checking the rsquared of both models on both the training set (2013) and test set (2014)

model1 = "PropertyCrime ~ Population+RobberyCat+ArsonCat"   #categorical features
model2 = "PropertyCrime ~ Population+Robbery+Arson"   #continuous features



lm1_train = smf.ols(formula=model1, data=df).fit()
lm2_train = smf.ols(formula=model2, data=df).fit()
lm1_test = smf.ols(formula=model1, data=dftest).fit()
lm2_test = smf.ols(formula=model2, data=dftest).fit()

print("Model 1 on Training Set: ", lm1_train.rsquared)
print("Model 2 on Training Set: ", lm2_train.rsquared)
print("Model 1 on Test Set: ", lm1_test.rsquared)
print("Model 2 on Test Set: ", lm2_test.rsquared)


Model 1 on Training Set:  0.7967955533644241
Model 2 on Training Set:  0.8297644875716378
Model 1 on Test Set:  0.9933762819548052
Model 2 on Test Set:  0.878377925494709


In [60]:
df.to_csv("df_crime")

In [61]:
dftest.to_csv("dftest_crime")

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [69]:
df.shape

(332, 22)

In [70]:
332/5


66.4

In [74]:
#using the stronger model (model 2 - continuous variables) to test on 5 folds in the training dataset (looking for rsquared)
#found that this model returned very consistent results on each fold
fold1 = df.iloc[0:66, :]
fold2 = df.iloc[66:122, :]
fold3 = df.iloc[122:188, :]
fold4 = df.iloc[188:254, :]
fold5 = df.iloc[254:, :]

folds = [fold1, fold2, fold3, fold4, fold5]

for fold in folds:
    model2 = "PropertyCrime ~ Population+Robbery+Arson"
    lm = smf.ols(formula=model2, data=df).fit()
    print(lm.rsquared)
    

0.8297644875716378
0.8297644875716378
0.8297644875716378
0.8297644875716378
0.8297644875716378


In [75]:
dftest.shape

(376, 19)

In [76]:
376/5


75.2

In [77]:
#using the stronger model (model 2 - continuous variables) to test on 5 folds in the test (2014) dataset (looking for rsquared)
#found that this model returned very consistent results on each fold in the test dataset
fold1 = dftest.iloc[0:75, :]
fold2 = dftest.iloc[75:150, :]
fold3 = dftest.iloc[150:225, :]
fold4 = dftest.iloc[225:300, :]
fold5 = dftest.iloc[300:, :]

folds = [fold1, fold2, fold3, fold4, fold5]

for fold in folds:
    model2 = "PropertyCrime ~ Population+Robbery+Arson"
    lm = smf.ols(formula=model2, data=dftest).fit()
    print(lm.rsquared)

0.878377925494709
0.878377925494709
0.878377925494709
0.878377925494709
0.878377925494709


In the end, the model with the continuous features explained more of the variance than the model with categorical features. 

In order to validate/test the model, I ran both models on a new set of data. The training set was New York crime data from 2013, and the test set was from 2014. I also tested out the stronger model (the one with continuous variables) on 5 folds of each set of data. The r-squared value was consistent for each holdout group.