In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
%matplotlib inline
pd.options.display.float_format = "{:.3f}".format

import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internalgelsd")

import math
from IPython.display import display
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

sns.set_style("white")


In [210]:
df = pd.read_csv("df_crime")

In [211]:
dftest = pd.read_csv("dftest_crime")

### Validating the Regression Model

In [212]:
df.head()

Unnamed: 0.1,Unnamed: 0,City,Population,Violent Crime,Murder and Manslaughter,Rape - revised,Rape - legacy,Robbery,Aggravated Assault,PropertyCrime,...,Larceny,Motor Vehicle Theft,Arson,MurderCat,Pop Squared,RobberyCat,ArsonCat,CarTheftCat,Predicted,Residual
0,0,Adams Village,1861.0,0,0.0,,0,0.0,0,12.0,...,10,0.0,0.0,0.0,3463321.0,0.0,0,0,-18.809,-30.809
1,1,Addison Town and Village,2577.0,3,0.0,,0,0.0,3,24.0,...,20,1.0,0.0,0.0,6640929.0,0.0,0,1,-7.366,-31.366
2,2,Akron Village,2846.0,3,0.0,,0,0.0,3,16.0,...,15,0.0,0.0,0.0,8099716.0,0.0,0,0,6.685,-9.315
3,4,Albion Village,6388.0,23,0.0,,3,4.0,16,223.0,...,165,5.0,,0.0,40806544.0,1.0,0,1,80.038,-142.962
4,5,Alfred Village,4089.0,5,0.0,,0,3.0,2,46.0,...,36,0.0,,0.0,16719921.0,1.0,0,0,27.624,-18.376


In [213]:
dftest.head()

Unnamed: 0.1,Unnamed: 0,City,Population,Violent Crime,Murder and Manslaughter,Rape - revised,Rape - legacy,Robbery,Aggravated Assault,PropertyCrime,Burglary,Larceny,Motor Vehicle Theft,Arson,MurderCat,RobberyCat,ArsonCat,CarTheftCat
0,0,Adams Village,1851.0,0.0,0.0,,0.0,0.0,0.0,11.0,1.0,10.0,0.0,0.0,0.0,0.0,0,0
1,1,Addison Town and Village,2568.0,2.0,0.0,,0.0,1.0,1.0,49.0,1.0,47.0,1.0,0.0,0.0,1.0,0,1
2,2,Afton Village4,820.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0
3,3,Akron Village,2842.0,1.0,0.0,,0.0,0.0,1.0,17.0,0.0,17.0,0.0,0.0,0.0,0.0,0,0
4,4,Albany4,98595.0,802.0,8.0,54.0,,237.0,503.0,3888.0,683.0,3083.0,122.0,12.0,1.0,2.0,1,2


In [121]:
linear_formula = "PropertyCrime ~ Population+MurderCat+RobberyCat+CarTheftCat+ArsonCat"

#lm = smf.ols(formula=linear_formula, data=df).fit()

In [122]:
lm = smf.ols(formula=linear_formula, data=df).fit()

In [123]:
lm.params


Intercept     -61.897
Population      0.018
MurderCat      -4.350
RobberyCat     88.375
CarTheftCat    19.639
ArsonCat       98.890
dtype: float64

In [124]:
lm.pvalues
#MurderCat and CarTheftCat are both above a value of .05, so they are probably not having a significant impact on the model

Intercept     0.000
Population    0.000
MurderCat     0.893
RobberyCat    0.000
CarTheftCat   0.304
ArsonCat      0.007
dtype: float64

In [125]:
lm.rsquared

0.7974572096661988

In [126]:
linear_formula = "PropertyCrime ~ Population+RobberyCat+ArsonCat"

In [127]:
lm = smf.ols(formula=linear_formula, data=df).fit()

In [128]:
lm.params

Intercept    -53.928
Population     0.018
RobberyCat    95.804
ArsonCat     101.946
dtype: float64

In [129]:
lm.rsquared
#removing those two parameters did not affect the rsquared value

0.7967955533644241

In [143]:
dftest.dtypes

City                                       object
Population                                float64
Violent\ncrime                            float64
Murder and\nnonnegligent\nmanslaughter    float64
Rape\n(revised\ndefinition)1              float64
Rape\n(legacy\ndefinition)2               float64
Robbery                                   float64
Aggravated\nassault                       float64
Property\ncrime                           float64
Burglary                                  float64
Larceny-\ntheft                           float64
Motor\nvehicle\ntheft                     float64
Arson3                                    float64
murder                                    float64
dtype: object

In [180]:
model1 = "PropertyCrime ~ Population+RobberyCat+ArsonCat"

lm = smf.ols(formula=model1, data=dftest).fit()

In [181]:
lm.params

Intercept    -88.621
Population     0.016
RobberyCat   200.577
ArsonCat     242.863
dtype: float64

In [160]:
lm.rsquared

0.9933762819548052

In [182]:
#testing a model with more continuous features (as opposed to so many caterical features)

model2 = "PropertyCrime ~ Population+Robbery+Arson"

lm = smf.ols(formula=model2, data=df).fit()

In [183]:
lm.rsquared

#the rsquared with the continuous features is much higher

0.8297644875716378

In [185]:
#trying the continuous features on the test data

lm = smf.ols(formula=model2, data=dftest).fit()

lm.rsquared

#it made the test data rsquared number go down a bit (is it better than .99 because that could be overfitting?)

0.878377925494709

In [186]:
lm.params

#Population is not a feature that is playing a significant role

Intercept     9.255
Population    0.015
Robbery       1.908
Arson        36.322
dtype: float64

In [187]:
linear_formula = "PropertyCrime ~ Robbery+Arson"

lm = smf.ols(formula=linear_formula, data=dftest).fit()

lm.rsquared

0.7976293781608099

In [197]:
#removing population had a 9 percentage point impact on rsquared, so i'm going to put it back into the model
linear_formula = "PropertyCrime ~ Population+Robbery+Arson"

lm = smf.ols(formula=linear_formula, data=dftest).fit()

lm.rsquared


0.878377925494709

In [214]:
df.head()

Unnamed: 0.1,Unnamed: 0,City,Population,Violent Crime,Murder and Manslaughter,Rape - revised,Rape - legacy,Robbery,Aggravated Assault,PropertyCrime,...,Larceny,Motor Vehicle Theft,Arson,MurderCat,Pop Squared,RobberyCat,ArsonCat,CarTheftCat,Predicted,Residual
0,0,Adams Village,1861.0,0,0.0,,0,0.0,0,12.0,...,10,0.0,0.0,0.0,3463321.0,0.0,0,0,-18.809,-30.809
1,1,Addison Town and Village,2577.0,3,0.0,,0,0.0,3,24.0,...,20,1.0,0.0,0.0,6640929.0,0.0,0,1,-7.366,-31.366
2,2,Akron Village,2846.0,3,0.0,,0,0.0,3,16.0,...,15,0.0,0.0,0.0,8099716.0,0.0,0,0,6.685,-9.315
3,4,Albion Village,6388.0,23,0.0,,3,4.0,16,223.0,...,165,5.0,,0.0,40806544.0,1.0,0,1,80.038,-142.962
4,5,Alfred Village,4089.0,5,0.0,,0,3.0,2,46.0,...,36,0.0,,0.0,16719921.0,1.0,0,0,27.624,-18.376


In [207]:
#checking the rsquared of both models on both the training set (2013) and test set (2014)

model1 = "PropertyCrime ~ Population+RobberyCat+ArsonCat"   #categorical features
model2 = "PropertyCrime ~ Population+Robbery+Arson"   #continuous features



lm1_train = smf.ols(formula=model1, data=df).fit()
lm2_train = smf.ols(formula=model2, data=df).fit()
lm1_test = smf.ols(formula=model1, data=dftest).fit()
lm2_test = smf.ols(formula=model2, data=dftest).fit()

print("Model 1 on Training Set: ", lm1_train.rsquared)
print("Model 2 on Training Set: ", lm2_train.rsquared)
print("Model 1 on Test Set: ", lm1_test.rsquared)
print("Model 2 on Test Set: ", lm2_test.rsquared)


Model 1 on Training Set:  0.7967955533644241
Model 2 on Training Set:  0.8297644875716378
Model 1 on Test Set:  0.9933762819548052
Model 2 on Test Set:  0.878377925494709


In the end, the model with the continuous features explained more of the variance than the model with categorical features. 

In order to validate/test the model, I ran both models on a new set of data. The training set was New York crime data from 2013, and the test set was from 2014.

In [208]:
df.to_csv("df_crime")

In [209]:
dftest.to_csv("dftest_crime")

In [192]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [203]:
#need to practice cross validation with 10 folds for a regression model

target = df["PropertyCrime"]


cross_val_score(smf.ols(), model2, target, cv=10)

TypeError: from_formula() missing 2 required positional arguments: 'formula' and 'data'

In [200]:
#need assistance testing on the 20% holdout

target = df["PropertyCrime"]
features = df[["Population", "Robbery", "Arson"]]


X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=20)



smf.ols(forumla=linear_formula, data=df).fit(X_train, y_train).score(X_test, y_test)
#print("Testing on Sample: " + str(bnb.fit(data, target).score(data, target)))
#cross_val_score(bnb, data, target, cv=10)

TypeError: from_formula() missing 1 required positional argument: 'formula'