In [41]:
import math
import numpy as np
import pandas as pd
from IPython.display import display
import scipy
import sklearn
import os
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn import linear_model
import statsmodels.formula.api as smf
from sklearn.metrics import confusion_matrix

# Move into the project folder so that relative data paths (e.g. "FBI_data_4.csv")
# resolve. The location can be overridden with the THINKFUL_PROJECT_DIR environment
# variable; when it is not set, the original local path is used.
project_dir = os.environ.get(
    'THINKFUL_PROJECT_DIR',
    '/Users/kylehoward/desktop/thinkful/thinkful-2',
)
os.chdir(project_dir)
os.getcwd()  # last expression: displayed to confirm the active working directory

'/Users/kylehoward/Desktop/Thinkful/Thinkful-2'

In [None]:
''' 
This is my first regression project. The dataset comes from the FBI's Uniform Crime Reporting (UCR) program and 
contains crime statistics for neighborhoods in New York. The goal is to predict property crime for each neighborhood. 
The square root of the population turned out to be a very effective feature. I want to dig further into how the 
crimes in the dataset are defined, because many crime categories overlap and that overlap could explain some of the 
model's performance. 
'''

In [42]:
# Load the 2013 New York crime data, keeping only the columns used below,
# and give them short, model-friendly names.
data = "FBI_data_4.csv"
fbi = pd.read_csv(data, usecols=[1, 2, 3, 6, 8, 11, 12])
fbi.columns = ['population', 'violent_crime', 'murder', 'robbery', 'prop_crime', 'gta', 'arsen']

# Cleaning: treat +/-inf as missing, drop incomplete rows, then skip the first
# remaining row.
# NOTE(review): the reason for skipping the first row is not visible here — confirm.
fbi = (
    fbi.replace([np.inf, -np.inf], np.nan)
       .dropna()
       .iloc[1:]
       .dropna()
)

In [43]:
# Feature engineering: add the square of the population so the model can pick up
# more subtle, non-linear population effects.
# The cast uses an explicit 64-bit integer: plain `int` can resolve to a 32-bit
# dtype on some platforms, where squaring a large population would overflow
# (squared values here already exceed the 32-bit range).
fbi['population'] = fbi['population'].astype(np.int64)
fbi['population_squared'] = fbi['population'] ** 2
fbi['population_squared'].head(10)

1         6640929
2         8099716
6         3171961
7     13993943616
8        90611361
9       330585124
10        4214809
11       20457529
12         432964
13        6360484
Name: population_squared, dtype: int64

In [44]:
# Feature engineering: square root of the population, stored as an integer
# (the integer cast truncates the fractional part).
fbi['population_sqrt'] = (fbi['population'] ** .5).astype(int)
fbi['population_sqrt'].head(10)

1      50
2      53
6      42
7     343
8      97
9     134
10     45
11     67
12     25
13     50
Name: population_sqrt, dtype: int64

In [45]:
# Binary indicator feature: 1 when at least one murder was recorded, otherwise 0.
fbi['murder'] = fbi['murder'].astype(int)
fbi['has_murder'] = (fbi['murder'] >= 1).astype(int)
fbi['has_murder'].head(10)

1     0
2     0
6     0
7     1
8     0
9     0
10    0
11    0
12    0
13    0
Name: has_murder, dtype: int64

In [46]:
# Binary indicator feature: 1 when at least one robbery was recorded, otherwise 0.
fbi['robbery'] = fbi['robbery'].astype(int)
fbi['has_robbery'] = (fbi['robbery'] >= 1).astype(int)

# Cleaning: store the arson counts as integers as well.
fbi['arsen'] = fbi['arsen'].astype(int)

fbi['has_robbery'].head(10)

1     0
2     0
6     0
7     1
8     1
9     1
10    0
11    1
12    0
13    0
Name: has_robbery, dtype: int64

In [47]:
# Make sure every column used by the models is integer-typed.
# `Series.astype` returns a new Series and leaves the DataFrame column untouched,
# so each result must be assigned back for the cast to take effect.
for column in ['prop_crime', 'population_sqrt', 'violent_crime', 'arsen', 'gta']:
    fbi[column] = fbi[column].astype(int)

# Display one of the converted columns to confirm the dtype.
fbi['gta']

1        1
2        0
6        0
7       32
8        6
9       15
10       1
11       1
12       0
13       1
16       1
17       0
19       2
23       0
25       0
26       0
27       0
28       2
29       0
30       0
33       3
34       0
38       0
39       5
42       0
44       0
45       9
46       0
48       0
51       2
      ... 
283      0
287      0
292      0
293      0
295      0
297      0
298      2
300      6
301     49
306      2
309      1
310    394
311      4
313      2
314     27
315     87
317      1
319      0
324     26
326      0
327      2
328      1
335      0
336      3
337      3
338     20
341      1
342      3
344      0
346    236
Name: gta, Length: 186, dtype: int64

In [48]:
# ------------------------------------------sklearn OLS Regression -------------------------------------
# Fit an ordinary least squares model on the first 100 rows and hold out the rest
# for testing. Because the dataset is short, this is roughly a 54% training share,
# chosen to keep a decent test sample size.
# With an 80% training share R2 was .95, which looked like overfitting.
# NOTE(review): the split follows row order and is not shuffled — confirm that the
# file order is unrelated to the target.

# Feature set that performed best after trial and error
X = fbi[['gta', 'population_sqrt', 'violent_crime', 'arsen']]
# Target as a single-column 2-D array
Y = fbi[['prop_crime']].values

X_training, X_test = X[:100], X[100:]
Y_training, Y_test = Y[:100], Y[100:]

regr = linear_model.LinearRegression()
regr.fit(X_training, Y_training)
y_pred_test = regr.predict(X_test)



In [49]:
# Report the fitted parameters together with R-squared on the training rows
training_r2 = regr.score(X_training, Y_training)
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:', training_r2, sep='\n')


Coefficients: 
 [[ 17.30987145   4.02844667  -3.45162097  21.30677974]]

Intercept: 
 [-208.19966493]

R-squared:
0.781025167054


In [50]:
# Report R-squared on the held-out test rows. The coefficients and intercept
# belong to the same fitted model, so they match the training report.
test_r2 = regr.score(X_test, Y_test)
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:', test_r2, sep='\n')


Coefficients: 
 [[ 17.30987145   4.02844667  -3.45162097  21.30677974]]

Intercept: 
 [-208.19966493]

R-squared:
0.916290908524


In [51]:
# -------------------------------------- statsmodels OLS Regression ---------------------------
# Fit the same specification with statsmodels, on the full `fbi` frame, to review
# per-feature statistics and test feature importance.
linear_formula = 'prop_crime ~ gta+population_sqrt+violent_crime+arsen'
ols_spec = smf.ols(linear_formula, data=fbi)
lm = ols_spec.fit()

In [52]:
# Inspecting the estimated coefficients of the statsmodels fit
# (the intercept plus one coefficient per feature in the formula)
lm.params

Intercept         -178.889203
gta                 11.623850
population_sqrt      3.652602
violent_crime       -1.546253
arsen               38.042049
dtype: float64

In [53]:
# Inspecting the p-value of each term in the statsmodels fit.
# In the recorded run every p-value is below .05, so all features appear to
# have a statistically significant effect.
lm.pvalues

Intercept          1.137841e-08
gta                1.108854e-10
population_sqrt    5.795206e-24
violent_crime      1.491665e-04
arsen              3.553459e-16
dtype: float64

In [54]:
# Inspecting R-squared of the statsmodels fit (this model was fitted on the
# full `fbi` frame, not on the train/test split used for the sklearn model)
lm.rsquared

0.95833936944267883

In [55]:
# Inspecting the confidence interval of each coefficient: column 0 is the lower
# bound and column 1 the upper bound.
# NOTE(review): uses the statsmodels default level — presumably 95%; confirm.
lm.conf_int()

Unnamed: 0,0,1
Intercept,-237.870404,-119.908001
gta,8.276037,14.971663
population_sqrt,3.037564,4.267639
violent_crime,-2.333693,-0.758813
arsen,29.679695,46.404403
