# Phase IV Data Analysis

In [1]:
# Load libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression

## Linear Regression Analysis of Stop-and-Frisk's Efficacy

### Using a Linear Regression Model to Predict the Number of Annual Offenses

This linear regression model will predict the total number of criminal offenses reported annually in NYC from the number of stops recorded annually in NYC. 

The purpose of this model is to investigate whether the NYPD's Stop and Frisk Program had a tangible and quantifiable effect on criminal activity reported in NYC from 2011 to 2019.

In [4]:
# Read the crime-offenses table and discard the 'Unnamed: 0' column
# (presumably an index column saved with the file -- TODO confirm).
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs where that exact file exists.
nyc_crime = pd.read_csv(
    '/Users/katherinevella/Desktop/Data Science/Data-Science-Project-2950-/Final Submission/nyc_crime'
).drop(columns='Unnamed: 0')

# Preview the first rows
nyc_crime.head()

Unnamed: 0,offense,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,status
0,murder,471,536,515,419,335,333,352,335,292,295,319,major_felony
1,rape,1205,1373,1420,1445,1378,1352,1438,1438,1449,1794,1755,major_felony
2,robbery,18601,19486,19717,20144,19128,16539,16931,15500,13956,12913,13371,major_felony
3,felony_assault,16773,16956,18482,19381,20297,20207,20270,20847,20052,20208,20698,major_felony
4,burglary,19430,18600,18720,19168,17429,16765,15125,12990,12083,11687,10783,major_felony


#### Gathering the Data:

In [None]:
# Get the total number of criminal offenses recorded each year (2011-2019)

# Number of rows that carry a non-null offense label (kept for reference).
offense_types = nyc_crime['offense'].count()

# Column positions 3-11 of the cleaned table hold the yearly counts used here
# (2011 through 2019, going by the table preview shown after the load cell).
year_columns = nyc_crime.columns[3:12]

# Sum every offense row per year in a single vectorized call. This replaces a
# nested Python loop that indexed rows by the labels 0..offense_types-1, which
# silently depended on a default integer index and on no offense label being
# null. `.tolist()` yields plain Python numbers, so the printed list stays
# readable regardless of the installed numpy version.
# Note: pandas skips missing values when summing.
annual_offenses = nyc_crime[year_columns].sum().tolist()

print('Total Criminal Offenses Reported Annually (2011-2019): ')
print(annual_offenses)

In [None]:
# Get the total number of stops reported

# Source: https://www.nyclu.org/en/stop-and-frisk-data

# Stops recorded per calendar year, keyed by year so that every figure can be
# checked against the source and stays aligned with the 2011-2019 offense totals.
stops_by_year = {
    2011: 685724,
    2012: 532911,
    2013: 191851,
    2014: 45787,
    2015: 22565,
    2016: 12404,
    2017: 11629,
    2018: 11008,
    2019: 13459,  # was 13450; NYCLU reports 13,459 stops for 2019
}

# 1-D array ordered 2011 -> 2019 (dicts preserve insertion order)
annual_stops = np.array(list(stops_by_year.values()))

print('Total Stops Reported Annually (2011-2019): ')
print(annual_stops)

#### Performing the Linear Regression to Predict Total Offenses:

In [None]:
# Fit a least-squares line: predictor X = stops per year (as a single feature
# column), response y = total offenses per year.
linear_model = LinearRegression().fit(annual_stops[:, np.newaxis], annual_offenses)

# Fitted values at the observed stop counts
predicted_offenses = linear_model.predict(annual_stops[:, np.newaxis])

# Overlay the fitted values on the observed data
fig, ax = plt.subplots()
ax.scatter(annual_stops, annual_offenses, label='Observed Values')
ax.scatter(annual_stops, predicted_offenses, label='Predicted Values')
ax.set_title('Annual Stops vs. Annual Offenses')
ax.set_xlabel('Annual Stops')
ax.set_ylabel('Annual Offenses')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Pull the fitted parameters and the goodness of fit out of the model first,
# then report each rounded to two decimals.
total_slope = linear_model.coef_[0]
total_intercept = linear_model.intercept_
total_r2 = linear_model.score(annual_stops.reshape(-1, 1), annual_offenses)

print('Estimated slope: {:.2f}'.format(total_slope))
print('Estimated intercept: {:.2f}'.format(total_intercept))
print('Coefficient of Determination (r^2): {:.2f}'.format(total_r2))

The estimated slope of this linear regression model is 0.14. This value indicates that as the number of annual stops increases by 1, the number of annual offenses is predicted to increase by 0.14.

We see here that the number of stops is only a moderate predictor of the number of offenses overall. 
The r<sup>2</sup> value (the coefficient of determination) is 0.55, indicating that only 55% of the variation in the total number of annual offenses is explained by the number of annual stops.

Let's narrow this down to the two offenses specifically targeted by stop and frisk: drugs and weapons.

The two illegal activities specifically targeted by the NYPD's Stop and Frisk Program are drug and weapon possession. Therefore, it is necessary to perform another linear regression analysis specifically modeling the relationship between the total number of stops reported annually and the number of drug & weapon offenses reported annually in NYC during this period.

### Using a Linear Regression Model to Predict the Number of Annual Drug & Weapons Offenses

This linear regression model will predict the total number of criminal drug & weapons offenses reported annually in NYC from the number of stops recorded annually in NYC. The stop-and-frisk program primarily targets civilians suspected of drug- and/or weapons-related offenses, so one might anticipate that the number of stops has a greater effect on the rates of these specific crimes than on criminal offenses overall.

#### Gathering the Data:

In [None]:
# Get the number of drug & weapons offenses reported annually (2011-2019)

# Keep only the rows whose offense label is 'drugs' or 'weapons'
is_drug_or_weapon = (nyc_crime['offense'] == 'drugs') | (nyc_crime['offense'] == 'weapons')
drugs_weapons = nyc_crime.loc[is_drug_or_weapon]

# Combined total per year column (positions 3-11 of the table), computed with
# one vectorized sum instead of nested accumulation loops. `.tolist()` returns
# plain Python numbers. Note: pandas skips missing values when summing.
annual_drugsWeapons = drugs_weapons[nyc_crime.columns[3:12]].sum().tolist()

print('Drug & Weapons Offenses Reported Annual (2011-2019):')
print(annual_drugsWeapons)

#### Performing the Linear Regression:

In [None]:
# Fit a least-squares line: predictor X = stops per year (as a single feature
# column), response y = combined drug & weapons offenses per year.
dw_linear_model = LinearRegression().fit(annual_stops[:, np.newaxis], annual_drugsWeapons)

# Fitted values at the observed stop counts
predicted_dw = dw_linear_model.predict(annual_stops[:, np.newaxis])

# Overlay the fitted values on the observed data
fig, ax = plt.subplots()
ax.scatter(annual_stops, annual_drugsWeapons, label='Observed Values')
ax.scatter(annual_stops, predicted_dw, label='Predicted Values')
ax.set_title('Annual Stops vs. Annual Offenses (Drug and Weapon Charges)')
ax.set_xlabel('Annual Stops')
ax.set_ylabel('Annual Offenses')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Pull the fitted parameters and the goodness of fit out of the drug & weapons
# model first, then report each rounded to two decimals.
dw_slope = dw_linear_model.coef_[0]
dw_intercept = dw_linear_model.intercept_
dw_r2 = dw_linear_model.score(annual_stops.reshape(-1, 1), annual_drugsWeapons)

print('Estimated slope: {:.2f}'.format(dw_slope))
print('Estimated intercept: {:.2f}'.format(dw_intercept))
print('r^2: {:.2f}'.format(dw_r2))

The number of stops serves as a stronger predictor of the number of drug & weapons offenses than of total offenses.

The r<sup>2</sup> value (the coefficient of determination) of this model is 0.70, indicating that 70% of the variation in the number of annual drug & weapons offenses can be explained by the number of annual stops.

### Let's do this again. Just drugs

In [None]:
# Get the number of drug offenses per year (2011-2019)

# Rows whose offense label is exactly 'drugs'
drugs = nyc_crime.loc[nyc_crime['offense'] == 'drugs']

# One total per year column (positions 3-11 of the table), computed with a
# single vectorized sum instead of nested accumulation loops. `.tolist()`
# returns plain Python numbers. Note: pandas skips missing values when summing.
annual_drugs = drugs[nyc_crime.columns[3:12]].sum().tolist()

# Display the yearly totals
annual_drugs

In [None]:
# Fit a least-squares line: predictor X = stops per year (as a single feature
# column), response y = drug offenses per year.
d_linear_model = LinearRegression().fit(annual_stops[:, np.newaxis], annual_drugs)

# Report the fitted coefficients at full precision
drug_slope = d_linear_model.coef_[0]
drug_intercept = d_linear_model.intercept_
print('Estimated slope: ', drug_slope)
print('Estimated intercept: ', drug_intercept)

In [None]:
# Scatterplot of observed values and predicted values (drug offenses)

# Fitted values at the observed stop counts
predicted_d = d_linear_model.predict(annual_stops.reshape(-1,1))

plt.scatter(annual_stops, annual_drugs, label = 'Observed Values')
plt.scatter(annual_stops, predicted_d, label = 'Predicted Values')

plt.title('Annual Stops vs. Annual Offenses (Drug Charges)')
plt.xlabel('Annual Stops')
plt.ylabel('Annual Offenses')
# The series labels above are only rendered when a legend is drawn; without it
# the observed and predicted points cannot be told apart. Placement matches
# the other regression plots in this notebook.
plt.legend(loc='lower right')
plt.show()

In [None]:
# Coefficient of determination of the drug-offense model, rounded to two decimals
drug_r2 = d_linear_model.score(annual_stops.reshape(-1, 1), annual_drugs)
print('r^2: {:.2f}'.format(drug_r2))

### Let's do this again. Just weapons

In [None]:
# Get the number of weapons offenses per year (2011-2019)

# Rows whose offense label is exactly 'weapons'
weapons = nyc_crime.loc[nyc_crime['offense'] == 'weapons']

# One total per year column (positions 3-11 of the table), computed with a
# single vectorized sum instead of nested accumulation loops. `.tolist()`
# returns plain Python numbers. Note: pandas skips missing values when summing.
annual_weapons = weapons[nyc_crime.columns[3:12]].sum().tolist()

# Display the yearly totals
annual_weapons

In [None]:
# Fit a least-squares line: predictor X = stops per year (as a single feature
# column), response y = weapons offenses per year.
weapons_linear_model = LinearRegression().fit(annual_stops[:, np.newaxis], annual_weapons)

# Report the fitted coefficients at full precision
weapons_slope = weapons_linear_model.coef_[0]
weapons_intercept = weapons_linear_model.intercept_
print('Estimated slope: ', weapons_slope)
print('Estimated intercept: ', weapons_intercept)

In [None]:
# Scatterplot of observed values and predicted values (weapons offenses)

# Fitted values at the observed stop counts
predicted_weapons = weapons_linear_model.predict(annual_stops.reshape(-1,1))

plt.scatter(annual_stops, annual_weapons, label = 'Observed Values')
plt.scatter(annual_stops, predicted_weapons, label = 'Predicted Values')

plt.title('Annual Stops vs. Annual Offenses (Weapons Charges)')
plt.xlabel('Annual Stops')
plt.ylabel('Annual Offenses')
# The series labels above are only rendered when a legend is drawn; without it
# the observed and predicted points cannot be told apart. Placement matches
# the other regression plots in this notebook.
plt.legend(loc='lower right')
plt.show()

In [None]:
# Coefficient of determination of the weapons-offense model, rounded to two decimals
weapons_r2 = weapons_linear_model.score(annual_stops.reshape(-1, 1), annual_weapons)
print('r^2: {:.2f}'.format(weapons_r2))

## Stacked Bar Graphs

In [3]:
# Total number of major-felony offenses reported in 2009.
# Rows and the '2009' column are selected in a single .loc call, by label, so
# the result no longer depends on where the year columns sit in the table.
# NOTE(review): the recorded NameError indicates this cell was last executed
# before nyc_crime was loaded; run the data-loading cell first.
nyc_crime.loc[nyc_crime['status'] == 'major_felony', '2009'].sum()

NameError: name 'nyc_crime' is not defined