In [1]:
import numpy as np
import pandas as pd
import json
import plotly.express as px
from datetime import datetime
from numba import jit
from modules import *
# Route DataFrame.plot / Series.plot calls in this notebook through Plotly
# rather than pandas' default plotting backend.
pd.options.plotting.backend = "plotly"

In [2]:
# Input CSV locations, given relative to the notebook's working directory.
# Each path is passed to a loader/cleaner function in the following cell.
ARREST_FILE = 'data/arrest_numba_zipcode.csv'  # passed to arrest_clean
CRIME_FILE = 'data/crime_numba_zipcode.csv'  # passed to crime_clean
INCOME_FILE = 'data/LAIncome.csv'  # passed to income_clean
MOON_FILE = 'data/full_moon.csv'  # passed to full_moon_finder
DST_FILE = 'data/dst.csv'  # passed to dst_clean
RACE_FILE = 'data/LARace.csv'  # passed to race_clean

In [3]:
%%time
# Load every dataset through its cleaner. The cleaners are not defined in this
# notebook; presumably they arrive via `from modules import *` -- TODO confirm.
arrest_data = arrest_clean(ARREST_FILE)
crime_data = crime_clean(CRIME_FILE)
income_data = income_clean(INCOME_FILE)
moon_dates = full_moon_finder(MOON_FILE)
race_data = race_clean(RACE_FILE)
# dst_clean also receives the distinct crime dates. Its result is later merged on
# a 'Date' column and supplies 'WasDST'; presumably one row per date -- TODO confirm.
was_dst_df = dst_clean(DST_FILE, crime_data['Crime Date'].unique())

CPU times: user 18.1 s, sys: 1.25 s, total: 19.3 s
Wall time: 19.3 s


## Hypothesis 1: Zip codes with a higher Black population percentage have a higher homicide arrest rate

In [4]:
# Count homicide arrests for every (zip code, year) pair that has at least one.
homicide_data = (
    arrest_data.loc[arrest_data['Charge Group Description'] == 'Homicide']
    .groupby(['ZipCode', 'Year'])
    .size()
    .reset_index(name='Homicide Arrests')
)

In [5]:
# Average yearly homicide arrests per zip code: total arrests divided by the
# number of year rows present for that zip code. The result is indexed by ZipCode.
# NOTE(review): the divisor only counts years that appear in homicide_data, i.e.
# years with at least one homicide arrest; years with zero arrests are not
# averaged in. Confirm this is the intended definition of "per year".
homicide_data = (
    homicide_data.groupby('ZipCode')['Homicide Arrests'].sum()
    .div(homicide_data.groupby('ZipCode')['Year'].count())
    .rename('Homicide Arrests per Year')
    .to_frame()
)

In [6]:
# Display the per-zip-code yearly homicide-arrest averages.
homicide_data

Unnamed: 0_level_0,Homicide Arrests per Year
ZipCode,Unnamed: 1_level_1
90001,2.166667
90002,4.400000
90003,20.900000
90004,1.666667
90005,1.875000
...,...
90710,1.600000
90731,3.444444
90732,1.250000
90744,3.700000


In [7]:
# Attach the homicide-arrest average to the race table, matching the 'Zip Code'
# column against 'ZipCode' on the homicide side, then discard zip codes that
# have no homicide-arrest figure.
race_homicide = pd.merge(
    race_data,
    homicide_data,
    how='left',
    left_on='Zip Code',
    right_on='ZipCode',
).dropna(subset=['Homicide Arrests per Year'])

In [8]:
# Display the merged race / homicide-arrest table.
race_homicide

Unnamed: 0,Zip Code,Total Population,White Alone Not Hispanic or Latino,Black or African American Alone,American Indian & Alaska Native Alone,Asian Alone,Native Hawaiian & Other Pacific Islander Alone,Some Other Race Alone,Population of Two or More Races,Hispanic or Latino,Homicide Arrests per Year
0,90001,57110,356,6133,410,145,27,27761,2125,50544,2.166667
1,90002,51223,330,13101,371,148,33,21239,1939,37598,4.400000
2,90003,66266,462,16181,516,149,58,26968,2516,49386,20.900000
3,90004,62180,10466,2187,497,16704,55,16754,2778,31987,1.666667
4,90005,37681,3039,2007,317,12740,40,9940,1684,19578,1.875000
...,...,...,...,...,...,...,...,...,...,...,...
67,90501,43180,11107,2301,281,10221,305,9216,2196,18175,1.000000
68,90710,25457,5319,3390,153,4614,222,4999,1422,11422,1.600000
69,90731,59662,19325,4369,721,2721,352,12731,3952,31732,3.444444
70,90732,21115,12142,931,149,1727,43,1609,1255,5602,1.250000


In [9]:
# Share of each zip code's population recorded as Black or African American, in percent.
race_homicide['BlackPercent'] = (
    race_homicide['Black or African American Alone']
    .div(race_homicide['Total Population'])
    .mul(100)
)
# Yearly homicide arrests scaled to a rate per 100,000 residents.
race_homicide['Homicide Arrests per 100k'] = (
    race_homicide['Homicide Arrests per Year']
    .div(race_homicide['Total Population'])
    .mul(100000)
)

In [34]:
# One point per zip code: Black population share against homicide-arrest rate.
race_homicide.plot(
    kind='scatter',
    x='BlackPercent',
    y='Homicide Arrests per 100k',
    hover_name='Zip Code',
)

### Removing outliers

In [11]:
# Keep only zip codes under both cut-offs: fewer than 300 homicide arrests per
# 100k residents and a Black population share below 50%.
# NOTE(review): the second cut removes the upper range of the explanatory
# variable itself, not just extreme rates -- confirm this is intended.
race_homicide = race_homicide.loc[
    (race_homicide['Homicide Arrests per 100k'] < 300)
    & (race_homicide['BlackPercent'] < 50)
]

In [35]:
# Same scatter after the outlier filter, with an ordinary-least-squares trendline.
race_homicide.plot(
    kind='scatter',
    x='BlackPercent',
    y='Homicide Arrests per 100k',
    hover_name='Zip Code',
    trendline='ols',
)

In [13]:
# Correlation between the homicide-arrest rate and the Black population share
# (Series.corr with its default method), computed on the filtered data.
race_homicide['Homicide Arrests per 100k'].corr(race_homicide['BlackPercent'])

0.4323817474994251

### There is a moderate positive correlation (r ≈ 0.43) between homicide arrest rate and Black population percentage
### We fail to reject hypothesis 1

## Hypothesis 2: Zip Codes with higher median household income have a lower crime rate 

In [14]:
# Number of crime records for each (zip code, year) pair.
crime_zip = (
    crime_data.groupby(['ZipCode', 'Year'])
    .size()
    .reset_index(name='Crime')
)
# Per zip code: total crimes divided by the number of year rows present for it.
crime_zip_all = (
    crime_zip.groupby('ZipCode')['Crime'].sum()
    .div(crime_zip.groupby('ZipCode')['Year'].count())
    .reset_index(name='Total Crime per Year')
)
crime_zip_all

Unnamed: 0,ZipCode,Total Crime per Year
0,90001,629.500000
1,90002,2458.600000
2,90003,6143.600000
3,90004,1446.200000
4,90005,2393.400000
...,...,...
144,90813,4.166667
145,90814,4.200000
146,90815,2.000000
147,90822,1.250000


In [15]:
# Pair each zip code's yearly crime average with its income row (inner join on
# zip code), then drop the duplicated key column from the crime side.
crime_income = pd.merge(
    crime_zip_all,
    income_data,
    left_on='ZipCode',
    right_on='Zip',
).drop(columns=['ZipCode'])
crime_income

Unnamed: 0,Total Crime per Year,Zip,Amount
0,629.500000,90001,43360
1,2458.600000,90002,37285
2,6143.600000,90003,40598
3,1446.200000,90004,49675
4,2393.400000,90005,38491
...,...,...,...
135,1.000000,90808,110625
136,26.900000,90810,60227
137,4.166667,90813,38449
138,4.200000,90814,73391


In [16]:
# Income ('Amount', relabelled on the axis) against average yearly crime count,
# one red point per zip code.
crime_income.plot(
    kind='scatter',
    x='Amount',
    y='Total Crime per Year',
    color_discrete_sequence=['#FF0000'],
    labels={'Amount': 'Median Household Income'},
)

In [17]:
# Correlation between income and average yearly crime count per zip code
# (Series.corr with its default method).
crime_income['Amount'].corr(crime_income['Total Crime per Year'])

0.21698890034549528

### There is no strong relation between median household income and crime rate; the correlation is weakly positive (r ≈ 0.22), not negative
### We reject hypothesis 2

## Hypothesis 3: Crime rate for some crimes decreases by more than 10% during daylight saving time

In [18]:
# Attach the per-date DST flag ('WasDST') to every crime record.
# The guard makes this cell safe to re-run: merging a second time would produce
# suffixed 'WasDST_x' / 'WasDST_y' columns and break the groupby('WasDST')
# cells that follow.
if 'WasDST' not in crime_data.columns:
    crime_data = crime_data.merge(
        was_dst_df,
        left_on='Crime Date',
        right_on='Date',
        how='left',
        # Fail loudly if was_dst_df repeats a date; a repeated date would
        # silently duplicate crime rows and inflate every count below.
        validate='many_to_one',
    )

In [19]:
# Distinct 'DR Number' values in each DST group
# (presumably one per crime report -- TODO confirm).
crime_data.groupby('WasDST')['DR Number'].nunique()

WasDST
False     687378
True     1305881
Name: DR Number, dtype: int64

In [20]:
# Number of distinct crime dates in each DST group; the same expression is the
# denominator of the daily rates computed in the following cells.
crime_data.groupby('WasDST')['Crime Date'].nunique()

WasDST
False    1213
True     2247
Name: Crime Date, dtype: int64

In [21]:
# Overall crimes per day for standard time vs. daylight time:
# record count in each group divided by the distinct dates in that group.
(
    crime_data.groupby(['WasDST']).size()
    .div(crime_data.groupby('WasDST')['Crime Date'].nunique())
    .rename('Daily Crime Rate')
    .to_frame()
    .plot.bar()
)

In [22]:
# Records per (crime type, DST flag) and distinct dates per DST flag.
dst_type_counts = crime_data.groupby(['Crime Code Description', 'WasDST']).size()
dst_day_counts = crime_data.groupby('WasDST')['Crime Date'].nunique()

# Daily rate per crime type; the division aligns on the shared 'WasDST' level.
crime_rates_dst = (dst_type_counts / dst_day_counts).reset_index(name='Crime rate per day')

# One row per crime type, with the two DST states side by side.
crime_rates_dst = crime_rates_dst.pivot(
    index='Crime Code Description',
    columns='WasDST',
    values='Crime rate per day',
).reset_index()
# Positional relabel: the pivoted columns come out as False, then True.
crime_rates_dst.columns = ['Crime Code Description', 'Standard Time', 'Daylight Time']
crime_rates_dst

Unnamed: 0,Crime Code Description,Standard Time,Daylight Time
0,ABORTION/ILLEGAL,0.001649,0.002225
1,ARSON,0.928277,0.963062
2,ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER,0.392415,0.474410
3,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",22.991756,26.230530
4,ATTEMPTED ROBBERY,3.190437,3.333778
...,...,...,...
135,VEHICLE - STOLEN,44.906843,43.235425
136,VIOLATION OF COURT ORDER,5.124485,5.732532
137,VIOLATION OF RESTRAINING ORDER,4.895301,5.344904
138,VIOLATION OF TEMPORARY RESTRAINING ORDER,0.360264,0.411215


In [23]:
# Keep crime types that exceed 2 incidents per day in at least one of the two periods.
crime_rates_dst = crime_rates_dst[
    crime_rates_dst[['Standard Time', 'Daylight Time']].gt(2).any(axis=1)
]

In [24]:
# Crime types whose daylight-time rate is below 90% of the standard-time rate,
# i.e. a drop of more than 10%.
low_on_dst = crime_rates_dst[
    crime_rates_dst['Daylight Time'] < crime_rates_dst['Standard Time'] * 0.90
]
low_on_dst

Unnamed: 0,Crime Code Description,Standard Time,Daylight Time
116,THEFT OF IDENTITY,37.905194,33.313752


In [25]:
# Pass the crime types that dropped by more than 10% during DST, plus the full
# rate table, to plot_daylight_crime_rate. The helper is not defined in this
# notebook; presumably it plots the two rates per crime type -- TODO confirm.
plot_daylight_crime_rate(low_on_dst['Crime Code Description'].to_list(), crime_rates_dst)

### From above, the crime rate decreases by more than 10% during DST for identity theft crimes
### We fail to reject hypothesis 3

## Hypothesis 4: Crime Rate for some crimes increases more than 10% on a full moon night

In [26]:
# Flag each crime record by whether its date is a member of moon_dates.
crime_data['Full_Moon'] = crime_data['Crime Date'].map(
    lambda crime_date: crime_date in moon_dates
)

In [27]:
# Overall crimes per day on full-moon dates vs. all other dates:
# record count in each group divided by the distinct dates in that group.
(
    crime_data.groupby(['Full_Moon']).size()
    .div(crime_data.groupby('Full_Moon')['Crime Date'].nunique())
    .rename('Daily Crime Rate')
    .to_frame()
    .plot.bar()
)

In [28]:
# Records per (full-moon flag, crime type) and distinct dates per full-moon flag.
moon_type_counts = crime_data.groupby(['Full_Moon', 'Crime Code Description']).size()
moon_day_counts = crime_data.groupby('Full_Moon')['Crime Date'].nunique()

# Daily rate per crime type; the division aligns on the shared 'Full_Moon' level.
crime_rates_moon = (moon_type_counts / moon_day_counts).reset_index(name='Crime rate per day')

# One row per crime type, with the two moon states side by side.
crime_rates_moon = crime_rates_moon.pivot(
    index='Crime Code Description',
    columns='Full_Moon',
    values='Crime rate per day',
).reset_index()
# Positional relabel: the pivoted columns come out as False, then True.
crime_rates_moon.columns = ['Crime Code Description', 'Non_Full_Moon', 'Full_Moon']
crime_rates_moon

Unnamed: 0,Crime Code Description,Non_Full_Moon,Full_Moon
0,ABORTION/ILLEGAL,0.002094,
1,ARSON,0.952139,0.914530
2,ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER,0.447801,0.384615
3,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",25.133712,23.991453
4,ATTEMPTED ROBBERY,3.280586,3.367521
...,...,...,...
135,VEHICLE - STOLEN,43.838468,43.333333
136,VIOLATION OF COURT ORDER,5.522884,5.418803
137,VIOLATION OF RESTRAINING ORDER,5.195334,4.957265
138,VIOLATION OF TEMPORARY RESTRAINING ORDER,0.390966,0.461538


In [29]:
# Keep crime types that exceed 2 incidents per day in at least one of the two groups.
crime_rates_moon = crime_rates_moon[
    crime_rates_moon[['Non_Full_Moon', 'Full_Moon']].gt(2).any(axis=1)
]

In [30]:
# Crime types whose full-moon rate is more than 10% above the rate on other dates.
high_on_fm = crime_rates_moon[
    crime_rates_moon['Non_Full_Moon'] * 1.1 < crime_rates_moon['Full_Moon']
]
high_on_fm

Unnamed: 0,Crime Code Description,Non_Full_Moon,Full_Moon


### From above, no crime type's rate increases by more than 10% on a full moon night
### Hence, we reject hypothesis 4