In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [16,10]

import datetime

import seaborn as sns
import folium


In [None]:
# Read the raw incident export, decoding the file as Latin-1.
df = pd.read_csv(
    'crime.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first five rows of the raw data.
df.head(5)

We will keep only the UCR 'Part One' crimes, which are the most serious offenses. This narrows the data from 67 different kinds of crimes down to 9.

In [None]:
# Keep only the 'Part One' offenses, then drop the columns that are not
# needed for the rest of the analysis.
is_part_one = df['UCR_PART'] == 'Part One'
df = (
    df.loc[is_part_one]
    .drop(columns=['OFFENSE_CODE', 'UCR_PART', 'Location'])
)

In [None]:
# Parse the occurrence timestamp so the .dt accessor can be used on it later.
df = df.assign(OCCURRED_ON_DATE=pd.to_datetime(df['OCCURRED_ON_DATE']))

In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

The district, shooting, street, lat, and long columns all have NaN values; let's take care of those.

In [None]:
# Most crimes don't involve shootings, so a missing flag is treated as 'N' (no).
# Assign the result back to the column instead of calling fillna(inplace=True)
# on `df.SHOOTING`: an in-place call on a Series pulled out of the frame is not
# guaranteed to update the frame (it is a no-op under pandas Copy-on-Write).
df['SHOOTING'] = df['SHOOTING'].fillna('N')

In [None]:
# Recompute DAY_OF_WEEK from the timestamp as an integer (Monday=0 ... Sunday=6),
# so grouping and plotting by day follow calendar order rather than the
# alphabetical order of day names.
df['DAY_OF_WEEK'] = df['OCCURRED_ON_DATE'].dt.dayofweek

In [None]:
# Replace the -1 placeholder in Lat/Long with NaN.
# np.nan is used instead of None: depending on the pandas version,
# replace(-1, None) either forward-fills from the previous row or inserts
# Python None objects (turning the column into object dtype); neither gives
# a proper missing value. The result is assigned back to the column because
# an in-place call on `df.Lat` is not guaranteed to update the frame.
df['Lat'] = df['Lat'].replace(-1, np.nan)
df['Long'] = df['Long'].replace(-1, np.nan)

In [None]:
# Map the original upper-case column names to shorter names that are easier to type.
rename = dict(
    INCIDENT_NUMBER='id',
    OFFENSE_CODE_GROUP='Group',
    OFFENSE_DESCRIPTION='Description',
    DISTRICT='District',
    REPORTING_AREA='Area',
    SHOOTING='Shooting',
    OCCURRED_ON_DATE='Date',
    YEAR='Year',
    MONTH='Month',
    DAY_OF_WEEK='Day',
    HOUR='Hour',
    STREET='Street',
)
# Note: index=str also converts every row label to a string.
df = df.rename(index=str, columns=rename)

We don't really care too much about which street the crime occurs on. There are too many different streets in the entirety of Boston, and this general location information is already somewhat encoded in other variables such as lat, lon, and district. We will not worry about filling out the NaN values for that column. 

In [None]:
# Count the missing values remaining in each column.
df.isna().sum()

If anything, since we aren't too concerned with modeling some behavior, and since we have more than 61 thousand entries, we can just drop all of the entries where there are null values.

In [None]:
# Discard incidents that have no latitude.
df = df.dropna(subset=['Lat'])

In [None]:
# Discard incidents that have no district.
df = df.dropna(subset=['District'])

In [None]:
# The street name is not used in this analysis, so remove the column.
df = df.drop(columns=['Street'])

In [None]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
df.info()

We still end up with 58.8k fully filled entries for the crimes in Boston from June 14, 2015 to September 3, 2018.

In [None]:
# Horizontal count plot of offense groups, most frequent group first.
groups_by_frequency = df['Group'].value_counts().index
sns.catplot(
    data=df,
    y='Group',
    kind='count',
    order=groups_by_frequency,
    height=8,
    aspect=1.5,
)

Larceny seems to be the most common type of serious crime occurring in Boston.

In [None]:
# Crime counts per hour of day, day of week and month, side by side on a shared y-axis.
fig, ax = plt.subplots(ncols=3, sharey=True)
ax[0].plot(df.groupby('Hour')['id'].count(), 'bo-', lw=2, alpha=0.7)
ax[1].plot(df.groupby('Day')['id'].count(), 'go-', lw=2, alpha=0.7)
ax[2].plot(df.groupby('Month')['id'].count(), 'ro-', lw=2, alpha=0.7)
ax[0].set_xlabel('Hour of Day')
ax[1].set_xlabel('Day of Week')
ax[2].set_xlabel('Month of Year')
# 'Day' holds integers (Monday=0 ... Sunday=6); label the ticks with weekday names.
ax[1].set_xticks(range(7))
ax[1].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
# The data are crime incidents, so the labels must say crimes (they previously said "Rides").
ax[0].set_ylabel('Number of Crimes')
fig.suptitle('Number of Crimes over Hour/Day/Month increments')
plt.show()

Over the course of a day, the number of crimes is lowest at 5am and gradually rises to a peak near 6pm. During the week, the highest number of crimes occurs on Friday. As for the month, crime peaks during July. 

### Visualizing the crimes with a scatter plot

In [None]:
# Scatter of incident locations coloured by district.
# Longitude goes on the x-axis and latitude on the y-axis so the picture has the
# usual map orientation (north up, east right); the axes were previously swapped.
# The very low alpha lets dense areas show up as darker regions.
sns.scatterplot(x='Long',
                y='Lat',
                hue='District',
                alpha=0.01,
                data=df)
# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

The highest number of crimes seems to happen in districts A1 and D4, which are the most crowded downtown areas of Boston. There is also an unusually high concentration of crimes occurring in district D14.

### Let's visualize the history of the crimes happening throughout the day in Boston using folium heatmaps

In [None]:
# Derive the hour of day from the timestamp.
df['Hour'] = df['Date'].dt.hour

# Build the heat-map input in one chain so that `heat_df` is an independent frame
# (no assignment into a slice of `df`):
#   - keep only the columns the map needs,
#   - take a random sample of at most 5k rows; the fixed random_state makes the
#     sample, and therefore the map, reproducible across runs,
#   - make sure the coordinates are floats,
#   - add a float 'Weight' column holding the hour, used to split points per hour,
#   - drop rows that are missing any of the required values.
heat_df = (
    df[['Lat', 'Long', 'Hour']]
    .sample(n=min(5000, len(df)), random_state=42)
    .astype({'Lat': float, 'Long': float})
    .assign(Weight=lambda frame: frame['Hour'].astype(float))
    .dropna(axis=0, subset=['Lat', 'Long', 'Weight'])
)

In [None]:
import folium.plugins as plugins

# Base map centred on Boston.
# NOTE(review): the 'Stamen Terrain' tiles used previously are not available as a
# built-in tile set in recent folium releases, so a built-in style is used
# instead - confirm against the installed folium version.
boston_heatmap = folium.Map(location=[42.3125, -71.0875], tiles='CartoDB positron', zoom_start=12)

# One list of [lat, long] points per hour of the day (0-23); each inner list
# becomes one frame of the animation. Selecting the two columns and converting
# them in bulk avoids a slow row-by-row iterrows() loop.
hours = range(24)
heat_data = [
    heat_df.loc[heat_df['Weight'] == hour, ['Lat', 'Long']].values.tolist()
    for hour in hours
]

# Plot it on the map; `index` labels each frame with its hour of day.
hm = plugins.HeatMapWithTime(
    heat_data,
    index=['{:02d}:00'.format(hour) for hour in hours],
    auto_play=True,
    max_opacity=0.9,
)
hm.add_to(boston_heatmap)

# Display the map
boston_heatmap

- Larceny is the most common type of serious crime.
- Serious crimes are most likely to occur in the afternoon and evening.
- Serious crimes are most likely to occur on Friday and least likely to occur on Sunday.
- Serious crimes are most likely to occur in the summer and early fall, and least likely to occur in the winter (with the exception of January, which has a crime rate more similar to the summer).
- Serious crimes are most common in the city center, especially districts A1 and D4.

These observations only pertain to the "serious" crimes categorized under Part One of the UCR codes. Part Two and Part Three may yield different results (obviously shoplifting is going to occur under different scenarios than larceny).

## Crime Time Series

**Let's look at the year 2017 (because we have the full year on record) and see how the crime is trending**

In [None]:
# Keep only the 2017 incidents. The explicit .copy() makes df2 an independent
# frame, so adding columns to it later does not write into a slice of `df`
# (which triggers SettingWithCopyWarning).
df2 = df[df['Year'] == 2017].copy()

In [None]:
# Add the day-of-year number of each incident. assign() returns a new frame,
# so no value is written into a filtered slice of `df`.
df2 = df2.assign(dayofyear=df2['Date'].dt.dayofyear)

In [None]:
# Number of incidents per district and day of year ('id' holds the count).
dailycrimes = (
    df2.groupby(['District', 'dayofyear'])['id']
    .count()
    .reset_index()
)
# sns.tsplot has been removed from seaborn; lineplot draws the same
# one-line-per-district time series. The constant 'unit' column that tsplot
# required is no longer needed.
ax = sns.lineplot(data=dailycrimes, x='dayofyear', y='id', hue='District')
ax.set(xlabel='Day of Year', ylabel='Number of Crimes');

When plotting all of the districts as separate time series of the number of crimes occurring over the course of 2017, it's hard to visualize the trend. Let's just sum them all together and create one big crime time series for Boston. Alternatively, you could single out one district, or merge a few districts, to see their own trends as well. 

In [None]:
# Total number of incidents on each day of 2017, all districts combined.
plt.plot(df2.groupby('dayofyear')['id'].count(), 'bo-', lw=2, alpha=0.7)
plt.xlabel('Day of Year')
# The data are crime incidents, so the labels must say crimes (they previously said "Rides").
plt.ylabel('Number of Crimes')
plt.title('Number of Crimes over the Year')

In [None]:
# Count incidents per distinct timestamp, then expose the counts as a
# one-column frame ('count') indexed by the timestamp (index named 'day').
count = df2.groupby('Date')['id'].count()
series = (
    count
    .rename('count')
    .rename_axis('day')
    .to_frame()
)
series.head()

In [None]:
# Aggregate the per-timestamp counts into one total per calendar day.
daily_bins = series.resample('D')
series = daily_bins.sum()
series.head()

Let's try to see if we can identify some sort of seasonality in the plot above

In [None]:
## ETS Graph (Error Trend Seasonality)

from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the daily counts into trend, seasonal and residual parts.
daily_counts = series['count']
result_ets = seasonal_decompose(daily_counts, model='additive')
result_ets.plot();

While the overall trend is hard to narrow down, there seems to be a strong weekly seasonality going on. 
Fit a polynomial of average weekly trend, subtract it from the original dataset, and model

### Imbalanced Dataset: predicting shootings being involved with crimes in Boston

In [None]:
# Class balance of the shooting flag.
df['Shooting'].value_counts()