# SF crime analysis by police district, using plotly

This notebook analyzes San Francisco crime by police district using plotly, an interactive plotting library. It also looks at how crime varies over time (by hour, day of week, and month).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
import plotly
import plotly.tools as tls
import plotly.plotly as py
from plotly.graph_objs import Bar, Scatter, Marker, Layout 

In [9]:
import os

# Read the Plotly credentials from the environment instead of embedding them in
# the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel; a missing variable raises KeyError here rather than failing later.
# NOTE(review): the API key that used to be hard-coded in this cell has been
# published with the notebook and should be regenerated.
plotly_username = os.environ['PLOTLY_USERNAME']
plotly_api_key = os.environ['PLOTLY_API_KEY']

py.sign_in(plotly_username, plotly_api_key)
tls.set_credentials_file(username=plotly_username, api_key=plotly_api_key)

In [3]:
# Load the training set from train.csv (path is relative to the working directory).
train = pd.read_csv('train.csv')
# Bare last expression: the notebook displays the (rows, columns) tuple.
train.shape

(878049, 9)

In [4]:
# Replace the Dates column with parsed datetimes so the .dt accessor can be used on it.
train['Dates'] = pd.to_datetime(train['Dates'])
# Summary of the parsed column (count, unique, top, freq, first, last).
train['Dates'].describe()

count                  878049
unique                 389257
top       2011-01-01 00:01:00
freq                      185
first     2003-01-06 00:01:00
last      2015-05-13 23:53:00
Name: Dates, dtype: object

In [5]:
# Add one column per calendar component of Dates, each named after the
# Series.dt attribute it is read from.
for date_part in ('year', 'month', 'day', 'dayofweek', 'hour'):
    train[date_part] = getattr(train['Dates'].dt, date_part)

In [6]:
# Incidents per category; value_counts() already returns a Series ordered from
# the most to the least frequent category.
crime_counts = train['Category'].value_counts()
crime_counts

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [7]:
# crime_counts is ordered by descending count, so its first ten index labels
# are the ten most frequent categories.
top10_crime = crime_counts.index[:10]
# Keep only the incidents that belong to one of those categories.
train_subset = train[train['Category'].isin(top10_crime)]
# Parenthesised print: same output on Python 2, and valid syntax on Python 3.
print(train_subset.shape)
train_subset['Category'].value_counts()

(733122, 14)


LARCENY/THEFT     174900
OTHER OFFENSES    126182
NON-CRIMINAL       92304
ASSAULT            76876
DRUG/NARCOTIC      53971
VEHICLE THEFT      53781
VANDALISM          44725
WARRANTS           42214
BURGLARY           36755
SUSPICIOUS OCC     31414
Name: Category, dtype: int64

# Interactive barplot of crime counts by category

Plotly is a great tool for interactive plotting. It is helpful to have both visual bars and numbers at the same time.

In [10]:
# One bar per category: names come from the index of crime_counts, bar heights
# from its values.
category_names = crime_counts.index
category_totals = crime_counts.values
data = [Bar(x=category_names, y=category_totals)]
layout = Layout(
    title='Counts of crime by Category',
    margin=dict(b=150),
    xaxis={'tickangle': -45, 'tickfont': {'size': 10}})
# py.iplot sends the figure to the signed-in plotly account under this filename.
py.iplot(dict(data=data, layout=layout), filename='sfcrime_1')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~lingcheng99/0 or inside your plot.ly account where it is named 'sfcrime_1'


In [11]:
# Embed the figure hosted at this plot.ly URL (the address reported when the
# category bar chart was uploaded) in the notebook output.
tls.embed("https://plot.ly/~lingcheng99/0")

I like these interactive bar plots because they show the numbers and the visual at the same time: value_counts() lists the numbers without a visual, while a regular bar plot shows the visual but makes the exact numbers hard to read.

# Barplot of crime in each district

In [28]:
# Incidents per police district, most frequent district first.
district_counts = train['PdDistrict'].value_counts()
data = [Bar(x=district_counts.index, y=district_counts.values)]
layout = Layout(
    title='Counts of Crime by Police District',
    margin=dict(b=150))
# NOTE(review): bare `iplot` is not imported in any visible cell (only
# `py.iplot` is) — confirm which import provides it before a fresh-kernel run.
iplot(dict(data=data, layout=layout))

# Stacked barplot of crime category and police district

In [68]:
# Districts in order of first appearance in the data.
districts = train['PdDistrict'].unique().tolist()
traces = []  # the series in the graph - one trace for each police district

for district in districts:
    # Number of incidents per category inside this district; after
    # reset_index() the counts live in the column labelled 0.
    per_category = (train.loc[train['PdDistrict'] == district]
                    .groupby('Category')
                    .size()
                    .reset_index())
    traces.append(Bar(x=per_category['Category'], y=per_category[0], name=district))

layout = Layout(
    title='Crime counts by district',
    barmode='stack',
    margin=dict(b=150),
    xaxis=dict(tickangle=40))
iplot(dict(data=traces, layout=layout))

In [67]:
# Normalize crime counts: rescale every trace so that its bars add up to 100.
# This overwrites the y values of the existing `traces` in place, so they no
# longer hold raw counts afterwards.
for trace in traces:
    trace_total = sum(trace['y'])
    trace['y'] = 100. * trace['y'] / trace_total

# autorange is switched off and the range is fixed, so only x positions
# -0.5 .. 16 are inside the initial view.
relative_layout = Layout(
    title='Relative Number of crimes by district',
    barmode='group',
    margin=dict(b=150),
    xaxis=dict(tickangle=40, autorange=False, range=[-0.5, 16]),
    yaxis=dict(title='Percent of crime by district'))
iplot(dict(data=traces, layout=relative_layout))

In [65]:
# Categories in order of first appearance in the data.
crimes = train['Category'].unique().tolist()
traces = []  # the series in the graph - one trace for each crime category

for crime in crimes:
    # Number of incidents of this category per district; after reset_index()
    # the counts live in the column labelled 0.
    per_district = (train.loc[train['Category'] == crime]
                    .groupby('PdDistrict')
                    .size()
                    .reset_index())
    traces.append(Bar(x=per_district['PdDistrict'], y=per_district[0], name=crime))

layout = Layout(
    title='All crime counts in each police district',
    barmode='stack',
    margin=dict(b=150),
    xaxis=dict(tickangle=40))
iplot(dict(data=traces, layout=layout))

In [64]:
# Same stacked plot as for all categories, restricted to train_subset (the ten
# most frequent categories) so the stack stays readable.
crimes = train_subset['Category'].unique().tolist()
traces = []  # the series in the graph - one trace for each crime category

for crime in crimes:
    # Incidents of this category per district; counts are in column 0.
    per_district = (train_subset.loc[train_subset['Category'] == crime]
                    .groupby('PdDistrict')
                    .size()
                    .reset_index())
    traces.append(Bar(x=per_district['PdDistrict'], y=per_district[0], name=crime))

layout = Layout(
    title='Top 10 crime counts in each police district',
    barmode='stack',
    margin=dict(b=150),
    xaxis=dict(tickangle=40))
iplot(dict(data=traces, layout=layout))

In [73]:
# Top-10 crimes per police district, expressed as percentages.
crimes = list(train_subset.Category.unique())
traces = []  # the series in the graph - one trace for each crime category

for crime in crimes:
    t1 = train_subset[train_subset['Category']==crime]
    # Incidents of this category per district; counts are in column 0.
    t2 = t1.groupby('PdDistrict').size().reset_index()
    traces.append(Bar(x=t2['PdDistrict'], y=t2[0], name=crime))

# Normalize within each trace, i.e. within each crime category: y becomes the
# percentage of that category's incidents that were recorded in each district.
for trace in traces:
    trace['y'] = 100.*trace['y']/sum(trace['y'])

# The x axis holds districts here, so size the fixed range from the number of
# distinct districts instead of the constant 16 used for the category axis.
n_districts = train_subset['PdDistrict'].nunique()

iplot({'data': traces,
          'layout': Layout(
                barmode='group',
                xaxis={'tickangle': 40, 'autorange': False,
                       'range': [-0.5, n_districts - 0.5]},
                yaxis={'title': 'Percent of crime by district'},
                margin={'b': 150},
                title='Relative Number of top 10 crimes by district')
         })
    

# Stacked barplot of crime by hour

In [48]:
def plotly_crime(df, col, target_range, normalized=False, top_n=10):
    """Build bar-trace dicts for the most frequent crime categories per time slot.

    For every value in ``target_range`` the rows of ``df`` whose ``col`` equals
    that value are counted per ``'Category'``, and the ``top_n`` categories with
    the highest counts in that slot are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Category'`` column and the column named by ``col``.
    col : str
        Column to slice on, e.g. ``'hour'``, ``'dayofweek'`` or ``'month'``.
    target_range : iterable
        The values of ``col`` to include, in plotting order (e.g. ``range(24)``
        for hours, ``range(7)`` for day of week, ``range(1, 13)`` for months).
    normalized : bool, optional
        If True, each value is the category's share of all rows in that slot,
        as a fraction between 0 and 1 (not multiplied by 100).
    top_n : int, optional
        How many categories to keep per slot (default 10).

    Returns
    -------
    list of dict
        One ``{'x', 'y', 'name', 'type': 'bar'}`` dict per category that made
        the top ``top_n`` in at least one slot. ``x`` is ``target_range`` as a
        list; ``y`` is aligned with it and holds ``None`` where the category
        was not in the top ``top_n`` for that slot. Traces are ordered by first
        appearance.
    """
    # Materialise once: the values are iterated twice and a list is what the
    # figure dict should carry as x.
    x_values = list(target_range)

    values_by_crime = {}  # category -> {slot value: count or fraction}
    crime_order = []      # categories in order of first appearance

    for item in x_values:
        counts = df.loc[df[col] == item].groupby('Category').size()
        # Denominator for normalization: every row in this slot.
        slot_total = float(counts.sum())
        # Select by rank, not by index label, so these really are the largest
        # categories; head() also copes with fewer than top_n categories.
        top_counts = counts.sort_values(ascending=False).head(top_n)

        for crime, count in zip(top_counts.index, top_counts.tolist()):
            if normalized:
                count = count / slot_total
            if crime not in values_by_crime:
                values_by_crime[crime] = {}
                crime_order.append(crime)
            values_by_crime[crime][item] = count

    return [
        {
            'x': x_values,
            'y': [values_by_crime[crime].get(item) for item in x_values],
            'name': crime,
            'type': 'bar',
        }
        for crime in crime_order
    ]

In [49]:
# The hour column comes from Series.dt.hour and therefore runs 0-23; range(24)
# covers every hour, whereas range(1, 24) left out midnight (hour 0).
hour_traces = plotly_crime(train_subset,'hour',range(24))

In [50]:
# Stack the per-category bars so that each hour's bar shows the total count.
hour_layout = dict(
    title='The 10 Most Common crimes by Hour in a Day',
    barmode='stack',
    xaxis=dict(title='Hour in Day'),
    yaxis=dict(title='Number of crimes'))
iplot(dict(data=hour_traces, layout=hour_layout))

In [51]:
# The hour column runs 0-23 (Series.dt.hour), so use range(24); range(1, 24)
# left out midnight.
hour_traces2 = plotly_crime(train_subset,'hour',range(24),normalized=True)

# plotly_crime(normalized=True) yields fractions between 0 and 1; convert them
# to percent so the plotted values match the "Percentage" axis title. None
# marks a category without a value for that hour and is kept as is.
for trace in hour_traces2:
    trace['y'] = [None if share is None else 100. * share for share in trace['y']]

iplot({
    'data': hour_traces2,
    'layout': {
        'barmode': 'stack',
        'xaxis': {'title': 'Hour in Day'},
        'yaxis': {'title': 'Percentage of crimes'},
        'title': 'The 10 Most Common crimes in percentage by Hour in a Day'
    }})

# Stacked barplot of crime by dayofweek

In [52]:
# Series.dt.dayofweek values are 0-6, hence range(7).
week_traces = plotly_crime(train_subset, 'dayofweek', range(7))
week_layout = dict(
    title='The 10 Most Common crimes by day of week',
    barmode='stack',
    xaxis=dict(title='Day of Week'),
    yaxis=dict(title='Number of crimes'))
iplot(dict(data=week_traces, layout=week_layout))

In [53]:
week_traces2 = plotly_crime(train_subset,'dayofweek',range(7),normalized=True)

# plotly_crime(normalized=True) yields fractions between 0 and 1; convert them
# to percent so the plotted values match the "Percentage" axis title. None
# marks a category without a value for that day and is kept as is.
for trace in week_traces2:
    trace['y'] = [None if share is None else 100. * share for share in trace['y']]

iplot({
    'data': week_traces2,
    'layout': {
        'barmode': 'stack',
        'xaxis': {'title': 'Day of Week'},
        'yaxis': {'title': 'Percentage of crimes'},
        'title': 'The 10 Most Common crimes in percentage by day of week'
    }})

# Stacked barplot of crime by month

In [55]:
# Series.dt.month values are 1-12, hence range(1, 13).
month_traces = plotly_crime(train_subset, 'month', range(1, 13))

month_layout = dict(
    title='The 10 Most Common crimes by month',
    barmode='stack',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Number of crimes'))
iplot(dict(data=month_traces, layout=month_layout))

In [57]:
month_traces2 = plotly_crime(train_subset,'month',range(1,13),normalized=True)

# plotly_crime(normalized=True) yields fractions between 0 and 1; convert them
# to percent so the plotted values match the "Percentage" axis title. None
# marks a category without a value for that month and is kept as is.
for trace in month_traces2:
    trace['y'] = [None if share is None else 100. * share for share in trace['y']]

iplot({
    'data': month_traces2,
    'layout': {
        'barmode': 'stack',
        'xaxis': {'title': 'Month'},
        'yaxis': {'title': 'Percentage of crimes'},
        # This figure is sliced by month; the previous title said "day of week".
        'title': 'The 10 Most Common crimes in percentage by month'
    }})