In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from datetime import datetime
from collections import Counter

In [2]:
df = pd.read_csv('Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')
print(df.shape)
df.head()

(1048575, 13)


Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,1/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006000000000.0
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,2/1/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009800000000.0
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,2/1/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009800000000.0
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,2/1/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009800000000.0
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,1/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009800000000.0


## Sanity check

### 1. Check if there are duplicates in incident_num

To-dos:
* To calculate total number of crimes, make sure not to double count
* Double counting is alright if calculating based on crime types but delete if both incident_num and category are the same

In [3]:
Counter(df['IncidntNum'])

Counter({150060275: 1,
         150098210: 3,
         150098226: 1,
         150098232: 1,
         150098248: 2,
         150098254: 1,
         150098260: 1,
         150098345: 4,
         150098367: 1,
         150098373: 2,
         150098389: 1,
         150098395: 1,
         150098408: 1,
         150098414: 4,
         160919032: 1,
         150098420: 3,
         150098436: 1,
         150098442: 1,
         150098458: 3,
         140009459: 2,
         150098464: 1,
         150098470: 1,
         150098486: 1,
         150098492: 2,
         150098505: 1,
         150098511: 1,
         150098527: 3,
         150098533: 1,
         150098549: 1,
         150098555: 1,
         150098561: 1,
         150098577: 1,
         150098583: 1,
         150098599: 1,
         150098602: 3,
         150098618: 1,
         150098624: 1,
         150098630: 1,
         150098652: 2,
         150098668: 1,
         150098674: 3,
         150098680: 1,
         150098696: 1,
         15

In [4]:
'''check random incident_num with non-1s count to see reason for duplicates
    if incident_num and category are the same, then remove duplicates (later)
'''
df[df['IncidntNum']==150098345]

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
10,150098345,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,Sunday,2/1/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)",15009800000000.0
11,150098345,DRUG/NARCOTIC,POSSESSION OF METH-AMPHETAMINE,Sunday,2/1/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)",15009800000000.0
12,150098345,DRUG/NARCOTIC,POSSESSION OF NARCOTICS PARAPHERNALIA,Sunday,2/1/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)",15009800000000.0
13,150098345,WARRANTS,WARRANT ARREST,Sunday,2/1/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)",15009800000000.0


### 2. Check the unique category of crimes
To-dos:
* Group similar crimes together

In [5]:
# check set of crimes
print(len(set(df['Category'])))
set(df['Category'])

39


{'ARSON',
 'ASSAULT',
 'BAD CHECKS',
 'BRIBERY',
 'BURGLARY',
 'DISORDERLY CONDUCT',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'DRUNKENNESS',
 'EMBEZZLEMENT',
 'EXTORTION',
 'FAMILY OFFENSES',
 'FORGERY/COUNTERFEITING',
 'FRAUD',
 'GAMBLING',
 'KIDNAPPING',
 'LARCENY/THEFT',
 'LIQUOR LAWS',
 'LOITERING',
 'MISSING PERSON',
 'NON-CRIMINAL',
 'OTHER OFFENSES',
 'PORNOGRAPHY/OBSCENE MAT',
 'PROSTITUTION',
 'RECOVERED VEHICLE',
 'ROBBERY',
 'RUNAWAY',
 'SECONDARY CODES',
 'SEX OFFENSES, FORCIBLE',
 'SEX OFFENSES, NON FORCIBLE',
 'STOLEN PROPERTY',
 'SUICIDE',
 'SUSPICIOUS OCC',
 'TREA',
 'TRESPASS',
 'VANDALISM',
 'VEHICLE THEFT',
 'WARRANTS',
 'WEAPON LAWS'}

## 1. Reduce number of crime categories

Group the criminal activities into the following categories:
1. theft
2. public disturbance
3. drug
4. sex
5. suicide
6. kidnap
7. domestic
8. other
9. non-criminal (can delete)

In [6]:
theft = ['BRIBERY',
        'BURGLARY',
        'EMBEZZLEMENT',
        'EXTORTION',
        'FORGERY/COUNTERFEITING',
        'FRAUD',
        'GAMBLING',
        'LARCENY/THEFT',
        'RECOVERED VEHICLE',
        'ROBBERY',
        'TREA',
        'VEHICLE THEFT',
        'STOLEN PROPERTY',
        'ARSON']

public = ['TRESPASS',
        'VANDALISM',
        'WARRANTS',
        'LOITERING',
        'ASSAULT',
        'BAD CHECKS',
        'DISORDERLY CONDUCT',
        'DRIVING UNDER THE INFLUENCE',
        'DRUNKENNESS',
        'SUSPICIOUS OCC',
         'LIQUOR LAWS',
         'WEAPON LAWS']

drug =  ['DRUG/NARCOTIC']

sex = ['PORNOGRAPHY/OBSCENE MAT',
        'PROSTITUTION',
        'SEX OFFENSES, FORCIBLE',
        'SEX OFFENSES, NON FORCIBLE']

suicide = ['SUICIDE']

kidnap = ['KIDNAPPING']

domestic = ['MISSING PERSON',
            'RUNAWAY',
            'FAMILY OFFENSES']

others = ['OTHER OFFENSES',
        'SECONDARY CODES']

non_criminal = ['NON-CRIMINAL']

In [7]:
replace_dict = {
                "theft": theft,
                "public": public,
                "drug": drug,
                "sex": sex,
                "suicide": suicide,
                "kidnap":kidnap,
                "domestic": domestic,
                "others": others,
                "non_criminal": non_criminal
               }

In [8]:
new_cat = df['Category']

for key, val in replace_dict.items():
    new_cat = new_cat.replace(val, key)

df["new_cat"] = new_cat

In [9]:
set(df["new_cat"])

{'domestic',
 'drug',
 'kidnap',
 'non_criminal',
 'others',
 'public',
 'sex',
 'suicide',
 'theft'}

In [10]:
df.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId,new_cat
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,1/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006000000000.0,non_criminal
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,2/1/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009800000000.0,theft
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,2/1/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009800000000.0,public
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,2/1/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009800000000.0,others
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,1/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009800000000.0,public


In [11]:
# drop redundant columns
df.drop(columns=['Category','Descript','Time','Resolution','Address','X','Y','Location','PdId'], inplace=True)

In [12]:
df.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,PdDistrict,new_cat
0,150060275,Monday,1/19/2015,MISSION,non_criminal
1,150098210,Sunday,2/1/2015,TENDERLOIN,theft
2,150098210,Sunday,2/1/2015,TENDERLOIN,public
3,150098210,Sunday,2/1/2015,TENDERLOIN,others
4,150098226,Tuesday,1/27/2015,NORTHERN,public


In [14]:
df.shape

(1048575, 5)

In [15]:
# remove non_criminal
df = df[df['new_cat']!='non_criminal']
df.shape

(934679, 5)

## 2. Select data between 2012-2017

In [22]:
df['year'] = df['Date'].map(lambda x: pd.to_datetime(x).year)

In [35]:
set(df['year'])

{2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017}

In [36]:
df = df[(df['year']>=2012) & (df['year']<=2017)]

In [37]:
set(df['year'])

{2012, 2013, 2014, 2015, 2016, 2017}

## 3. Count number of unique crimes each day

In [38]:
dropdup = df.drop_duplicates(subset='IncidntNum', keep="last")
dropdup.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,PdDistrict,new_cat,year
3,150098210,Sunday,2/1/2015,TENDERLOIN,others,2015
4,150098226,Tuesday,1/27/2015,NORTHERN,public,2015
7,150098248,Saturday,1/31/2015,BAYVIEW,public,2015
8,150098254,Saturday,1/31/2015,CENTRAL,theft,2015
9,150098260,Saturday,1/31/2015,CENTRAL,theft,2015


In [40]:
dropdup.shape

(273421, 6)

In [55]:
crime_df = dropdup.groupby(['Date', 'DayOfWeek'])['IncidntNum'].agg(['count']).reset_index()
np.sum(crime_df['count'].values)    # sanity check

273421

In [59]:
crime_df.rename(columns={"count": "num_unique_crimes"}, inplace=True)
crime_df.head()

Unnamed: 0,Date,DayOfWeek,num_unique_crimes
0,1/1/2012,Sunday,212
1,1/1/2013,Tuesday,233
2,1/1/2014,Wednesday,311
3,1/1/2015,Thursday,133
4,1/1/2016,Friday,1


## 4. Count number of each crime types

In [60]:
pd.get_dummies(df, columns=['PdDistrict','new_cat'])

Unnamed: 0,IncidntNum,DayOfWeek,Date,year,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,...,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,new_cat_domestic,new_cat_drug,new_cat_kidnap,new_cat_others,new_cat_public,new_cat_sex,new_cat_suicide,new_cat_theft
1,150098210,Sunday,2/1/2015,2015,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,150098210,Sunday,2/1/2015,2015,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,150098210,Sunday,2/1/2015,2015,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,150098226,Tuesday,1/27/2015,2015,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
6,150098248,Saturday,1/31/2015,2015,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,150098248,Saturday,1/31/2015,2015,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,150098254,Saturday,1/31/2015,2015,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,150098260,Saturday,1/31/2015,2015,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10,150098345,Sunday,2/1/2015,2015,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
11,150098345,Sunday,2/1/2015,2015,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
