# Importing libraries and Dataset

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from datetime import datetime
from collections import Counter

In [133]:
df = pd.read_csv('data/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')
print(df.shape)
df.head()

(2215024, 13)


Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160


## 1. Inspect data for any duplicates

To-dos:
* To calculate total number of crimes, make sure not to double count
* Double counting is alright if calculating based on crime types but delete if both incident_num and category are the same

From the metadata, we understand that `PdId` refers to the unique identifier for use in update and insert operations for the dataset, hence we can drop the column.

In [134]:
df=df.drop(['PdId'], axis=1)

In [135]:
df.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)"
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)"


In [136]:
# check unique no. of rows (corresponding to no. of incident numbers)
print("No. of rows:",df.shape[0])

No. of rows: 2215024


In [137]:
# check unique no. of incidents
print("No. of unique incidents:", df['IncidntNum'].nunique())

No. of unique incidents: 1746914


In [138]:
df.shape[0]-df['IncidntNum'].nunique()

468110

Hence we can conclude that there were 468110 duplicates in incident_num. Let's take a look at an example of a duplicate incident. The logic is that, if the duplicates arise due to the incident being categorized into various incidents of crime, we can retain the double-counting; however, if the duplicates are merely a result of same incident_num and category, we should delete these rows.

In [139]:
incidentNumList=Counter(df['IncidntNum']) # obtain dictionary of all unique key-value pairs 

In [140]:
incidentNumList

Counter({150060275: 1,
         150098210: 3,
         150098226: 1,
         150098232: 1,
         150098248: 2,
         150098254: 1,
         150098260: 1,
         150098345: 4,
         150098367: 1,
         150098373: 2,
         150098389: 1,
         150098395: 1,
         150098408: 1,
         150098414: 4,
         160919032: 1,
         150098420: 3,
         150098436: 1,
         150098442: 1,
         150098458: 3,
         140009459: 2,
         150098464: 1,
         150098470: 1,
         150098486: 1,
         150098492: 2,
         150098505: 1,
         150098511: 1,
         150098527: 3,
         150098533: 1,
         150098549: 1,
         150098555: 1,
         150098561: 1,
         150098577: 1,
         150098583: 1,
         150098599: 1,
         150098602: 3,
         150098618: 1,
         150098624: 1,
         150098630: 1,
         150098652: 2,
         150098668: 1,
         150098674: 3,
         150098680: 1,
         150098696: 1,
         15

In [141]:
allIncidentNumbers=df['IncidntNum'].unique().tolist()

In [142]:
len(incidentNumList) 

1746914

In [143]:
threshold=1
duplicateIncidentList=[]
isolatedIncidentList=[]
for incident, incidentCount in incidentNumList.items():
    if incidentCount>threshold:
        duplicateIncidentList.append(incident)
    elif incidentCount==threshold:
        isolatedIncidentList.append(incident)

# shorter code is this: 
# isolatedIncidents=[incd for incd, incidentCount in incidentNumList.items() if incidentCount == threshold]
# duplicateIncidents=[incd for incd, incidentCount in incidentNumList.items() if incidentCount > threshold]
# sumIsolates=len(isolatedIncidents)
# sumDuplicate=len(duplicateIncidents)

In [144]:
len(duplicateIncidentList)

341334

In [145]:
len(isolatedIncidentList)

1405580

In [146]:
'''check random incident_num with non-1s count to see reason for duplicates
    if incident_num and category are the same, then remove duplicates (later)
'''
df[df['IncidntNum']==150098345]

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
10,150098345,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
11,150098345,DRUG/NARCOTIC,POSSESSION OF METH-AMPHETAMINE,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
12,150098345,DRUG/NARCOTIC,POSSESSION OF NARCOTICS PARAPHERNALIA,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
13,150098345,WARRANTS,WARRANT ARREST,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"


In [147]:
df[df['IncidntNum']==150098210]

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"


In [148]:
# check the original dataframe against 'duplicateIncidentList'
# when 'duplicateIncidentList' matches df['IncidntNum'], then check if the category is the same
# if the categories are the same, retain one row and drop the others
# if the categories are different, retain all rows 

df_duplicates=df[df['IncidntNum'].isin(allIncidentNumbers)].sort_values(by=['IncidntNum'])
df_duplicates_cleaned = df_duplicates.drop_duplicates(subset=['Category','IncidntNum'], keep="first")
df_duplicates_cleaned # this is the final dataframe without duplicates


Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
493897,3979,WARRANTS,WARRANT ARREST,Thursday,12/09/2004,16:22,BAYVIEW,"ARREST, BOOKED",INGALLS ST / HUDSON AV,-122.379598,37.732467,"(37.7324666541275, -122.379598260097)"
1209080,10128,WARRANTS,WARRANT ARREST,Sunday,12/18/2005,22:20,INGLESIDE,"ARREST, BOOKED",ELLSWORTH ST / ALEMANY BL,-122.418969,37.732208,"(37.7322083102079, -122.4189687006)"
1490233,10736,WARRANTS,WARRANT ARREST,Thursday,07/15/2004,10:18,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)"
1393001,38261,WARRANTS,WARRANT ARREST,Thursday,04/17/2003,22:45,NORTHERN,"ARREST, BOOKED",POLK ST / SUTTER ST,-122.420120,37.787757,"(37.7877570602182, -122.420120319211)"
377806,52205,WARRANTS,WARRANT ARREST,Thursday,02/06/2003,07:20,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)"
1333777,61397,WARRANTS,WARRANT ARREST,Sunday,06/29/2003,10:00,TENDERLOIN,"ARREST, BOOKED",100 Block of TURK ST,-122.411593,37.783053,"(37.7830529510782, -122.411593095704)"
596484,62389,VEHICLE THEFT,STOLEN MOTORCYCLE,Tuesday,03/30/2004,11:35,MISSION,NONE,2600 Block of BRYANT ST,-122.409121,37.751979,"(37.7519787472165, -122.409121399225)"
391902,62953,WARRANTS,WARRANT ARREST,Sunday,08/22/2004,18:36,SOUTHERN,"ARREST, BOOKED",COLINPKELLYJR ST / TOWNSEND ST,-122.389758,37.781221,"(37.781221050395, -122.389757937508)"
1485730,70796,WARRANTS,ENROUTE TO OUTSIDE JURISDICTION,Thursday,07/13/2006,08:45,SOUTHERN,"ARREST, BOOKED",900 Block of MISSION ST,-122.407933,37.781506,"(37.7815063666429, -122.407932868203)"
864778,73057,WARRANTS,WARRANT ARREST,Thursday,08/26/2004,12:00,SOUTHERN,"ARREST, BOOKED",800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)"


In [149]:
df=df_duplicates_cleaned # final dataframe
print("No. of rows and columns of final dataframe: ",df.shape)

No. of rows and columns of final dataframe:  (2057667, 12)


## 2. Check the unique category of crimes
To-dos:
* Group similar crimes together

In [150]:
# check set of crimes
print(len(set(df['Category'])))
set(df['Category'])

39


{'ARSON',
 'ASSAULT',
 'BAD CHECKS',
 'BRIBERY',
 'BURGLARY',
 'DISORDERLY CONDUCT',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'DRUNKENNESS',
 'EMBEZZLEMENT',
 'EXTORTION',
 'FAMILY OFFENSES',
 'FORGERY/COUNTERFEITING',
 'FRAUD',
 'GAMBLING',
 'KIDNAPPING',
 'LARCENY/THEFT',
 'LIQUOR LAWS',
 'LOITERING',
 'MISSING PERSON',
 'NON-CRIMINAL',
 'OTHER OFFENSES',
 'PORNOGRAPHY/OBSCENE MAT',
 'PROSTITUTION',
 'RECOVERED VEHICLE',
 'ROBBERY',
 'RUNAWAY',
 'SECONDARY CODES',
 'SEX OFFENSES, FORCIBLE',
 'SEX OFFENSES, NON FORCIBLE',
 'STOLEN PROPERTY',
 'SUICIDE',
 'SUSPICIOUS OCC',
 'TREA',
 'TRESPASS',
 'VANDALISM',
 'VEHICLE THEFT',
 'WARRANTS',
 'WEAPON LAWS'}

## 1. Reduce number of crime categories

Group the criminal activities into the following categories:
1. theft
2. public disturbance
3. drug
4. sex
5. suicide
6. kidnap
7. domestic
8. other
9. non-criminal (can delete)

In [151]:
theft = ['BRIBERY',
        'BURGLARY',
        'EXTORTION',        
        'GAMBLING',
        'LARCENY/THEFT',
        'RECOVERED VEHICLE',
        'ROBBERY',
        'TREA',
        'VEHICLE THEFT',
        'STOLEN PROPERTY',
        'ARSON']

fraud = ['FRAUD',
        'EMBEZZLEMENT',
        'FORGERY/COUNTERFEITING']

public = ['TRESPASS',
        'VANDALISM',
        'WARRANTS',
        'LOITERING',
        'ASSAULT',
        'BAD CHECKS',
        'DISORDERLY CONDUCT',
        'DRIVING UNDER THE INFLUENCE',
        'DRUNKENNESS',
        'SUSPICIOUS OCC',
         'LIQUOR LAWS',
         'WEAPON LAWS']

drug =  ['DRUG/NARCOTIC']

sex = ['PORNOGRAPHY/OBSCENE MAT',
        'PROSTITUTION',
        'SEX OFFENSES, FORCIBLE',
        'SEX OFFENSES, NON FORCIBLE']

suicide = ['SUICIDE']

kidnap = ['KIDNAPPING']

domestic = ['MISSING PERSON',
            'RUNAWAY',
            'FAMILY OFFENSES']

others = ['OTHER OFFENSES',
        'SECONDARY CODES']

non_criminal = ['NON-CRIMINAL']

In [152]:
replace_dict = {
                "theft": theft,
                "fraud": fraud,
                "public": public,
                "drug": drug,
                "sex": sex,
                "suicide": suicide,
                "kidnap":kidnap,
                "domestic": domestic,
                "others": others,
                "non_criminal": non_criminal
               }

In [153]:
new_cat = df['Category']

for key, val in replace_dict.items():
    new_cat = new_cat.replace(val, key)

df["New Category"] = new_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [154]:
set(df["New Category"])

{'domestic',
 'drug',
 'fraud',
 'kidnap',
 'non_criminal',
 'others',
 'public',
 'sex',
 'suicide',
 'theft'}

In [155]:
df.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,New Category
493897,3979,WARRANTS,WARRANT ARREST,Thursday,12/09/2004,16:22,BAYVIEW,"ARREST, BOOKED",INGALLS ST / HUDSON AV,-122.379598,37.732467,"(37.7324666541275, -122.379598260097)",public
1209080,10128,WARRANTS,WARRANT ARREST,Sunday,12/18/2005,22:20,INGLESIDE,"ARREST, BOOKED",ELLSWORTH ST / ALEMANY BL,-122.418969,37.732208,"(37.7322083102079, -122.4189687006)",public
1490233,10736,WARRANTS,WARRANT ARREST,Thursday,07/15/2004,10:18,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)",public
1393001,38261,WARRANTS,WARRANT ARREST,Thursday,04/17/2003,22:45,NORTHERN,"ARREST, BOOKED",POLK ST / SUTTER ST,-122.42012,37.787757,"(37.7877570602182, -122.420120319211)",public
377806,52205,WARRANTS,WARRANT ARREST,Thursday,02/06/2003,07:20,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)",public


In [132]:
# drop redundant columns
df.drop(columns=['Category','Descript','Resolution','Address','X','Y','Location'], inplace=True)
# keep time for now because we will need it to combine it with the sunset/sunrise timings

In [97]:
df.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,PdDistrict,new_cat
493897,3979,Thursday,12/09/2004,BAYVIEW,public
1209080,10128,Sunday,12/18/2005,INGLESIDE,public
1490233,10736,Thursday,07/15/2004,SOUTHERN,public
1393001,38261,Thursday,04/17/2003,NORTHERN,public
377806,52205,Thursday,02/06/2003,SOUTHERN,public


In [156]:
df.shape

(2057667, 13)

In [158]:
# remove non_criminal
df = df[df["New Category"]!='non_criminal']
df.shape

(1823419, 13)

## 2. Select data between 2012-2017

In [160]:
df['year'] = df['Date'].map(lambda x: pd.to_datetime(x).year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [161]:
set(df['year'])

{2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018}

In [162]:
df = df[(df['year']>=2012) & (df['year']<=2017)]

In [163]:
set(df['year'])

{2012, 2013, 2014, 2015, 2016, 2017}

In [164]:
# no. of rows of data for 2012-2017
df.shape[0]

733179

In [166]:
df.to_csv ('SFCrime_2012-2017.csv', index = None, header=True)

## 3. Count number of unique crimes each day

In [38]:
dropdup = df.drop_duplicates(subset='IncidntNum', keep="last")
dropdup.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,PdDistrict,new_cat,year
3,150098210,Sunday,2/1/2015,TENDERLOIN,others,2015
4,150098226,Tuesday,1/27/2015,NORTHERN,public,2015
7,150098248,Saturday,1/31/2015,BAYVIEW,public,2015
8,150098254,Saturday,1/31/2015,CENTRAL,theft,2015
9,150098260,Saturday,1/31/2015,CENTRAL,theft,2015


In [40]:
dropdup.shape

(273421, 6)

In [55]:
crime_df = dropdup.groupby(['Date', 'DayOfWeek'])['IncidntNum'].agg(['count']).reset_index()
np.sum(crime_df['count'].values)    # sanity check

273421

In [59]:
crime_df.rename(columns={"count": "num_unique_crimes"}, inplace=True)
crime_df.head()

Unnamed: 0,Date,DayOfWeek,num_unique_crimes
0,1/1/2012,Sunday,212
1,1/1/2013,Tuesday,233
2,1/1/2014,Wednesday,311
3,1/1/2015,Thursday,133
4,1/1/2016,Friday,1


## 4. Count number of each crime types

In [60]:
pd.get_dummies(df, columns=['PdDistrict','new_cat'])

Unnamed: 0,IncidntNum,DayOfWeek,Date,year,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,...,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,new_cat_domestic,new_cat_drug,new_cat_kidnap,new_cat_others,new_cat_public,new_cat_sex,new_cat_suicide,new_cat_theft
1,150098210,Sunday,2/1/2015,2015,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,150098210,Sunday,2/1/2015,2015,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
3,150098210,Sunday,2/1/2015,2015,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,150098226,Tuesday,1/27/2015,2015,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
6,150098248,Saturday,1/31/2015,2015,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7,150098248,Saturday,1/31/2015,2015,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,150098254,Saturday,1/31/2015,2015,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,150098260,Saturday,1/31/2015,2015,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10,150098345,Sunday,2/1/2015,2015,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
11,150098345,Sunday,2/1/2015,2015,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
