# Importing libraries and Dataset

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from datetime import datetime
from collections import Counter

In [113]:
df = pd.read_csv('data/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv')
print(df.shape)
df.head()

(2215024, 13)


Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)",15006027571000
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821003074
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821004014
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)",15009821015200
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)",15009822628160


## 1. Inspect data for any duplicates

To-dos:
* To calculate total number of crimes, make sure not to double count
* Double counting is alright if calculating based on crime types but delete if both incident_num and category are the same

From the metadata, we understand that `PdId` refers to the unique identifier for use in update and insert operations for the dataset, hence we can drop the column.

In [114]:
df=df.drop(['PdId'], axis=1)

In [115]:
df.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
0,150060275,NON-CRIMINAL,LOST PROPERTY,Monday,01/19/2015,14:00,MISSION,NONE,18TH ST / VALENCIA ST,-122.421582,37.761701,"(37.7617007179518, -122.42158168137)"
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
4,150098226,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,01/27/2015,19:00,NORTHERN,NONE,LOMBARD ST / LAGUNA ST,-122.431119,37.800469,"(37.8004687042875, -122.431118543788)"


In [116]:
# check unique no. of rows (corresponding to no. of incident numbers)
print("No. of rows:",df.shape[0])

No. of rows: 2215024


In [117]:
# check unique no. of incidents
print("No. of unique incidents:", df['IncidntNum'].nunique())

No. of unique incidents: 1746914


In [118]:
df.shape[0]-df['IncidntNum'].nunique()

468110

Hence we can conclude that there were 468110 duplicates in incident_num. Let's take a look at an example of a duplicate incident. The logic is that, if the duplicates arise due to the incident being categorized into various incidents of crime, we can retain the double-counting; however, if the duplicates are merely a result of same incident_num and category, we should delete these rows.

In [119]:
incidentNumList=Counter(df['IncidntNum']) # obtain dictionary of all unique key-value pairs 

In [120]:
incidentNumList

Counter({150060275: 1,
         150098210: 3,
         150098226: 1,
         150098232: 1,
         150098248: 2,
         150098254: 1,
         150098260: 1,
         150098345: 4,
         150098367: 1,
         150098373: 2,
         150098389: 1,
         150098395: 1,
         150098408: 1,
         150098414: 4,
         160919032: 1,
         150098420: 3,
         150098436: 1,
         150098442: 1,
         150098458: 3,
         140009459: 2,
         150098464: 1,
         150098470: 1,
         150098486: 1,
         150098492: 2,
         150098505: 1,
         150098511: 1,
         150098527: 3,
         150098533: 1,
         150098549: 1,
         150098555: 1,
         150098561: 1,
         150098577: 1,
         150098583: 1,
         150098599: 1,
         150098602: 3,
         150098618: 1,
         150098624: 1,
         150098630: 1,
         150098652: 2,
         150098668: 1,
         150098674: 3,
         150098680: 1,
         150098696: 1,
         15

In [121]:
allIncidentNumbers=df['IncidntNum'].unique().tolist()

In [122]:
len(incidentNumList) 

1746914

In [123]:
threshold=1
duplicateIncidentList=[]
isolatedIncidentList=[]
for incident, incidentCount in incidentNumList.items():
    if incidentCount>threshold:
        duplicateIncidentList.append(incident)
    elif incidentCount==threshold:
        isolatedIncidentList.append(incident)

# shorter code is this: 
# isolatedIncidents=[incd for incd, incidentCount in incidentNumList.items() if incidentCount == threshold]
# duplicateIncidents=[incd for incd, incidentCount in incidentNumList.items() if incidentCount > threshold]
# sumIsolates=len(isolatedIncidents)
# sumDuplicate=len(duplicateIncidents)

In [124]:
len(duplicateIncidentList)

341334

In [125]:
len(isolatedIncidentList)

1405580

In [126]:
'''check random incident_num with non-1s count to see reason for duplicates
    if incident_num and category are the same, then remove duplicates (later)
'''
df[df['IncidntNum']==150098345]

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
10,150098345,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
11,150098345,DRUG/NARCOTIC,POSSESSION OF METH-AMPHETAMINE,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
12,150098345,DRUG/NARCOTIC,POSSESSION OF NARCOTICS PARAPHERNALIA,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
13,150098345,WARRANTS,WARRANT ARREST,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"


In [127]:
df[df['IncidntNum']==150098210]

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
1,150098210,ROBBERY,"ROBBERY, BODILY FORCE",Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
2,150098210,ASSAULT,AGGRAVATED ASSAULT WITH BODILY FORCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"
3,150098210,SECONDARY CODES,DOMESTIC VIOLENCE,Sunday,02/01/2015,15:45,TENDERLOIN,NONE,300 Block of LEAVENWORTH ST,-122.414406,37.784191,"(37.7841907151119, -122.414406029855)"


In [128]:
# check the original dataframe against 'duplicateIncidentList'
# when 'duplicateIncidentList' matches df['IncidntNum'], then check if the category is the same
# if the categories are the same, retain one row and drop the others
# if the categories are different, retain all rows 

df_duplicates=df[df['IncidntNum'].isin(allIncidentNumbers)].sort_values(by=['IncidntNum'])
df_duplicates_cleaned = df_duplicates.drop_duplicates(subset=['Category','IncidntNum'], keep="first")
df_duplicates_cleaned # this is the final dataframe without duplicates


Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
493897,3979,WARRANTS,WARRANT ARREST,Thursday,12/09/2004,16:22,BAYVIEW,"ARREST, BOOKED",INGALLS ST / HUDSON AV,-122.379598,37.732467,"(37.7324666541275, -122.379598260097)"
1209080,10128,WARRANTS,WARRANT ARREST,Sunday,12/18/2005,22:20,INGLESIDE,"ARREST, BOOKED",ELLSWORTH ST / ALEMANY BL,-122.418969,37.732208,"(37.7322083102079, -122.4189687006)"
1490233,10736,WARRANTS,WARRANT ARREST,Thursday,07/15/2004,10:18,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)"
1393001,38261,WARRANTS,WARRANT ARREST,Thursday,04/17/2003,22:45,NORTHERN,"ARREST, BOOKED",POLK ST / SUTTER ST,-122.420120,37.787757,"(37.7877570602182, -122.420120319211)"
377806,52205,WARRANTS,WARRANT ARREST,Thursday,02/06/2003,07:20,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)"
1333777,61397,WARRANTS,WARRANT ARREST,Sunday,06/29/2003,10:00,TENDERLOIN,"ARREST, BOOKED",100 Block of TURK ST,-122.411593,37.783053,"(37.7830529510782, -122.411593095704)"
596484,62389,VEHICLE THEFT,STOLEN MOTORCYCLE,Tuesday,03/30/2004,11:35,MISSION,NONE,2600 Block of BRYANT ST,-122.409121,37.751979,"(37.7519787472165, -122.409121399225)"
391902,62953,WARRANTS,WARRANT ARREST,Sunday,08/22/2004,18:36,SOUTHERN,"ARREST, BOOKED",COLINPKELLYJR ST / TOWNSEND ST,-122.389758,37.781221,"(37.781221050395, -122.389757937508)"
1485730,70796,WARRANTS,ENROUTE TO OUTSIDE JURISDICTION,Thursday,07/13/2006,08:45,SOUTHERN,"ARREST, BOOKED",900 Block of MISSION ST,-122.407933,37.781506,"(37.7815063666429, -122.407932868203)"
864778,73057,WARRANTS,WARRANT ARREST,Thursday,08/26/2004,12:00,SOUTHERN,"ARREST, BOOKED",800 Block of BRYANT ST,-122.403405,37.775421,"(37.775420706711, -122.403404791479)"


In [129]:
df=df_duplicates_cleaned # final dataframe
print("No. of rows and columns of final dataframe: ",df.shape)

No. of rows and columns of final dataframe:  (2057667, 12)


In [130]:
df[df['IncidntNum']==150098345]

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location
13,150098345,WARRANTS,WARRANT ARREST,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
12,150098345,DRUG/NARCOTIC,POSSESSION OF NARCOTICS PARAPHERNALIA,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"
10,150098345,LARCENY/THEFT,PETTY THEFT SHOPLIFTING,Sunday,02/01/2015,14:00,MISSION,"ARREST, BOOKED",1700 Block of HARRISON ST,-122.413354,37.769075,"(37.7690748003847, -122.413354187018)"


## 2. Check the unique category of crimes
To-dos:
* Group similar crimes together

In [131]:
# check set of crimes
print(len(set(df['Category'])))
set(df['Category'])

39


{'ARSON',
 'ASSAULT',
 'BAD CHECKS',
 'BRIBERY',
 'BURGLARY',
 'DISORDERLY CONDUCT',
 'DRIVING UNDER THE INFLUENCE',
 'DRUG/NARCOTIC',
 'DRUNKENNESS',
 'EMBEZZLEMENT',
 'EXTORTION',
 'FAMILY OFFENSES',
 'FORGERY/COUNTERFEITING',
 'FRAUD',
 'GAMBLING',
 'KIDNAPPING',
 'LARCENY/THEFT',
 'LIQUOR LAWS',
 'LOITERING',
 'MISSING PERSON',
 'NON-CRIMINAL',
 'OTHER OFFENSES',
 'PORNOGRAPHY/OBSCENE MAT',
 'PROSTITUTION',
 'RECOVERED VEHICLE',
 'ROBBERY',
 'RUNAWAY',
 'SECONDARY CODES',
 'SEX OFFENSES, FORCIBLE',
 'SEX OFFENSES, NON FORCIBLE',
 'STOLEN PROPERTY',
 'SUICIDE',
 'SUSPICIOUS OCC',
 'TREA',
 'TRESPASS',
 'VANDALISM',
 'VEHICLE THEFT',
 'WARRANTS',
 'WEAPON LAWS'}

## 1. Reduce number of crime categories

Group the criminal activities into the following categories:
1. theft
2. public disturbance
3. drug
4. sex
5. suicide
6. kidnap
7. domestic
8. other
9. non-criminal (can delete)

In [132]:
theft = ['BRIBERY',
        'BURGLARY',
        'EXTORTION',        
        'GAMBLING',
        'LARCENY/THEFT',
        'RECOVERED VEHICLE',
        'ROBBERY',
        'TREA',
        'VEHICLE THEFT',
        'STOLEN PROPERTY',
        'ARSON']

fraud = ['FRAUD',
        'EMBEZZLEMENT',
        'FORGERY/COUNTERFEITING']

public = ['TRESPASS',
        'VANDALISM',
        'WARRANTS',
        'LOITERING',
        'ASSAULT',
        'BAD CHECKS',
        'DISORDERLY CONDUCT',
        'DRIVING UNDER THE INFLUENCE',
        'DRUNKENNESS',
        'SUSPICIOUS OCC',
         'LIQUOR LAWS',
         'WEAPON LAWS']

drug =  ['DRUG/NARCOTIC']

sex = ['PORNOGRAPHY/OBSCENE MAT',
        'PROSTITUTION',
        'SEX OFFENSES, FORCIBLE',
        'SEX OFFENSES, NON FORCIBLE']

suicide = ['SUICIDE']

kidnap = ['KIDNAPPING']

domestic = ['MISSING PERSON',
            'RUNAWAY',
            'FAMILY OFFENSES']

others = ['OTHER OFFENSES',
        'SECONDARY CODES']

non_criminal = ['NON-CRIMINAL']

In [133]:
replace_dict = {
                "theft": theft,
                "fraud": fraud,
                "public": public,
                "drug": drug,
                "sex": sex,
                "suicide": suicide,
                "kidnap":kidnap,
                "domestic": domestic,
                "others": others,
                "non_criminal": non_criminal
               }

In [134]:
new_cat = df['Category']

for key, val in replace_dict.items():
    new_cat = new_cat.replace(val, key)

df["New Category"] = new_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [135]:
set(df["New Category"])

{'domestic',
 'drug',
 'fraud',
 'kidnap',
 'non_criminal',
 'others',
 'public',
 'sex',
 'suicide',
 'theft'}

In [136]:
df.head()

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,New Category
493897,3979,WARRANTS,WARRANT ARREST,Thursday,12/09/2004,16:22,BAYVIEW,"ARREST, BOOKED",INGALLS ST / HUDSON AV,-122.379598,37.732467,"(37.7324666541275, -122.379598260097)",public
1209080,10128,WARRANTS,WARRANT ARREST,Sunday,12/18/2005,22:20,INGLESIDE,"ARREST, BOOKED",ELLSWORTH ST / ALEMANY BL,-122.418969,37.732208,"(37.7322083102079, -122.4189687006)",public
1490233,10736,WARRANTS,WARRANT ARREST,Thursday,07/15/2004,10:18,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)",public
1393001,38261,WARRANTS,WARRANT ARREST,Thursday,04/17/2003,22:45,NORTHERN,"ARREST, BOOKED",POLK ST / SUTTER ST,-122.42012,37.787757,"(37.7877570602182, -122.420120319211)",public
377806,52205,WARRANTS,WARRANT ARREST,Thursday,02/06/2003,07:20,SOUTHERN,"ARREST, BOOKED",900 Block of BRYANT ST,-122.405927,37.773427,"(37.7734271309539, -122.405926775837)",public


In [137]:
# drop redundant columns
df.drop(columns=['Category','Descript','Resolution','Address','X','Y','Location'], inplace=True)
# keep time for now because we will need it to combine it with the sunset/sunrise timings

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [138]:
df.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,PdDistrict,New Category
493897,3979,Thursday,12/09/2004,16:22,BAYVIEW,public
1209080,10128,Sunday,12/18/2005,22:20,INGLESIDE,public
1490233,10736,Thursday,07/15/2004,10:18,SOUTHERN,public
1393001,38261,Thursday,04/17/2003,22:45,NORTHERN,public
377806,52205,Thursday,02/06/2003,07:20,SOUTHERN,public


In [139]:
df.shape

(2057667, 6)

In [140]:
# remove non_criminal
df = df[df["New Category"]!='non_criminal']
df.shape

(1823419, 6)

## 2. Select data between 2012-2017

In [141]:
df['year'] = df['Date'].map(lambda x: pd.to_datetime(x).year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [142]:
set(df['year'])

{2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018}

In [143]:
df = df[(df['year']>=2012) & (df['year']<=2017)]

In [144]:
set(df['year'])

{2012, 2013, 2014, 2015, 2016, 2017}

In [145]:
# no. of rows of data for 2012-2017
df.shape

(733179, 7)

In [146]:
df.to_csv ('SFCrime_2012-2017.csv', index = None, header=True)

## 3. Count number of unique crimes each day

In [147]:
df.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,PdDistrict,New Category,year
75786,1300164,Thursday,02/13/2014,09:00,RICHMOND,domestic,2014
1185245,10073348,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013
769622,10358128,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013
1714235,10733172,Thursday,03/01/2012,13:53,MISSION,public,2012
2126742,11351210,Tuesday,06/14/2016,08:55,BAYVIEW,domestic,2016


In [148]:
crime_df = df.groupby(['Date', 'DayOfWeek'])['IncidntNum'].agg(['count']).reset_index()
np.sum(crime_df['count'].values)    # sanity check

733179

In [149]:
crime_df.rename(columns={"count": "num_unique_crimes"}, inplace=True)
crime_df.head()

Unnamed: 0,Date,DayOfWeek,num_unique_crimes
0,01/01/2012,Sunday,420
1,01/01/2013,Tuesday,459
2,01/01/2014,Wednesday,385
3,01/01/2015,Thursday,412
4,01/01/2016,Friday,444


## 4. Count number of each crime types

In [150]:
pd.get_dummies(df, columns=['PdDistrict','New Category'])

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,year,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,...,PdDistrict_TENDERLOIN,New Category_domestic,New Category_drug,New Category_fraud,New Category_kidnap,New Category_others,New Category_public,New Category_sex,New Category_suicide,New Category_theft
75786,1300164,Thursday,02/13/2014,09:00,2014,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1185245,10073348,Thursday,11/14/2013,10:20,2013,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
769622,10358128,Thursday,11/14/2013,10:20,2013,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
1714235,10733172,Thursday,03/01/2012,13:53,2012,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2126742,11351210,Tuesday,06/14/2016,08:55,2016,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
244052,11473002,Tuesday,04/17/2012,10:11,2012,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1354088,12024759,Monday,03/26/2012,05:30,2012,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
333040,12036049,Sunday,05/06/2012,02:00,2012,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
923224,12603347,Wednesday,03/07/2012,17:00,2012,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
248821,13082306,Monday,09/30/2013,13:30,2013,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## 5. Import housing price dataset

In [151]:
housingprice_df = pd.read_csv('data/HousingPriceFinal.csv')

In [152]:
housingprice_df.head()

Unnamed: 0,Inventory,Median Dom,Median Sale,New Listings,Period End,DoW,Neighborhood,PdDistrict
0,7.0,59.5,1500000,15.0,2/29/12,Wed,Ashbury Heights / Parnassus,PARK
1,8.0,145.0,293000,7.0,2/29/12,Wed,Bret Harte,BAYVIEW
2,4.0,69.0,685000,12.0,2/29/12,Wed,Buena Vista,PARK
3,13.0,72.0,316000,19.0,2/29/12,Wed,Cathedral Hill,NORTHERN
4,7.0,59.5,552000,10.0,2/29/12,Wed,Cayuga,INGLESIDE


In [153]:
# converting to datetime format, and obtaining columns for 'year', 'month' and 'day'
housingprice_df['year'] = housingprice_df['Period End'].map(lambda x: pd.to_datetime(x).year)
housingprice_df['month'] = housingprice_df['Period End'].map(lambda x: pd.to_datetime(x).month)
housingprice_df['date'] = housingprice_df['Period End'].map(lambda x: pd.to_datetime(x).day)

In [154]:
housingprice_df.head()

Unnamed: 0,Inventory,Median Dom,Median Sale,New Listings,Period End,DoW,Neighborhood,PdDistrict,year,month,date
0,7.0,59.5,1500000,15.0,2/29/12,Wed,Ashbury Heights / Parnassus,PARK,2012,2,29
1,8.0,145.0,293000,7.0,2/29/12,Wed,Bret Harte,BAYVIEW,2012,2,29
2,4.0,69.0,685000,12.0,2/29/12,Wed,Buena Vista,PARK,2012,2,29
3,13.0,72.0,316000,19.0,2/29/12,Wed,Cathedral Hill,NORTHERN,2012,2,29
4,7.0,59.5,552000,10.0,2/29/12,Wed,Cayuga,INGLESIDE,2012,2,29


In [155]:
housingprice_df=housingprice_df.drop(['Period End','DoW','Neighborhood'], axis=1)

In [156]:
housingprice_df.head()

Unnamed: 0,Inventory,Median Dom,Median Sale,New Listings,PdDistrict,year,month,date
0,7.0,59.5,1500000,15.0,PARK,2012,2,29
1,8.0,145.0,293000,7.0,BAYVIEW,2012,2,29
2,4.0,69.0,685000,12.0,PARK,2012,2,29
3,13.0,72.0,316000,19.0,NORTHERN,2012,2,29
4,7.0,59.5,552000,10.0,INGLESIDE,2012,2,29


In [157]:
df.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,PdDistrict,New Category,year
75786,1300164,Thursday,02/13/2014,09:00,RICHMOND,domestic,2014
1185245,10073348,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013
769622,10358128,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013
1714235,10733172,Thursday,03/01/2012,13:53,MISSION,public,2012
2126742,11351210,Tuesday,06/14/2016,08:55,BAYVIEW,domestic,2016


In [158]:
df.shape[0]

733179

In [159]:
df['month'] = df['Date'].map(lambda x: pd.to_datetime(x).month)
df['date'] = df['Date'].map(lambda x: pd.to_datetime(x).day)

##### To do:
1) Combine Housing['Period End'] to datetime format
2) Join on these columns: year, month, date, PdDistrict

In [160]:
df.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,PdDistrict,New Category,year,month,date
75786,1300164,Thursday,02/13/2014,09:00,RICHMOND,domestic,2014,2,13
1185245,10073348,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013,11,14
769622,10358128,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013,11,14
1714235,10733172,Thursday,03/01/2012,13:53,MISSION,public,2012,3,1
2126742,11351210,Tuesday,06/14/2016,08:55,BAYVIEW,domestic,2016,6,14


In [161]:
df=df.reset_index(drop=True)
housingprice_df=housingprice_df.reset_index(drop=True)

In [162]:
df.head()

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,PdDistrict,New Category,year,month,date
0,1300164,Thursday,02/13/2014,09:00,RICHMOND,domestic,2014,2,13
1,10073348,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013,11,14
2,10358128,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013,11,14
3,10733172,Thursday,03/01/2012,13:53,MISSION,public,2012,3,1
4,11351210,Tuesday,06/14/2016,08:55,BAYVIEW,domestic,2016,6,14


In [164]:
df.shape[0]

733179

In [163]:
housingprice_df.head()

Unnamed: 0,Inventory,Median Dom,Median Sale,New Listings,PdDistrict,year,month,date
0,7.0,59.5,1500000,15.0,PARK,2012,2,29
1,8.0,145.0,293000,7.0,BAYVIEW,2012,2,29
2,4.0,69.0,685000,12.0,PARK,2012,2,29
3,13.0,72.0,316000,19.0,NORTHERN,2012,2,29
4,7.0,59.5,552000,10.0,INGLESIDE,2012,2,29


In [166]:
combinedDF = pd.merge(df, housingprice_df, on=['year','month','date','PdDistrict'],how='left')
combinedDF.sort_values(by=['IncidntNum'])

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,PdDistrict,New Category,year,month,date,Inventory,Median Dom,Median Sale,New Listings
0,1300164,Thursday,02/13/2014,09:00,RICHMOND,domestic,2014,2,13,,,,
1,10073348,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013,11,14,,,,
2,10358128,Thursday,11/14/2013,10:20,TENDERLOIN,others,2013,11,14,,,,
3,10733172,Thursday,03/01/2012,13:53,MISSION,public,2012,3,1,,,,
4,11351210,Tuesday,06/14/2016,08:55,BAYVIEW,domestic,2016,6,14,,,,
5,11473002,Tuesday,04/17/2012,10:11,SOUTHERN,public,2012,4,17,,,,
6,12024759,Monday,03/26/2012,05:30,NORTHERN,domestic,2012,3,26,,,,
7,12036049,Sunday,05/06/2012,02:00,RICHMOND,theft,2012,5,6,,,,
8,12603347,Wednesday,03/07/2012,17:00,RICHMOND,theft,2012,3,7,,,,
14,13082306,Monday,09/30/2013,13:30,RICHMOND,theft,2013,9,30,3.0,32.0,2350000.0,5.0


In [176]:
combinedDF=combinedDF.dropna()

In [177]:
combinedDF

Unnamed: 0,IncidntNum,DayOfWeek,Date,Time,PdDistrict,New Category,year,month,date,Inventory,Median Dom,Median Sale,New Listings
9,13082306,Monday,09/30/2013,13:30,RICHMOND,theft,2013,9,30,15.0,16.0,1131000.0,44.0
10,13082306,Monday,09/30/2013,13:30,RICHMOND,theft,2013,9,30,4.0,13.0,851000.0,9.0
11,13082306,Monday,09/30/2013,13:30,RICHMOND,theft,2013,9,30,6.0,12.5,1185000.0,12.0
12,13082306,Monday,09/30/2013,13:30,RICHMOND,theft,2013,9,30,254.0,15.0,1038000.0,616.0
13,13082306,Monday,09/30/2013,13:30,RICHMOND,theft,2013,9,30,33.0,18.5,950000.0,62.0
14,13082306,Monday,09/30/2013,13:30,RICHMOND,theft,2013,9,30,3.0,32.0,2350000.0,5.0
60,21549483,Monday,04/30/2012,14:35,SOUTHERN,public,2012,4,30,18.0,40.5,590000.0,32.0
61,21549483,Monday,04/30/2012,14:35,SOUTHERN,public,2012,4,30,20.0,12.0,780000.0,53.0
62,21549483,Monday,04/30/2012,14:35,SOUTHERN,public,2012,4,30,48.0,16.0,716000.0,113.0
63,21549483,Monday,04/30/2012,14:35,SOUTHERN,public,2012,4,30,35.0,35.0,519000.0,65.0


In [179]:
combinedDF.to_csv('data/CrimeAndHousing.csv', index = None, header=True)

## 6. Import Eviction Dataset

In [223]:
eviction_df = pd.read_csv('data/eviction_minus.csv',header=0)

In [224]:
eviction_df.shape

(40091, 30)

In [225]:
pd.set_option('display.max_columns', 30)
eviction_df.head()

Unnamed: 0,Eviction.ID,Address,City,State,Eviction.Notice.Source.Zipcode,File.Date,Non.Payment,Breach,Nuisance,Illegal.Use,Failure.to.Sign.Renewal,Access.Denial,Unapproved.Subtenant,Owner.Move.In,Demolition,Capital.Improvement,Substantial.Rehab,Ellis.Act.WithDrawal,Condo.Conversion,Roommate.Same.Unit,Other.Cause,Late.Payments,Lead.Remediation,Development,Good.Samaritan.Ends,Constraints.Date,Supervisor.District,Neighborhoods...Analysis.Boundaries,Location,PdDistrict
0,M190330,1800 Block Of Market Street,San Francisco,CA,94102.0,2/15/19,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,8.0,Hayes Valley,"(37.7712923782185, -122.42433694872517)",SOUTHERN
1,M183362,1100 Block Of Mission Street,San Francisco,CA,94103.0,12/20/18,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,South of Market,"(37.77779030910167, -122.41283433258292)",SOUTHERN
2,M183157,0 Block Of Brady Street,San Francisco,CA,94103.0,11/21/18,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Mission,"(37.772548429191886, -122.42008435391327)",SOUTHERN
3,M182998,0 Block Of Stillman Street,San Francisco,CA,94107.0,10/29/18,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Financial District/South Beach,"(37.78258788740649, -122.3952163164336)",SOUTHERN
4,M182997,0 Block Of Stillman Street,San Francisco,CA,94107.0,10/29/18,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Financial District/South Beach,"(37.78258788740649, -122.3952163164336)",SOUTHERN


In [226]:
eviction_df=eviction_df.drop(['Address','City','State','Location'], axis=1)

In [227]:
eviction_df.head()

Unnamed: 0,Eviction.ID,Eviction.Notice.Source.Zipcode,File.Date,Non.Payment,Breach,Nuisance,Illegal.Use,Failure.to.Sign.Renewal,Access.Denial,Unapproved.Subtenant,Owner.Move.In,Demolition,Capital.Improvement,Substantial.Rehab,Ellis.Act.WithDrawal,Condo.Conversion,Roommate.Same.Unit,Other.Cause,Late.Payments,Lead.Remediation,Development,Good.Samaritan.Ends,Constraints.Date,Supervisor.District,Neighborhoods...Analysis.Boundaries,PdDistrict
0,M190330,94102.0,2/15/19,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,8.0,Hayes Valley,SOUTHERN
1,M183362,94103.0,12/20/18,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,South of Market,SOUTHERN
2,M183157,94103.0,11/21/18,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Mission,SOUTHERN
3,M182998,94107.0,10/29/18,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Financial District/South Beach,SOUTHERN
4,M182997,94107.0,10/29/18,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Financial District/South Beach,SOUTHERN


In [228]:
eviction_df['year']=eviction_df['File.Date'].map(lambda x: pd.to_datetime(x).year)
eviction_df['month'] = eviction_df['File.Date'].map(lambda x: pd.to_datetime(x).month)
eviction_df['date'] = eviction_df['File.Date'].map(lambda x: pd.to_datetime(x).day)


In [222]:
eviction_df.head()

Unnamed: 0,Eviction.ID,Eviction.Notice.Source.Zipcode,File.Date,Non.Payment,Breach,Nuisance,Illegal.Use,Failure.to.Sign.Renewal,Access.Denial,Unapproved.Subtenant,Owner.Move.In,Demolition,Capital.Improvement,Substantial.Rehab,Ellis.Act.WithDrawal,Condo.Conversion,Roommate.Same.Unit,Other.Cause,Late.Payments,Lead.Remediation,Development,Good.Samaritan.Ends,Constraints.Date,Supervisor.District,Neighborhoods...Analysis.Boundaries,PdDistrict,year,month,date
0,M190330,94102.0,2/15/19,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,8.0,Hayes Valley,,2019,2,15
1,M183362,94103.0,12/20/18,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,South of Market,,2018,12,20
2,M183157,94103.0,11/21/18,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Mission,,2018,11,21
3,M182998,94107.0,10/29/18,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Financial District/South Beach,,2018,10,29
4,M182997,94107.0,10/29/18,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,,6.0,Financial District/South Beach,,2018,10,29
