## Mapping the Data

In [9]:
import pandas as pd
import requests

In [10]:
import geopandas as gpd

In [11]:
import geopy
from geopy.geocoders import Nominatim

In [14]:
# Test geocoding, reference: https://towardsdatascience.com/geocode-with-python-161ec1e62b89
locator = Nominatim(user_agent= "myGeocoder")
location = locator.geocode("Champ de Mars, Paris, France")

In [16]:
print("Latitude = {}, Longitude = {}".format(location.latitude, location.longitude))

Latitude = 48.85614465, Longitude = 2.297820393322227


In [16]:
# Geocoding addresses from Pandas
# Import citation_st_cleaning_5yr.csv
st_cleaning_5yr_map = pd.read_csv('citation_st_cleaning_5yr.csv',
                              dtype={'Citation Number': object} ,
                              parse_dates=['Citation Issued DateTime'])

In [17]:
st_cleaning_5yr_map.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2376891 entries, 0 to 2376890
Data columns (total 15 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   Citation Number               object        
 1   Citation Issued DateTime      datetime64[ns]
 2   Violation                     object        
 3   Violation Description         object        
 4   Citation Location             object        
 5   Vehicle Plate State           object        
 6   Vehicle Plate                 object        
 7   Fine Amount                   float64       
 8   Date Added                    object        
 9   geom                          object        
 10  Neighborhoods                 float64       
 11  SF Find Neighborhoods         float64       
 12  Current Police Districts      float64       
 13  Current Supervisor Districts  float64       
 14  Analysis Neighborhoods        float64       
dtypes: datetime64[ns](1), float64(6)

In [18]:
# Drop unused columns
st_cleaning_map = st_cleaning_5yr_map.drop(
    st_cleaning_5yr.loc[:, 'Neighborhoods':'Current Supervisor Districts'].columns,axis = 1)
del st_cleaning_map['Violation']
del st_cleaning_map['Vehicle Plate State']
del st_cleaning_map['Vehicle Plate']
st_cleaning_map.info()

KeyError: 'Neighborhoods'

In [31]:
# Add new column with complete address for 'Citation Location' for geocoding
st_cleaning_map['Location_new'] = st_cleaning_map['Citation Location'].astype(str) + ', San Francisco, USA'

In [37]:
st_cleaning_map.head()

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation Description,Citation Location,Fine Amount,Date Added,geom,Analysis Neighborhoods,Location_new
0,949367440,2022-03-23 02:33:00,STR CLEAN,720 TURK ST,84.0,11/16/2022 12:00:00 AM,,,"720 TURK ST, San Francisco, USA"
1,949375335,2022-03-24 09:30:00,STR CLEAN,19 RIVAS AVE,84.0,11/16/2022 12:00:00 AM,,,"19 RIVAS AVE, San Francisco, USA"
2,949395613,2022-03-21 13:05:00,STR CLEAN,1405 BAKER ST,84.0,11/16/2022 12:00:00 AM,,,"1405 BAKER ST, San Francisco, USA"
3,949411481,2022-03-29 00:15:00,STR CLEAN,420 NATOMA ST,84.0,11/16/2022 12:00:00 AM,,,"420 NATOMA ST, San Francisco, USA"
4,949412855,2022-03-22 12:25:00,STR CLEAN,717 WALLER ST,84.0,11/16/2022 12:00:00 AM,,,"717 WALLER ST, San Francisco, USA"


In [41]:
st_cleaning_map_2022 = st_cleaning_map[st_cleaning_map['Citation Issued DateTime'] >= '2022-01-01'].copy()
st_cleaning_map_2022.sort_values(by=['Citation Issued DateTime']).reset_index(drop=True)

Unnamed: 0,Citation Number,Citation Issued DateTime,Violation Description,Citation Location,Fine Amount,Date Added,geom,Analysis Neighborhoods,Location_new
0,946829295,2022-01-02 00:42:00,STR CLEAN,377 KING ST,85.0,01/09/2022 12:00:00 AM,,,"377 KING ST, San Francisco, USA"
1,946789082,2022-01-02 00:49:00,STR CLEAN,521 EMBARCADERO SOUTH,85.0,01/27/2022 12:00:00 AM,,,"521 EMBARCADERO SOUTH, San Francisco, USA"
2,946829321,2022-01-02 02:05:00,STR CLEAN,1131 MISSION ST,85.0,04/04/2022 12:00:00 AM,,,"1131 MISSION ST, San Francisco, USA"
3,946789104,2022-01-02 02:05:00,STR CLEAN,1147 MISSION ST,85.0,09/11/2022 12:00:00 AM,,,"1147 MISSION ST, San Francisco, USA"
4,946789093,2022-01-02 02:05:00,STR CLEAN,1149 MISSION ST,85.0,07/14/2022 12:00:00 AM,,,"1149 MISSION ST, San Francisco, USA"
...,...,...,...,...,...,...,...,...,...
440661,957285862,2022-11-16 12:44:00,STR CLEAN,1601 BRODERICK STREET,87.0,11/22/2022 12:00:00 AM,,,"1601 BRODERICK STREET, San Francisco, USA"
440662,957212222,2022-11-16 12:45:00,STR CLEAN,1501 BRODERICK ST,87.0,11/22/2022 12:00:00 AM,,,"1501 BRODERICK ST, San Francisco, USA"
440663,957148264,2022-11-16 13:12:00,STR CLEAN,1160 REVERE AVENUE,87.0,11/22/2022 12:00:00 AM,,,"1160 REVERE AVENUE, San Francisco, USA"
440664,957240712,2022-11-16 13:18:00,STR CLEAN,2444 15TH AVE,87.0,11/22/2022 12:00:00 AM,,,"2444 15TH AVE, San Francisco, USA"


In [43]:
st_cleaning_map_2022.to_csv('st_cleaning_2022.csv', index=False)

In [1]:
# # Reference: https://towardsdatascience.com/geocode-with-python-161ec1e62b89
# # Failed
# from geopy.extra.rate_limiter import RateLimiter

# # 1 - conveneint function to delay between geocoding calls
# geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# # 2- - create location column
# st_cleaning_map_2022['location'] = st_cleaning_map_2022['Location_new'].apply(geocode)
# # 3 - create longitude, laatitude and altitude from location column (returns tuple)
# st_cleaning_map_2022['point'] = st_cleaning_map_2022['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# # 4 - split point column into latitude, longitude and altitude columns
# st_cleaning_map_2022[['latitude', 'longitude', 'altitude']] = pd.DataFrame(st_cleaning_map_2022['point'].tolist(), index=df.index)

In [8]:
# Geocoding addresses from Pandas
# Import st_cleaning_2022.csv
st_cleaning_map_2022 = pd.read_csv('st_cleaning_2022.csv',
                              dtype={'Citation Number': object} ,
                              parse_dates=['Citation Issued DateTime'])

  st_cleaning_map_2022 = pd.read_csv('st_cleaning_2022.csv',
