In [1]:
# Purpose: to standardize addresses of City of Philadelphia datasets
# Author: Lauren Parker, adapted heavily from script by Amy Gottsegen (Drexel University)
# Last Update: December 26, 2017
# Notes: this script uses the address parser 'passyunk', developed by the City of Philadelphia and found at: 
#        https://github.com/CityOfPhiladelphia/passyunk

In [1]:
# Set up workspace and import packages
import pandas as pd
from passyunk.parser import PassyunkParser
from shapely.geometry import Point
import re,json



In [2]:
import geopandas as gpd

In [3]:
# Display plots in notebook
%matplotlib inline

In [4]:
# Base directory for the shapefiles; initializeGeoDataFrame prepends it to each layer's relative path
path = "geodata/"

In [11]:
# Smoke-test the passyunk address parser on a single hard-coded address
p = PassyunkParser()
#components = p.parse('1234 MARKET ST')
address_components = p.parse('3825 ARCHER ST, PHILADELPHIA, PA')['components'] # nested dictionary of parsed address components
#standardized_address = components['street_address']
address_components # last expression, so the dictionary is shown as the cell output

{'address': {'addr_suffix': None,
  'addrnum_type': 'N',
  'fractional': None,
  'full': '3825',
  'high': None,
  'high_num': None,
  'high_num_full': None,
  'isaddr': True,
  'low': '3825',
  'low_num': 3825,
  'parity': 'O'},
 'address_unit': {'unit_num': None, 'unit_type': None},
 'base_address': '3825 ARCHER ST',
 'cl_addr_match': 'A',
 'cl_responsibility': None,
 'cl_seg_id': '620032',
 'election': {'blockid': None, 'precinct': None},
 'mailing': {'bldgfirm': None,
  'matchdesc': None,
  'uspstype': None,
  'zip4': None,
  'zipcode': None},
 'output_address': '3825 ARCHER ST',
 'street': {'full': 'ARCHER ST',
  'is_centerline_match': True,
  'name': 'ARCHER',
  'parse_method': '2ANS',
  'postdir': None,
  'predir': None,
  'score': None,
  'street_code': '13020',
  'suffix': 'ST'},
 'street_2': {'full': None,
  'is_centerline_match': False,
  'name': None,
  'parse_method': None,
  'postdir': None,
  'predir': None,
  'score': None,
  'street_code': None,
  'suffix': None}}

In [12]:
def initializeGeoDataFrame(resolution):
    """Read the shapefile for one geographic resolution into a GeoDataFrame.

    resolution -- one of "parcel", "blockgroup", "street", "neighborhood"
                  or "zipcode"; any other key raises KeyError.

    Returns the layer converted to EPSG:4326 (WGS84). The shapefile location
    is the module-level `path` prefix plus the layer's relative path.
    """
    shapefiles = {
        "parcel": "parcels/DOR_Parcel_WGS84.shp",
        "blockgroup": "block_groups/Phila_BlockGroups_WGS84.shp",
        "street": "streets/Street_Centerline_WGS84.shp",
        "neighborhood": "neighborhoods/Neighborhoods_Phila_WGS84.shp",
        "zipcode": "zipcodes/Phila_ZCTA_WGS84.shp",
    }
    shapefile_path = path + shapefiles[resolution]
    layer = gpd.read_file(shapefile_path)
    wgs84 = {'init': 'epsg:4326'}  # coordinate system: WGS84
    return layer.to_crs(wgs84)

In [13]:
def geocodeWithOPA(data):
    """Load the OPA properties CSV (work in progress).

    NOTE(review): this looks like an unfinished stub. `data` is never used,
    the loaded table is discarded, and the function returns None.
    """
    opa_csv = 'parcel_level/opa_properties_public_11072017.csv'
    opa_properties = pd.read_csv(opa_csv, encoding='utf-8')

In [14]:
def parseAddresses(location,zipcode):
    """Extract location features from a free-text location string.

    Parameters
    ----------
    location : str
        Free-text location (address, block, street, intersection and/or
        neighborhood name). It is upper-cased before matching. A non-string
        value (e.g. the float NaN pandas uses for an empty cell) cannot be
        parsed, so only the fallback keys are returned for it.
    zipcode : object
        Stored unchanged under 'zipcode' as the coarsest fallback.

    Returns
    -------
    dict
        Always contains 'neighborhood' (list of matched names) and 'zipcode'.
        Contains 'address', 'block', 'street' and 'street_2' only when the
        corresponding feature was detected.
    """
    features = {'neighborhood': []}

    # Nothing can be matched in a non-string; return the fallback keys instead
    # of crashing on `n in location` or inside the passyunk parser.
    if not isinstance(location, str):
        features['zipcode'] = zipcode
        return features

    location = location.upper()

    #NEIGHBORHOOD
    # Best effort: if the list cannot be loaded, report it and carry on
    # without neighborhood matching (previously this left the variable
    # unbound and raised NameError on the next line).
    neighborhood_data = None
    try:
        with open('neighborhood_list.json','r') as neighborhood_file:
            neighborhood_data = json.load(neighborhood_file)
    except Exception as e:
        print(e)

    if neighborhood_data:
        # Longest names first so a longer name is matched (and removed)
        # before any shorter name contained within it.
        neighborhoods = sorted(neighborhood_data['neighborhoods'],key=len,reverse=True)
        for n in neighborhoods:
            if n in location:
                features['neighborhood'].append(n)
                # Strip the name so it is not handed to the address parser.
                location = location.replace(n,'')

    # NOTE(review): a new parser is built on every call; if PassyunkParser is
    # safe to reuse, constructing it once outside would be much faster - confirm.
    p = PassyunkParser()
    components = p.parse(location)['components']

    #ADDRESS
    if components['address']['isaddr']:
        features['address']= components['base_address']

    #BLOCK
    # Block notation such as "12XX MARKET ST": digits followed by X's.
    # location is already upper-case, so only 'X' needs to be matched.
    block_pattern = r'\d+X+'
    blocksplit = re.split(block_pattern,location)
    if len(blocksplit) > 1:
        street = blocksplit[1]  # text following the first block token
        block = re.search(block_pattern,location).group(0).replace('X','0')
        features['block'] = block + street

    #STREET
    if components['street']['is_centerline_match']:
        features['street'] = components['street']['full']

    #INTERSECTION
    if components['street_2']['is_centerline_match']:
        features['street_2'] = components['street_2']['full']

    #and if all else fails... ZIPCODE
    features['zipcode'] = zipcode
    return features

In [15]:
def getNeighborhood(gdf, data):
    """Left spatial join of `data` against the 'NAME' and 'geometry' columns of `gdf`.

    Every row of `data` is kept; rows whose geometry lies within a `gdf`
    geometry receive that geometry's NAME.
    """
    named_shapes = gdf[['NAME','geometry']]
    joined = gpd.sjoin(data, named_shapes, how="left", op="within")
    return joined

In [16]:
def readData(path_std):
    """Read a CSV of point records and return them as a GeoDataFrame.

    Only the 'objectid', 'lat' and 'lng' columns are read. Rows with a null
    'lat' are dropped, and each remaining row gets a Point built from
    (lng, lat). The returned frame keeps 'objectid' plus the geometry and is
    tagged with EPSG:4326.
    """
    records = pd.read_csv(path_std, usecols=['objectid','lat','lng'])
    records = records[records['lat'].notnull()]
    points = list(map(Point, zip(records.lng, records.lat)))
    attributes = records.drop(columns=['lng', 'lat'])
    return gpd.GeoDataFrame(attributes, crs={'init': 'epsg:4326'}, geometry=points)

In [17]:
# Load the neighborhood shapefile and the permit records (as points built from lng/lat)
gdf = initializeGeoDataFrame('neighborhood')
data = readData('parcel_level/permits/li_permits.csv')

In [None]:
# Spatially join each permit point to the neighborhood shape it falls within, then preview the result
crosswalk = getNeighborhood(gdf,data)
crosswalk.head()

In [None]:
# Count how many joined records carry each NAME value from the neighborhood layer
crosswalk['NAME'].value_counts()

# Test code

In [18]:
# Load the tax filings CSV and inspect the dtype pandas inferred for each column
df = pd.read_csv('parcel_level/foreclosures/tax_filings_full_list_11012016.csv')
df.dtypes # object dtype usually means strings, but an object column can hold mixed Python types (e.g. float NaN for empty cells)

id                     int64
docketnum             object
sought               float64
address               object
type                  object
status                object
court                 object
jury                  object
caption               object
date_filed            object
year_filed             int64
month_filed            int64
last_docket_entry     object
rulereturn            object
rulereturn_day        object
sheriff_deed          object
propcat               object
homestead             object
zip                  float64
latitude             float64
longitude            float64
latlong               object
dtype: object

In [19]:
# Convert all address records to strings so string methods can be applied to every row
# NOTE(review): astype(str) presumably turns missing (NaN) addresses into the literal
# string 'nan' (the length-3 entries in this cell's output), so they still reach the
# address parser as text - confirm this is intended.
df['address'] = df['address'].astype(str) #convert to string
df['address'].apply(lambda x: len(x)) #check length to make sure all are strings (vs. float which doesn't have a length)
#df.dtypes #check types

0        31
1        29
2        32
3        32
4        35
5        32
6        35
7        32
8        32
9        32
10       36
11       31
12       35
13       32
14       32
15       32
16       37
17       32
18       34
19       33
20        3
21       27
22       32
23       31
24       33
25       34
26       33
27       30
28       32
29       34
         ..
59241     3
59242     3
59243     3
59244     3
59245     3
59246     3
59247     3
59248     3
59249     3
59250     3
59251     3
59252     3
59253     3
59254     3
59255     3
59256     3
59257     3
59258     3
59259     3
59260     3
59261     3
59262     3
59263     3
59264     3
59265     3
59266     3
59267     3
59268     3
59269     3
59270     3
Name: address, Length: 59271, dtype: int64

In [28]:
# Parse addresses: run parseAddresses on every row, with the zipcode argument fixed at 0.
# Despite its name, addr_dict is a pandas Series holding one feature dictionary per row.
addr_dict = df['address'].apply(lambda x: parseAddresses(x,0))
addr_dict

0        {'neighborhood': [], 'address': '329 BUDD ST',...
1        {'neighborhood': [], 'address': '2930 KIP ST',...
2        {'neighborhood': [], 'address': '5356 MORRIS S...
3        {'neighborhood': [], 'address': '1656 N 59TH S...
4        {'neighborhood': [], 'address': '450 DEARBORN ...
5        {'neighborhood': [], 'address': '348 E ARMAT S...
6        {'neighborhood': [], 'address': '212 E INDIANA...
7        {'neighborhood': [], 'address': '3215 N 13TH S...
8        {'neighborhood': [], 'address': '5735 OSAGE AV...
9        {'neighborhood': [], 'address': '1619 N 55TH S...
10       {'neighborhood': [], 'address': '2430 W SEDGLE...
11       {'neighborhood': [], 'address': '2526 CORAL ST...
12       {'neighborhood': [], 'address': '4534 N BOUVIE...
13       {'neighborhood': [], 'address': '3820 N 18TH S...
14       {'neighborhood': [], 'address': '2143 RIDGE AV...
15       {'neighborhood': [], 'address': '5725 CEDAR AV...
16       {'neighborhood': [], 'address': '3525 KENSINGT.

In [36]:
# Collect the parsed address dictionaries into a Series named 'address'.
# df2 must be assigned here: the next cell calls df2.head(), which raises
# NameError on a fresh kernel (Restart & Run All) when this cell holds only
# commented-out code.
df2 = pd.Series(addr_dict, name='address')

In [37]:
df2.head()

0    {'neighborhood': [], 'address': '329 BUDD ST',...
1    {'neighborhood': [], 'address': '2930 KIP ST',...
2    {'neighborhood': [], 'address': '5356 MORRIS S...
3    {'neighborhood': [], 'address': '1656 N 59TH S...
4    {'neighborhood': [], 'address': '450 DEARBORN ...
Name: address, dtype: object