# Jersey City Notebook

### These notebooks are my attempt to clean and geocode property tax records.
### The resulting data will be used with Mapbox GL.
### This is not a robust or data-scientific approach; I was shooting in the dark,
### cleaning strings individually until the geocoder returned fewer and fewer errors.
### Not all errors have been corrected.

## loading the dataframes

In [1]:
import pandas as pd 
import numpy as np
import sys
import os

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
# Nominatim geocoder client; each request is given a timeout of 2.
geolocator = Nominatim(timeout=2, user_agent="myGeolocator")

# Restore the previously saved frames from their pickle files in one pass:
#   errors_df          - rows whose geocoding is still being repaired
#   df                 - the private-property records
#   public_housing_df  - public housing (a commented-out line used to read it
#                        from 'JerseyCity_PublicHousing.csv'); it is handled
#                        separately until the two frames can be joined in some
#                        way that preserves both property types
errors_df, df, public_housing_df = (
    pd.read_pickle(pickle_name)
    for pickle_name in ('errors_df.pkl', 'jersey_city.pkl', 'public_housing_df.pkl')
)

In [2]:
# Make the project's helper module importable.
# The directory can be overridden with the LANDLORD_DATA_SRC environment variable
# so the notebook is not tied to one machine; the default is the original path.
helpers_dir = os.environ.get('LANDLORD_DATA_SRC', '/Users/kylereaves/src/landlord_data/')
# Re-running this cell must not keep growing sys.path.
if helpers_dir not in sys.path:
    sys.path.append(helpers_dir)

import helpers

In [13]:
def get_properties(owner, frame=None):
    """Return the street addresses of every row owned by ``owner``.

    Parameters
    ----------
    owner : str
        Value compared for equality against the ``owner_name`` column.
    frame : pandas.DataFrame, optional
        Frame to search; it must have ``owner_name`` and ``street_address``
        columns. Defaults to the notebook-level ``df``, looked up at call time.

    Returns
    -------
    list
        One ``street_address`` per matching row, in row order (an address is
        repeated if it appears on several rows); empty if nothing matches.
    """
    if frame is None:
        frame = df  # fall back to the notebook's global frame, as before
    return frame.loc[frame['owner_name'] == owner, 'street_address'].tolist()

In [129]:
# Attach to every row the list of all street addresses on rows with the same owner.
# A single groupby pass collects the addresses per owner (row order is kept inside
# each group); the earlier form re-scanned the whole frame once for every row.
# NOTE(review): this cell uses the snake_case column names, so on a fresh kernel it
# presumably only works after the rename cell further down has run - confirm order.
properties_by_owner = df.groupby('owner_name')['street_address'].agg(list).to_dict()
# list(...) gives each row its own copy; owners missing from the lookup get [].
df['list_properties_owned'] = [
    list(properties_by_owner.get(owner, [])) for owner in df['owner_name']
]

In [46]:
# Display the private-property frame (the notebook shows a truncated preview).
df

Unnamed: 0,street_address,owner_name,owner_mailing_address,city_state_zip,owner_full_mailing_address,property_full_address,number_properties_owned,units,g_code,latitude,longitude,list_properties_owned
0,677 LIBERTY AVE.,"PEDDI, PRADEEP",677 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","677 LIBERTY AVE., JERSEY CITY, N.J. 07307","677 LIBERTY AVE. Jersey City, NJ",1,1,"(677, Liberty Avenue, Jersey City, Hudson Coun...",40.759744,-74.050457,[677 LIBERTY AVE.]
1,675 LIBERTY AVE.,"PAREJA, HENRY A. & MIRYAM C.",675 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","675 LIBERTY AVE., JERSEY CITY, N.J. 07307","675 LIBERTY AVE. Jersey City, NJ",1,1,"(675, Liberty Avenue, Jersey City, Hudson Coun...",40.759679,-74.050499,[675 LIBERTY AVE.]
2,673 LIBERTY AVE.,"HIRPARA, PRAVIN",673 LIBERTY AVE.,"JERSEY CITY, NJ 07307","673 LIBERTY AVE., JERSEY CITY, NJ 07307","673 LIBERTY AVE. Jersey City, NJ",1,1,"(673, Liberty Avenue, Jersey City, Hudson Coun...",40.759614,-74.050542,[673 LIBERTY AVE.]
3,671 LIBERTY AVE.,"SINGH, HONEY",671 LIBERTY AVE.,"JERSEY CITY, NJ 07307","671 LIBERTY AVE., JERSEY CITY, NJ 07307","671 LIBERTY AVE. Jersey City, NJ",1,1,"(671, Liberty Avenue, Jersey City, Hudson Coun...",40.759549,-74.050584,[671 LIBERTY AVE.]
4,669 LIBERTY AVE.,"MARTIN, BENIGNO & CARIDAD",669 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","669 LIBERTY AVE., JERSEY CITY, N.J. 07307","669 LIBERTY AVE. Jersey City, NJ",1,1,"(669, Liberty Avenue, Jersey City, Hudson Coun...",40.759484,-74.050626,[669 LIBERTY AVE.]
...,...,...,...,...,...,...,...,...,...,...,...,...
42025,100 EAST SHEARWATER CT.,"BALANLAYOS, JESUS & MYRNA",31 HARMONY WAY,"SEWAREN, NJ 07077","31 HARMONY WAY, SEWAREN, NJ 07077","100 EAST SHEARWATER CT. Jersey City, NJ",2,1,"(100, East Shearwater Court, Jersey City, Huds...",40.686674,-74.071935,"[100 EAST SHEARWATER CT., 100 EAST SHEARWATER ..."
42026,100 EAST SHEARWATER CT.,"DANGCIL, PROCERFINA & WILLIAM A JR",100-51 E SHEARWATER CT.,"JERSEY CITY, NJ 07305","100-51 E SHEARWATER CT., JERSEY CITY, NJ 07305","100 EAST SHEARWATER CT. Jersey City, NJ",2,1,"(100, East Shearwater Court, Jersey City, Huds...",40.686674,-74.071935,"[100 EAST SHEARWATER CT., 100 EAST SHEARWATER ..."
42027,100 EAST SHEARWATER CT.,"LAWSONS REALTY,LLC",130 SHORE RD #198,"PORT WASHINGTON, NY 11050","130 SHORE RD #198, PORT WASHINGTON, NY 11050","100 EAST SHEARWATER CT. Jersey City, NJ",2,1,"(100, East Shearwater Court, Jersey City, Huds...",40.686674,-74.071935,"[100 EAST SHEARWATER CT., 100 EAST SHEARWATER ..."
42028,100 EAST SHEARWATER CT.,"NATHAN, NAVIN D.",100-54 SHEARWATER CT.,"JERSEY CITY, NJ 07305","100-54 SHEARWATER CT., JERSEY CITY, NJ 07305","100 EAST SHEARWATER CT. Jersey City, NJ",2,1,"(100, East Shearwater Court, Jersey City, Huds...",40.686674,-74.071935,"[100 EAST SHEARWATER CT., 100 EAST SHEARWATER ..."


In [6]:
# Let pandas choose a nullable dtype for each column (string, Int64, Float64, ...);
# columns it cannot convert stay as `object`. The result replaces `df`.
df = df.convert_dtypes(infer_objects=True)

In [7]:
# Switch the camelCase column names to snake_case, modifying df in place.
df.rename(
    columns=dict(
        propertyLocation='street_address',
        ownersName='owner_name',
        ownersMailingAddress='owner_mailing_address',
        cityStateZip='city_state_zip',
        ownersFullMailingAddress='owner_full_mailing_address',
        propertyFullAddress='property_full_address',
        propertiesOwned='number_properties_owned',
        gCode='g_code',
    ),
    inplace=True,
)

In [8]:
# Summarise the frame: index range, per-column non-null counts, dtypes, memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42030 entries, 0 to 38280
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   street_address              42030 non-null  string 
 1   owner_name                  42030 non-null  string 
 2   owner_mailing_address       42030 non-null  string 
 3   city_state_zip              42030 non-null  string 
 4   owner_full_mailing_address  42030 non-null  string 
 5   property_full_address       42030 non-null  string 
 6   number_properties_owned     42030 non-null  Int64  
 7   units                       42030 non-null  Int64  
 8   g_code                      42030 non-null  object 
 9   latitude                    42030 non-null  Float64
 10  longitude                   42030 non-null  Float64
dtypes: Float64(2), Int64(2), object(1), string(6)
memory usage: 4.0+ MB


In [15]:
# Attach to every row the list of all street addresses on rows with the same owner.
# A single groupby pass collects the addresses per owner (row order is kept inside
# each group); the earlier form re-scanned the whole frame once for every row.
properties_by_owner = df.groupby('owner_name')['street_address'].agg(list).to_dict()
# list(...) gives each row its own copy; owners missing from the lookup get [].
df['list_properties_owned'] = [
    list(properties_by_owner.get(owner, [])) for owner in df['owner_name']
]

In [10]:
# Replace the index with a fresh 0..n-1 range in place; drop=True discards the
# old labels instead of keeping them as a column.
df.reset_index(inplace=True, drop=True)

In [39]:
# Count the addresses in each row's ownership list.
# Iterating the column itself always yields exactly one value per row. The earlier
# range(0, df.index[-1]+1) only equals the row count when the index is a fresh
# 0..n-1 range; df.info() above reports 42030 rows with a last label of 38280,
# which would make the assignment fail with a length mismatch.
df['number_properties_owned'] = [len(owned) for owned in df['list_properties_owned']]

In [29]:
# Spot check: number of addresses in the last row's ownership list.
len(df.list_properties_owned.iloc[-1])

2

In [34]:
# Display the public-housing frame.
public_housing_df

Unnamed: 0,projectName,propertyLocation,managedBy,managerName,units,propertiesOwned,gCode,associatedProperties,lat,long
0,Marion Gardens,"57 Dales Avenue, Jersey City, NJ 07306-6807",Jersey City Housing Authority,Brenda Griffin,233,18,"(57, Dales Avenue, Marion, Jersey City, Hudson...","[57 Dales Avenue, Jersey City, NJ 07306-6807]",40.735895,-74.076336
1,Booker T. Washington Apartments,"200 Colden Street, Jersey City, 07302",Jersey City Housing Authority,Louranett George,319,18,"(200, Colden Street, Jersey City, Hudson Count...","[200 Colden Street, Jersey City, 07302]",40.718394,-74.058174
2,Hudson Gardens,"27 Palisades Avenue, Jersey City, 07306",Jersey City Housing Authority,Mariela Ramos,224,18,"(27, Palisades Avenue, Cresskill, Bergen Count...","[27 Palisades Avenue, Jersey City, 07306]",40.938779,-73.959089
3,Holland Gardens,"241 16th St, Jersey City, 07310",Jersey City Housing Authority,Elizabeth Nunez,192,18,"(241, 16th Street, Jersey City, Hudson County,...","[241 16th St, Jersey City, 07310, 88-92 Erie S...",40.733156,-74.043145
4,Curries Woods,"3 New Heckman Drive, Jersey City, 07305",Jersey City Housing Authority,Maria Serrano,91,18,"(3, New Heckman Drive, Jersey City, Hudson Cou...","[3 New Heckman Drive, Jersey City, 07305, 3 Ne...",40.690504,-74.099179
5,Curries Woods - Phase I Townhouses,"3 New Heckman Drive, Jersey City, 07305",Jersey City Housing Authority,Maria Serrano,46,18,"(3, New Heckman Drive, Jersey City, Hudson Cou...","[3 New Heckman Drive, Jersey City, 07305, 3 Ne...",40.690504,-74.099179
6,Curries Woods - Phase II Townhouses,"3 New Heckman Drive, Jersey City, 07305",Jersey City Housing Authority,Maria Serrano,20,18,"(3, New Heckman Drive, Jersey City, Hudson Cou...","[3 New Heckman Drive, Jersey City, 07305, 3 Ne...",40.690504,-74.099179
7,Curries Woods - Phase III Townhouses,"3 New Heckman Drive, Jersey City, 07305",Jersey City Housing Authority,Maria Serrano,18,18,"(3, New Heckman Drive, Jersey City, Hudson Cou...","[3 New Heckman Drive, Jersey City, 07305, 3 Ne...",40.690504,-74.099179
8,Curries Woods - Phase IV Townhouses,"3 New Heckman Drive, Jersey City, 07305",Jersey City Housing Authority,Maria Serrano,40,18,"(3, New Heckman Drive, Jersey City, Hudson Cou...","[3 New Heckman Drive, Jersey City, 07305, 3 Ne...",40.690504,-74.099179
9,Curries Woods - Phase V Townhouses,"3 New Heckman Drive, Jersey City, 07305",Jersey City Housing Authority,Maria Serrano,80,18,"(3, New Heckman Drive, Jersey City, Hudson Cou...","[3 New Heckman Drive, Jersey City, 07305, 3 Ne...",40.690504,-74.099179


In [35]:
# public_housing_df['gCode'] = public_housing_df.propertyLocation.apply(geolocator.geocode)

# def getAssociatedProperties(manager_name=str):
#    return public_housing_df[public_housing_df.managerName == manager_name]['propertyLocation'].tolist()

# public_housing_df['associatedProperties'] = [getAssociatedProperties(manager) for manager in public_housing_df.managerName]
# public_housing_df['lat'] = [g.latitude for g in public_housing_df.gCode]
# public_housing_df['long'] = [g.longitude for g in public_housing_df.gCode]
# public_housing_df.to_pickle('public_housing_df.pkl')

### Check whether any address was geocoded to a location outside 'Jersey City';
### if so, append those rows to errors_df.

In [44]:
# Rows whose geocoder result does not mention 'Jersey City'.
# g_code is an `object` column (see df.info() above), presumably holding geocoder
# result objects rather than plain strings - TODO confirm. `.str.contains` returns
# NaN for non-string elements, so the earlier `== False` comparison could never be
# true and the filter was always empty. Converting to text first makes the check
# effective; a missing geocode becomes 'None'/'nan' and is therefore listed too.
df[~df['g_code'].astype(str).str.contains('Jersey City', regex=False)]

Unnamed: 0,street_address,owner_name,owner_mailing_address,city_state_zip,owner_full_mailing_address,property_full_address,number_properties_owned,units,g_code,latitude,longitude,list_properties_owned


In [45]:
# Display the rows currently held in the errors frame.
errors_df

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units,gCode
19294,10 PAULMIER PL.,"DOWLING, JAMES PATRICK & TOBY K.",10 PAULMIER PL,"JERSEY CITY, N J 07302","10 PAULMIER PL, JERSEY CITY, N J 07302","10 PAULMIER PL. Jersey City, NJ",1,1,
19691,280 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,280 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA Jersey City, NJ",2,1,
19692,270 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,270 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA Jersey City, NJ",2,1,
32583,32 WILKINSON ST.,"HARIKRISHAN, HARRY",23 TERHUNE AVE.,"JERSEY CITY, NJ 07305","23 TERHUNE AVE., JERSEY CITY, NJ 07305","32 WILKINSON ST. Jersey City, NJ",1,1,


## helper functions

In [None]:
# # number of times property appears in first column
# def propertyAppearances(address=str): 
#     return len(df[df.propertyLocation == address])
# #
# def getPropertiesOwned(owner=str):
#     return df[df.ownersName == owner]['propertyLocation'].unique().tolist()
# df['ownersFullMailingAddress'] = df['ownersMailingAddress'] + ', ' + df.cityStateZip
# df['propertyFullAddress'] = df.propertyLocation + ', ' + 'Jersey City, NJ'


In [None]:
# Positional indices (suitable for .iloc) of the rows that have no geocode.
# `is None` tests identity; `== None` would defer to each element's own __eq__.
geocode_errors = [pos for pos, code in enumerate(errors_df['gCode']) if code is None]

In [None]:
# Show the rows at the positions collected in geocode_errors.
errors_df.iloc[geocode_errors]

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units,gCode
19294,10 PAULMIER PL.,"DOWLING, JAMES PATRICK & TOBY K.",10 PAULMIER PL,"JERSEY CITY, N J 07302","10 PAULMIER PL, JERSEY CITY, N J 07302","10 PAULMIER PL. Jersey City, NJ",1,1,
19691,280 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,280 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA Jersey City, NJ",2,1,
19692,270 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,270 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA Jersey City, NJ",2,1,
32583,32 WILKINSON ST.,"HARIKRISHAN, HARRY",23 TERHUNE AVE.,"JERSEY CITY, NJ 07305","23 TERHUNE AVE., JERSEY CITY, NJ 07305","32 WILKINSON ST. Jersey City, NJ",1,1,
38254,107 SHEARWATER CT.,"SANTANIELLO III, PHILLIP & CLAIRE",107 SHEARWATER CT.,"JERSEY CITY, NJ 07305","107 SHEARWATER CT., JERSEY CITY, NJ 07305","107 SHEARWATER CT. Jersey City, NJ",1,1,


In [None]:
# Street-address substitutions (text to find -> replacement) gathered while fixing
# addresses that failed to geocode. Plain keys are literal text; the raw-string
# keys are regular expressions. If the entries are applied one after another, order
# matters for some of them (e.g. 'GLEN' -> 'GLENN' is followed by 'GLENNN' -> 'GLENN').
# NOTE(review): nothing in the visible notebook consumes this dict; the same
# substitutions appear as commented-out str.replace calls in the following cell.
# Four keys had been corrupted (')' turned into ',' and '{2,}' into '{2:}'), which
# left one pattern that does not compile and others that match the wrong text;
# they are restored here to the forms used in those str.replace calls.
jersey_city_replacement = {
    'TONNELE': 'TONNELLE',
    'MC ADOO': 'MCADOO',
    'M.L. KING DRIVE': 'MARTIN LUTHER KING DRIVE',
    'COLUMBUS': 'CHRISTOPHER COLUMBUS',
    'FIRST': '1st',
    'SECOND': '2nd',
    'THIRD': '3rd',
    'FOURTH': '4th',
    'FIFTH': '5th',
    'SIXTH': '6th',
    'SEVENTH': '7th',
    'EIGHTH': '8th',
    'NINETH': '9th',
    'TENTH': '10th',
    'NINTH': '9th',
    'FIFTEENTH': '15th',
    'SIXTEENTH': '16th',
    'MC DOUGALL': 'MCDOUGALL',
    'CARPENTIER': 'CARPENTER',
    'FOX HOUND': 'FOXHOUND',
    'SIEDLER': 'SEIDLER',
    'LIENAU': 'LINEAU',
    'KENNEDY': 'JOHN F KENNEDY',
    'LIBERTY ST.': 'LIBERTY AVE.',
    '-REAR': '',
    '(REAR)': '',
    '-FRONT': '',
    'HAMPTON CT.': 'HAMPTON COURT',
    'VARCK': 'VARICK',
    'THO': '',
    'AVE AVE': 'AVE',
    'TER': 'TR',
    '3633A': '3633',
    '3144A': '3144',
    'SHEARWATR': 'SHEARWATER',
    'PALUSL': "PAUL'S",
    'COURT TR': "CT",
    'EASTVIEW': "E VIEW",
    'M.L. KING DR.': 'MARTIN LUTHER KING DR.',
    ' 1ST FL.': '',
    ' SO.': '',
    ' N.': '',
    'ODGEN': 'OGDEN',
    'PATRSON': 'PATERSON',
    ' 61 COLES': '',
    'VAN HOUTEN ST.': 'VAN HOUTEN AVE.',
    'MC PHERSON ST.': 'MCPHERSON PL.',
    'MC PHERSON': 'MCPHERSON',
    'FIR ROAD': 'FIR ST.',
    'UNION AVE.': 'UNION ST.',
    'OLD BERGEN': 'OLD BERGEN AVE.',
    'AVE. AVE.': 'AVE.',
    'JACKSON AVE.': 'JACKSON ST.',
    '162 OLD BERGEN AVE. AVE.': '162 OLD BERGEN AVE.',
    'GLEN': 'GLENN',
    'GLENNN': 'GLENN',
    'DR.VE': 'DRIVE',
    '1 GLENNN LANE': '1 GLENN LANE',
    r'\(\)': '',                          # empty parentheses left behind
    r'-\d{2,}(.?\d{1}(A)?)?A?\b': '',     # '-NN...' suffix, optionally followed by one more character, a digit and 'A'
    r'-\d{2,}': '',                       # any remaining '-NN...' suffix
    r'\s#\d': '',                         # whitespace, '#', one digit
    'BUTTRNUT': 'BUTTERNUT',
    'CARPENTR': 'CARPENTER',
    '100 SHEARWATER CT.': '100 EAST SHEARWATER CT.',
    '78A RNE ST.': '78 THORNE ST.',
}


In [None]:
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('TONNELE', 'TONNELLE')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('MC ADOO', 'MCADOO')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('M.L. KING DRIVE', 'MARTIN LUTHER KING DRIVE')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('COLUMBUS', 'CHRISTOPHER COLUMBUS')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FIRST', '1st')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SECOND', '2nd')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('THIRD', '3rd')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FOURTH', '4th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FIFTH', '5th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SIXTH', '6th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SEVENTH', '7th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('EIGHTH', '8th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('NINETH', '9th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('TENTH', '10th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('NINTH', '9th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FIFTEENTH', '15th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SIXTEENTH', '16th')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('MC DOUGALL', 'MCDOUGALL')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('CARPENTIER', 'CARPENTER')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FOX HOUND', 'FOXHOUND')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SIEDLER', 'SEIDLER')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('LIENAU', 'LINEAU')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('KENNEDY', 'JOHN F KENNEDY')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('LIBERTY ST.', 'LIBERTY AVE.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('-REAR', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('(REAR)', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('-FRONT', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('HAMPTON CT.', 'HAMPTON COURT')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('VARCK', 'VARICK')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('THO', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('AVE AVE', 'AVE')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('TER', 'TR')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('3633A', '3633')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('3144A', '3144')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SHEARWATR', 'SHEARWATER')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('PALUSL', "PAUL'S")
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('COURT TR', "CT")
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('EASTVIEW', "E VIEW")
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('M.L. KING DR.', 'MARTIN LUTHER KING DR.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(' 1ST FL.', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(' SO.', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(' N.', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('ODGEN', 'OGDEN')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('PATRSON', 'PATERSON')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(' 61 COLES', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('VAN HOUTEN ST.', 'VAN HOUTEN AVE.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('MC PHERSON ST.', 'MCPHERSON PL.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('MC PHERSON', 'MCPHERSON')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FIR ROAD', 'FIR ST.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('UNION AVE.', 'UNION ST.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('OLD BERGEN', 'OLD BERGEN AVE.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('AVE. AVE.', 'AVE.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('JACKSON AVE.', 'JACKSON ST.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('162 OLD BERGEN AVE. AVE.', '162 OLD BERGEN AVE.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('GLEN', 'GLENN')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('GLENNN', 'GLENN')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('DR.VE', 'DRIVE')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('1 GLENNN LANE', '1 GLENN LANE')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(r'\(\)', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(r'-\d{2,}(.?\d{1}(A)?)?A?\b', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(r'-\d{2,}', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace(r'\s#\d', '')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('BUTTRNUT', 'BUTTERNUT')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('CARPENTR', 'CARPENTER')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('100 SHEARWATER CT.', '100 EAST SHEARWATER CT.')
# errors_df.propertyLocation = errors_df.propertyLocation.str.replace('78A RNE ST.', '78 THORNE ST.')

# Rebuild the full address from the street address, replacing whatever the
# column held before.
errors_df['propertyFullAddress'] = errors_df['propertyLocation'] + ' Jersey City, NJ'

In [None]:
%%time
errors_df['gCode'] = errors_df.propertyFullAddress.apply(geolocator.geocode)

CPU times: user 21.3 ms, sys: 6.94 ms, total: 28.2 ms
Wall time: 5.52 s


In [None]:
# Positional indices (suitable for .iloc) of the rows that have no geocode.
# `is None` tests identity; `== None` would defer to each element's own __eq__.
geocode_errors = [pos for pos, code in enumerate(errors_df['gCode']) if code is None]

In [None]:
# Distinct street addresses among the rows at the positions in geocode_errors.
errors_df.iloc[geocode_errors].propertyLocation.unique()

array(['10 PAULMIER PL.', '280 GREGORY PARK PLAZA',
       '270 GREGORY PARK PLAZA', '32 WILKINSON ST.'], dtype=object)

Checking to see how many unique properties remain in the errors dataframe:

In [None]:
# Number of distinct street addresses among the rows at the positions in geocode_errors.
len(errors_df.iloc[geocode_errors].propertyLocation.unique())

4

In [None]:
# Positional indices (suitable for .iloc) of the rows that have no geocode.
# `is None` tests identity; `== None` would defer to each element's own __eq__.
geocode_errors = [pos for pos, code in enumerate(errors_df['gCode']) if code is None]

In [None]:
# Show the rows at the positions collected in geocode_errors.
errors_df.iloc[geocode_errors]

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units,gCode
19294,10 PAULMIER PL.,"DOWLING, JAMES PATRICK & TOBY K.",10 PAULMIER PL,"JERSEY CITY, N J 07302","10 PAULMIER PL, JERSEY CITY, N J 07302","10 PAULMIER PL. Jersey City, NJ",1,1,
19691,280 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,280 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA Jersey City, NJ",2,1,
19692,270 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,270 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA Jersey City, NJ",2,1,
32583,32 WILKINSON ST.,"HARIKRISHAN, HARRY",23 TERHUNE AVE.,"JERSEY CITY, NJ 07305","23 TERHUNE AVE., JERSEY CITY, NJ 07305","32 WILKINSON ST. Jersey City, NJ",1,1,


In [None]:
# Keep only the rows that still have no geocode (independent copy).
# The mask is computed here rather than reusing the positions stored in
# `geocode_errors` by another cell: those positions go stale as soon as errors_df
# changes, so re-running this cell could select the wrong rows or fail.
errors_df = errors_df[errors_df['gCode'].isna()].copy()

In [None]:
# Display the reduced errors frame.
errors_df

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units,gCode
19294,10 PAULMIER PL.,"DOWLING, JAMES PATRICK & TOBY K.",10 PAULMIER PL,"JERSEY CITY, N J 07302","10 PAULMIER PL, JERSEY CITY, N J 07302","10 PAULMIER PL. Jersey City, NJ",1,1,
19691,280 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,280 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","280 GREGORY PARK PLAZA Jersey City, NJ",2,1,
19692,270 GREGORY PARK PLAZA,METROPOLIS TOWERS APT. CORP. MGT.,270 GREGORY PARK PLAZA,"JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA, JERSEY CITY, NJ 07302","270 GREGORY PARK PLAZA Jersey City, NJ",2,1,
32583,32 WILKINSON ST.,"HARIKRISHAN, HARRY",23 TERHUNE AVE.,"JERSEY CITY, NJ 07305","23 TERHUNE AVE., JERSEY CITY, NJ 07305","32 WILKINSON ST. Jersey City, NJ",1,1,


In [None]:
# Save the errors frame to 'errors_df.pkl', the file it is loaded from in the first cell.
errors_df.to_pickle('errors_df.pkl')

In [None]:
# Split each geocoder result into numeric coordinates.
# The geocode column is named 'g_code' once the rename cell has run and 'gCode'
# before that; accept either so this cell does not depend on execution order.
geocode_column = 'g_code' if 'g_code' in df.columns else 'gCode'
# A row without a geocode (None) has no coordinates; record NaN instead of
# failing on the attribute access.
df['latitude'] = [np.nan if g is None else g.latitude for g in df[geocode_column]]
df['longitude'] = [np.nan if g is None else g.longitude for g in df[geocode_column]]

In [None]:
# Distinct street addresses left in the errors frame.
errors_df.propertyLocation.unique()

array(['10 PAULMIER PL.', '280 GREGORY PARK PLAZA',
       '270 GREGORY PARK PLAZA', '32 WILKINSON ST.'], dtype=object)

In [None]:
# Rebuild the full address, this time with a comma between the street address
# and the city (the earlier rebuild joined them with a space only).
errors_df['propertyFullAddress'] = errors_df.propertyLocation + ', Jersey City, NJ'

In [None]:
%%time
errors_df['gCode'] = errors_df.propertyFullAddress.apply(geolocator.geocode)

CPU times: user 16.2 ms, sys: 3.35 ms, total: 19.5 ms
Wall time: 2.84 s


In [None]:
# Reload the private-property frame from 'jersey_city.pkl'. This rebinds `df`, so
# any in-memory changes that were not written back to that file are no longer
# reachable through `df`.
# NOTE(review): the comment here used to read "pickling the dataframe"; if saving
# was the intent, this should be df.to_pickle('jersey_city.pkl') instead - confirm.
df = pd.read_pickle('jersey_city.pkl')

In [None]:
# Export the frame to CSV; the file name records that only private property
# is included.
df.to_csv('jersey_city_private_property.csv')

In [None]:
# List the column names of the frame.
df.columns

Index(['propertyLocation', 'ownersName', 'ownersMailingAddress',
       'cityStateZip', 'ownersFullMailingAddress', 'propertyFullAddress',
       'propertiesOwned', 'units', 'gCode', 'latitude', 'longitude'],
      dtype='object')