## Cleaning and Normalization

In [1]:
import pandas as pd 
import numpy as np

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
# for returning lat long coordinates from addresses
geolocator = Nominatim(user_agent="myGeolocator", timeout=2)


# since we've pickled the dataframe after attempting to geocode…
# …we'll read the pickle and not the csv
#df = pd.read_csv('jersey_city_residential.csv')

df = pd.read_pickle('JC_residential.pkl')

In [6]:
df

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units,gCode
0,677 LIBERTY AVE.,"PEDDI, PRADEEP",677 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","677 LIBERTY AVE., JERSEY CITY, N.J. 07307","677 LIBERTY AVE. Jersey City, NJ",1,1,"(677, Liberty Avenue, Jersey City, Hudson Coun..."
1,675 LIBERTY AVE.,"PAREJA, HENRY A. & MIRYAM C.",675 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","675 LIBERTY AVE., JERSEY CITY, N.J. 07307","675 LIBERTY AVE. Jersey City, NJ",1,1,"(675, Liberty Avenue, Jersey City, Hudson Coun..."
2,673 LIBERTY AVE.,"HIRPARA, PRAVIN",673 LIBERTY AVE.,"JERSEY CITY, NJ 07307","673 LIBERTY AVE., JERSEY CITY, NJ 07307","673 LIBERTY AVE. Jersey City, NJ",1,1,"(673, Liberty Avenue, Jersey City, Hudson Coun..."
3,671 LIBERTY AVE.,"SINGH, HONEY",671 LIBERTY AVE.,"JERSEY CITY, NJ 07307","671 LIBERTY AVE., JERSEY CITY, NJ 07307","671 LIBERTY AVE. Jersey City, NJ",1,1,"(671, Liberty Avenue, Jersey City, Hudson Coun..."
4,669 LIBERTY AVE.,"MARTIN, BENIGNO & CARIDAD",669 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","669 LIBERTY AVE., JERSEY CITY, N.J. 07307","669 LIBERTY AVE. Jersey City, NJ",1,1,"(669, Liberty Avenue, Jersey City, Hudson Coun..."
...,...,...,...,...,...,...,...,...,...
41799,98 GARFIELD AVE.,"SWINNEY, BRENDA",98 GARFIELD AVE.,"JERSEY CITY, N.J. 07305","98 GARFIELD AVE., JERSEY CITY, N.J. 07305","98 GARFIELD AVE. Jersey City, NJ",2,1,"(98, Garfield Avenue, Communipaw, Jersey City,..."
41800,100 GARFIELD AVE.,"SERVIDA, AUREA",100 GARFIELD AVE.,"JERSEY CITY, NJ 07305","100 GARFIELD AVE., JERSEY CITY, NJ 07305","100 GARFIELD AVE. Jersey City, NJ",1,1,"(100, Garfield Avenue, Jersey City, Hudson Cou..."
41801,102 GARFIELD AVE.,"DIAZ, ANGEL & GALADITA",5405 STEVEHAVEN LN.,"CUMMING , GA. 30028","5405 STEVEHAVEN LN., CUMMING , GA. 30028","102 GARFIELD AVE. Jersey City, NJ",1,1,"(102, Garfield Avenue, Jersey City, Hudson Cou..."
41802,104 GARFIELD AVE.,"RODRIGUEZ, BENJAMIN",104 GARFIELD AVE.,"JERSEY CITY, N.J. 07305","104 GARFIELD AVE., JERSEY CITY, N.J. 07305","104 GARFIELD AVE. Jersey City, NJ",1,1,"(104, Garfield Avenue, Jersey City, Hudson Cou..."


In [13]:
# number of times owner appears in second column
def propertiesOwned(owner=str):
    return len(df[df.ownersName == owner])

# number of times property appears in first column
def propertyAppearances(address=str): 
    return len(df[df.propertyLocation == address])


df['ownersFullMailingAddress'] = df['ownersMailingAddress'] + ', ' + df.cityStateZip
df['propertyFullAddress'] = df.propertyLocation + ' Jersey City, NJ'

# we've already added these columns, so we'll comment them out
#df['propertiesOwned'] = [propertiesOwned(owner) for owner in df.ownersName]
#df['units'] = [propertyAppearances(address) for address in df.propertyLocation]
#df['gCode'] = df.propertyFullAddress.apply(geolocator.geocode)

In [14]:
# number of addresses address which did not geocode
geocode_errors = [i for i,e in enumerate(df.gCode) if e == None]

In [15]:
# plug them into iloc to return a dataframe of errors
df.iloc[geocode_errors]['propertyFullAddress'].head(50)

12                653 LIBERTY ST. Jersey City, NJ
112     3709 JOHN F KENNEDY BLVD. Jersey City, NJ
113     3707 JOHN F KENNEDY BLVD. Jersey City, NJ
114     3705 JOHN F KENNEDY BLVD. Jersey City, NJ
115     3703 JOHN F KENNEDY BLVD. Jersey City, NJ
175     3697 JOHN F KENNEDY BLVD. Jersey City, NJ
177     3691 JOHN F KENNEDY BLVD. Jersey City, NJ
178     3687 JOHN F KENNEDY BLVD. Jersey City, NJ
179     3685 JOHN F KENNEDY BLVD. Jersey City, NJ
180     3683 JOHN F KENNEDY BLVD. Jersey City, NJ
181     3683 JOHN F KENNEDY BLVD. Jersey City, NJ
182     3683 JOHN F KENNEDY BLVD. Jersey City, NJ
183     3683 JOHN F KENNEDY BLVD. Jersey City, NJ
184     3683 JOHN F KENNEDY BLVD. Jersey City, NJ
185     3683 JOHN F KENNEDY BLVD. Jersey City, NJ
263     3675 JOHN F KENNEDY BLVD. Jersey City, NJ
264     3673 JOHN F KENNEDY BLVD. Jersey City, NJ
265     3671 JOHN F KENNEDY BLVD. Jersey City, NJ
266     3669 JOHN F KENNEDY BLVD. Jersey City, NJ
268     3663 JOHN F KENNEDY BLVD. Jersey City, NJ


In [52]:
# the reason why this didn't gecode properly is because there is no
# Liberty Street in Jersey City
df.propertyFullAddress[12]

'653 LIBERTY ST. Jersey City, NJ'

In [56]:
geolocator.geocode('653 LIBERTY AVE, Jersey City, NJ')

Location(653, Liberty Avenue, Jersey City, Hudson County, New Jersey, 07307, United States, (40.75896345454545, -74.05096481818181, 0.0))

In [63]:
# There is no "Kennedy BLVD" but a John F Kennedy BLVD
df.propertyFullAddress[112:115]

112    3709 KENNEDY BLVD. Jersey City, NJ
113    3707 KENNEDY BLVD. Jersey City, NJ
114    3705 KENNEDY BLVD. Jersey City, NJ
Name: propertyFullAddress, dtype: object

In [71]:
#the hyphenated address threw off the geolocator
geolocator.geocode('274 COLUMBIA AVE Jersey City, NJ')

Location(274, Columbia Avenue, Jersey City, Hudson County, New Jersey, 07307, United States, (40.75704381395349, -74.05096265116279, 0.0))

In [8]:
# the geolocator needs to the full street name
geolocator.geocode('3709 JOHN F KENNEDY BLVD. Jersey City, NJ')

Location(John F. Kennedy Boulevard, Greenville, Jersey City, Hudson County, New Jersey, 07305, United States, (40.70124, -74.093112, 0.0))

In [None]:
df.propertyLocation = df.propertyLocation.str.replace('KENNEDY', 'JOHN F KENNEDY')

In [17]:
df.propertyLocation = df.propertyLocation.str.replace('LIBERTY ST.', 'LIBERTY AVE.')

  df.propertyLocation = df.propertyLocation.str.replace('LIBERTY ST.', 'LIBERTY AVE.')


In [20]:
df.iloc[geocode_errors]['gCode'] = df.iloc[geocode_errors]['propertyFullAddress'].apply(geolocator.geocode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [72]:
# pickling the dataframe to avoid waiting hours for the geocodes again
# df.to_pickle('JC_residential.pkl')

In [29]:
geocode_errors = [i for i,e in enumerate(df.gCode) if e == None]

In [31]:
len(geocode_errors)

4174

In [33]:
# since we got an error saying we're writing on the dataframe…
# we'll creata a separate df of the errors and merge with the original df later
errors_df = df.iloc[geocode_errors].copy()

In [34]:
errors_df['gCode'] = errors_df.propertyFullAddress.apply(geolocator.geocode)

In [66]:
geocode_errors = [i for i,e in enumerate(errors_df.gCode) if e == None]

In [39]:
# Yay! We've solved some errors!
len(geocode_errors)

3149

In [41]:
errors_df.iloc[geocode_errors].head(50)

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units,gCode
12,653 LIBERTY AVE.,"CICCARELLI, MARC",308 LAGOON DR.,"COPIAQUE, NY 11726","308 LAGOON DR., COPIAQUE, NY 11726","653 LIBERTY ST. Jersey City, NJ",1,1,
648,274-76 COLUMBIA AVE,"PERSAUD, PURAN & ANJAALI",274-276 COLUMBIA AVENUE,"JERSEY CITY,N.J. 07307","274-276 COLUMBIA AVENUE, JERSEY CITY,N.J. 07307","274-76 COLUMBIA AVE Jersey City, NJ",6,1,
797,3633A JOHN F KENNEDY BLVD.,"SCALIA HOLDINGS, LLC",264 CENTRAL AVE.,"JERSEY CITY, NJ 07307","264 CENTRAL AVE., JERSEY CITY, NJ 07307","3633A JOHN F KENNEDY BLVD. Jersey City, NJ",5,1,
1667,1074-1080 SUMMIT AVE,"1074 SUMMIT, LLC.",10 FULTON ST,"WEEHAWKEN, NJ 07086","10 FULTON ST, WEEHAWKEN, NJ 07086","1074-1080 SUMMIT AVE Jersey City, NJ",1,1,
1821,19 PATERSON STREET -REAR,"BIRN, SUSAN",19 PATERSON STREET-REAR,"JERSEY CITY, N.J. 07307","19 PATERSON STREET-REAR, JERSEY CITY, N.J. 07307","19 PATERSON STREET -REAR Jersey City, NJ",1,1,
2529,78A-80 THORNE ST.,"MITTAL,CHITRA KOTHARI & DEEPAK",76 CONSTITUTION WAY,"JERSEY CITY, N.J. 07302","76 CONSTITUTION WAY, JERSEY CITY, N.J. 07302","78A-80 THORNE ST. Jersey City, NJ",1,1,
3427,430 ODGEN AVE.,"DEVIRGILLO, DAMIAN","430 OGDEN AVE., UNIT 3","JERSEY CITY, NJ 07307","430 OGDEN AVE., UNIT 3, JERSEY CITY, NJ 07307","430 ODGEN AVE. Jersey City, NJ",1,1,
3511,608 TONNELE AVE.,"ADVANCE TRANSIT, LLC.",608 TONNELE AVE.,"JERSEY CITY, NJ 07307","608 TONNELE AVE., JERSEY CITY, NJ 07307","608 TONNELE AVE. Jersey City, NJ",1,1,
3512,636 TONNELE AVE.,"OFRZ & OZAN SHELL, INC.",137-68 70TH AVE,"FLUSHING , N.Y. 11367","137-68 70TH AVE, FLUSHING , N.Y. 11367","636 TONNELE AVE. Jersey City, NJ",1,1,
4240,978-80 SUMMIT AVE.,978-980 SUMMIT AVE. KBB LLC.,978-980 SUMMIT AVE.,"JERSEY CITY, NJ 07302","978-980 SUMMIT AVE., JERSEY CITY, NJ 07302","978-80 SUMMIT AVE. Jersey City, NJ",1,1,


In [59]:
# There is no 'Tonnele Ave' – it's 'Tonnelle'
geolocator.geocode('LIENAU PL., Jersey City, NJ')

In [60]:
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('TONNELE', 'TONNELLE')

In [83]:
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('MC ADOO', 'MCADOO')

In [84]:
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('M.L. KING DRIVE', 'MARTIN LUTHER KING DRIVE')

  errors_df.propertyLocation = errors_df.propertyLocation.str.replace('M.L. KING DRIVE', 'MARTIN LUTHER KING DRIVE')


In [111]:
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('COLUMBUS', 'CHRISTOPHER COLUMBUS')

In [112]:
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FIRST', '1st')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SECOND', '2nd')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('THIRD', '3rd')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FOURTH', '4th')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('FIFTH', '5th')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SIXTH', '6th')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('SEVENTH', '7th')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('EIGHTH', '8th')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('NINETH', '9th')
errors_df.propertyLocation = errors_df.propertyLocation.str.replace('TENTH', '10th')

In [113]:
errors_df.propertyFullAddress = errors_df.propertyLocation + ', Jersey City, NJ'

In [114]:
errors_df['gCode'] = errors_df.propertyFullAddress.apply(geolocator.geocode)

In [115]:
geocode_errors = [i for i,e in enumerate(errors_df.gCode) if e == None]

In [116]:
errors_df.iloc[geocode_errors]

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units,gCode
648,274-76 COLUMBIA AVE,"PERSAUD, PURAN & ANJAALI",274-276 COLUMBIA AVENUE,"JERSEY CITY,N.J. 07307","274-276 COLUMBIA AVENUE, JERSEY CITY,N.J. 07307","274-76 COLUMBIA AVE, Jersey City, NJ",6,1,
797,3633A JOHN F KENNEDY BLVD.,"SCALIA HOLDINGS, LLC",264 CENTRAL AVE.,"JERSEY CITY, NJ 07307","264 CENTRAL AVE., JERSEY CITY, NJ 07307","3633A JOHN F KENNEDY BLVD., Jersey City, NJ",5,1,
1667,1074-1080 SUMMIT AVE,"1074 SUMMIT, LLC.",10 FULTON ST,"WEEHAWKEN, NJ 07086","10 FULTON ST, WEEHAWKEN, NJ 07086","1074-1080 SUMMIT AVE, Jersey City, NJ",1,1,
1821,19 PATERSON STREET -REAR,"BIRN, SUSAN",19 PATERSON STREET-REAR,"JERSEY CITY, N.J. 07307","19 PATERSON STREET-REAR, JERSEY CITY, N.J. 07307","19 PATERSON STREET -REAR, Jersey City, NJ",1,1,
2529,78A-80 THORNE ST.,"MITTAL,CHITRA KOTHARI & DEEPAK",76 CONSTITUTION WAY,"JERSEY CITY, N.J. 07302","76 CONSTITUTION WAY, JERSEY CITY, N.J. 07302","78A-80 THORNE ST., Jersey City, NJ",1,1,
...,...,...,...,...,...,...,...,...,...
38815,4 CONSTELLATION PL.,"PRAKASH, NIKHIL","4 CONSTELLATION PL.,#404","JERSEY CITY, NJ 07305","4 CONSTELLATION PL.,#404, JERSEY CITY, NJ 07305","4 CONSTELLATION PL., Jersey City, NJ",1,29,
38817,4 CONSTELLATION PL.,"DOBSON, MABLE E & TANYA",2 BUCKS LN.,"MARLBORO, NJ 07746","2 BUCKS LN., MARLBORO, NJ 07746","4 CONSTELLATION PL., Jersey City, NJ",1,29,
38820,4 CONSTELLATION PL.,"BANDYOPADHYAY, RAHUL",806 MORRIS TPKE APT 1A,"SHORT HILLS, NJ 07078","806 MORRIS TPKE APT 1A, SHORT HILLS, NJ 07078","4 CONSTELLATION PL., Jersey City, NJ",1,29,
40216,12 DANFORTH AVE. THO,"RIVERA, CARMEN & MAGDALENA",12 DANFORTH AVE.,"JERSEY CITY, NJ 07305","12 DANFORTH AVE., JERSEY CITY, NJ 07305","12 DANFORTH AVE. THO, Jersey City, NJ",1,1,


In [117]:
errors_df.to_pickle('errors_df.pkl')