## Cleaning and Normalization

In [1]:
import pandas as pd 
import numpy as np

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
# Nominatim geocoder client, used for returning lat/long coordinates from
# addresses; each request is given a 2-second timeout
geolocator = Nominatim(user_agent="myGeolocator", timeout=2)

# load the property/owner records from a CSV file in the working directory
df = pd.read_csv('jersey_city_residential.csv')

In [2]:
# preview the data as loaded from the CSV
df

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip
0,677 LIBERTY AVE.,"PEDDI, PRADEEP",677 LIBERTY AVE.,"JERSEY CITY, N.J. 07307"
1,675 LIBERTY AVE.,"PAREJA, HENRY A. & MIRYAM C.",675 LIBERTY AVE.,"JERSEY CITY, N.J. 07307"
2,673 LIBERTY AVE.,"HIRPARA, PRAVIN",673 LIBERTY AVE.,"JERSEY CITY, NJ 07307"
3,671 LIBERTY AVE.,"SINGH, HONEY",671 LIBERTY AVE.,"JERSEY CITY, NJ 07307"
4,669 LIBERTY AVE.,"MARTIN, BENIGNO & CARIDAD",669 LIBERTY AVE.,"JERSEY CITY, N.J. 07307"
...,...,...,...,...
41799,98 GARFIELD AVE.,"SWINNEY, BRENDA",98 GARFIELD AVE.,"JERSEY CITY, N.J. 07305"
41800,100 GARFIELD AVE.,"SERVIDA, AUREA",100 GARFIELD AVE.,"JERSEY CITY, NJ 07305"
41801,102 GARFIELD AVE.,"DIAZ, ANGEL & GALADITA",5405 STEVEHAVEN LN.,"CUMMING , GA. 30028"
41802,104 GARFIELD AVE.,"RODRIGUEZ, BENJAMIN",104 GARFIELD AVE.,"JERSEY CITY, N.J. 07305"


In [3]:
# number of times property appears in first column
def propertyAppearances(address: str) -> int:
    """Count the rows of the global ``df`` whose ``propertyLocation`` equals ``address``.

    Args:
        address: Property location string to look for (exact match).

    Returns:
        Number of matching rows; 0 when the address does not occur.
    """
    # Summing the boolean mask counts the matches without materialising a
    # filtered copy of the frame.
    return int((df.propertyLocation == address).sum())

In [4]:
# number of times owner appears in second column
def propertiesOwned(owner: str) -> int:
    """Count the rows of the global ``df`` whose ``ownersName`` equals ``owner``.

    Args:
        owner: Owner name string to look for (exact match).

    Returns:
        Number of matching rows; 0 when the owner does not occur.
    """
    # Summing the boolean mask counts the matches without materialising a
    # filtered copy of the frame.
    return int((df.ownersName == owner).sum())

In [5]:
# owner's mailing address as one string: "<street>, <city/state/zip>"
df['ownersFullMailingAddress'] = (
    df['ownersMailingAddress'] + ', ' + df['cityStateZip']
)
# every property is in Jersey City, so append the city/state to its street address
df['propertyFullAddress'] = df['propertyLocation'] + ' Jersey City, NJ'

In [6]:
%%time 
df['propertiesOwned'] = [propertiesOwned(owner) for owner in df.ownersName]
df['units'] = [propertyAppearances(address) for address in df.propertyLocation]

CPU times: user 3min 23s, sys: 2.84 s, total: 3min 26s
Wall time: 3min 29s


In [7]:
# list the columns now present on the frame
df.columns

Index(['propertyLocation', 'ownersName', 'ownersMailingAddress',
       'cityStateZip', 'ownersFullMailingAddress', 'propertyFullAddress',
       'propertiesOwned', 'units'],
      dtype='object')

In [8]:
# inspect the frame with the added full-address and count columns
df

Unnamed: 0,propertyLocation,ownersName,ownersMailingAddress,cityStateZip,ownersFullMailingAddress,propertyFullAddress,propertiesOwned,units
0,677 LIBERTY AVE.,"PEDDI, PRADEEP",677 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","677 LIBERTY AVE., JERSEY CITY, N.J. 07307","677 LIBERTY AVE. Jersey City, NJ",1,1
1,675 LIBERTY AVE.,"PAREJA, HENRY A. & MIRYAM C.",675 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","675 LIBERTY AVE., JERSEY CITY, N.J. 07307","675 LIBERTY AVE. Jersey City, NJ",1,1
2,673 LIBERTY AVE.,"HIRPARA, PRAVIN",673 LIBERTY AVE.,"JERSEY CITY, NJ 07307","673 LIBERTY AVE., JERSEY CITY, NJ 07307","673 LIBERTY AVE. Jersey City, NJ",1,1
3,671 LIBERTY AVE.,"SINGH, HONEY",671 LIBERTY AVE.,"JERSEY CITY, NJ 07307","671 LIBERTY AVE., JERSEY CITY, NJ 07307","671 LIBERTY AVE. Jersey City, NJ",1,1
4,669 LIBERTY AVE.,"MARTIN, BENIGNO & CARIDAD",669 LIBERTY AVE.,"JERSEY CITY, N.J. 07307","669 LIBERTY AVE., JERSEY CITY, N.J. 07307","669 LIBERTY AVE. Jersey City, NJ",1,1
...,...,...,...,...,...,...,...,...
41799,98 GARFIELD AVE.,"SWINNEY, BRENDA",98 GARFIELD AVE.,"JERSEY CITY, N.J. 07305","98 GARFIELD AVE., JERSEY CITY, N.J. 07305","98 GARFIELD AVE. Jersey City, NJ",2,1
41800,100 GARFIELD AVE.,"SERVIDA, AUREA",100 GARFIELD AVE.,"JERSEY CITY, NJ 07305","100 GARFIELD AVE., JERSEY CITY, NJ 07305","100 GARFIELD AVE. Jersey City, NJ",1,1
41801,102 GARFIELD AVE.,"DIAZ, ANGEL & GALADITA",5405 STEVEHAVEN LN.,"CUMMING , GA. 30028","5405 STEVEHAVEN LN., CUMMING , GA. 30028","102 GARFIELD AVE. Jersey City, NJ",1,1
41802,104 GARFIELD AVE.,"RODRIGUEZ, BENJAMIN",104 GARFIELD AVE.,"JERSEY CITY, N.J. 07305","104 GARFIELD AVE., JERSEY CITY, N.J. 07305","104 GARFIELD AVE. Jersey City, NJ",1,1


In [9]:
# Throttle requests to one per second, as the public Nominatim service
# requires. RateLimiter also retries failed requests and, by default, returns
# None instead of raising, so a single timeout no longer aborts the whole run.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Several rows share the same address (e.g. multi-unit buildings), so remember
# each result and query the service only once per distinct address.
_geocode_cache = {}


def _geocode_once(address):
    """Return the geocoding result for ``address``, querying the service at most once."""
    if address not in _geocode_cache:
        _geocode_cache[address] = geocode(address)
    return _geocode_cache[address]


df['gCode'] = df.propertyFullAddress.apply(_geocode_once)

In [58]:
# positional indices of the rows whose address did not geocode (gCode is None)
geocode_errors = [i for i, e in enumerate(df.gCode) if e is None]

In [70]:
# select the failed rows by position and preview the first 50 addresses
# that did not geocode
df['propertyFullAddress'].iloc[geocode_errors].head(50)

12         653 LIBERTY ST. Jersey City, NJ
112     3709 KENNEDY BLVD. Jersey City, NJ
113     3707 KENNEDY BLVD. Jersey City, NJ
114     3705 KENNEDY BLVD. Jersey City, NJ
115     3703 KENNEDY BLVD. Jersey City, NJ
175     3697 KENNEDY BLVD. Jersey City, NJ
177     3691 KENNEDY BLVD. Jersey City, NJ
178     3687 KENNEDY BLVD. Jersey City, NJ
179     3685 KENNEDY BLVD. Jersey City, NJ
180     3683 KENNEDY BLVD. Jersey City, NJ
181     3683 KENNEDY BLVD. Jersey City, NJ
182     3683 KENNEDY BLVD. Jersey City, NJ
183     3683 KENNEDY BLVD. Jersey City, NJ
184     3683 KENNEDY BLVD. Jersey City, NJ
185     3683 KENNEDY BLVD. Jersey City, NJ
263     3675 KENNEDY BLVD. Jersey City, NJ
264     3673 KENNEDY BLVD. Jersey City, NJ
265     3671 KENNEDY BLVD. Jersey City, NJ
266     3669 KENNEDY BLVD. Jersey City, NJ
268     3663 KENNEDY BLVD. Jersey City, NJ
347     3702 KENNEDY BLVD. Jersey City, NJ
348     3704 KENNEDY BLVD. Jersey City, NJ
349     3706 KENNEDY BLVD. Jersey City, NJ
350     370

In [52]:
# Row 12 did not geocode properly because there is no Liberty Street in
# Jersey City; the neighbouring rows are all on Liberty Avenue.
df.loc[12, 'propertyFullAddress']

'653 LIBERTY ST. Jersey City, NJ'

In [56]:
# check that the same house number on Liberty Avenue does geocode
geolocator.geocode('653 LIBERTY AVE, Jersey City, NJ')

Location(653, Liberty Avenue, Jersey City, Hudson County, New Jersey, 07307, United States, (40.75896345454545, -74.05096481818181, 0.0))

In [63]:
# These rows failed because the street is officially "John F Kennedy BLVD",
# not "Kennedy BLVD".
df['propertyFullAddress'].iloc[112:115]

112    3709 KENNEDY BLVD. Jersey City, NJ
113    3707 KENNEDY BLVD. Jersey City, NJ
114    3705 KENNEDY BLVD. Jersey City, NJ
Name: propertyFullAddress, dtype: object

In [71]:
# The hyphenated address threw off the geolocator; this query uses a plain
# (non-hyphenated) house number on the same street to confirm it geocodes.
# NOTE(review): the original hyphenated address is not shown in this notebook.
geolocator.geocode('274 COLUMBIA AVE Jersey City, NJ')

Location(274, Columbia Avenue, Jersey City, Hudson County, New Jersey, 07307, United States, (40.75704381395349, -74.05096265116279, 0.0))

In [62]:
# look at row 111, the row just before the first failed Kennedy Blvd address
df.propertyFullAddress[111]

'795 SECAUCUS RD. Jersey City, NJ'

In [57]:
# preview the first 50 property addresses to compare their formatting
# (e.g. "AVE." vs "AVE", "ST." vs "AVE.")
df.propertyFullAddress.head(50)

0      677 LIBERTY AVE. Jersey City, NJ
1      675 LIBERTY AVE. Jersey City, NJ
2      673 LIBERTY AVE. Jersey City, NJ
3      671 LIBERTY AVE. Jersey City, NJ
4      669 LIBERTY AVE. Jersey City, NJ
5      667 LIBERTY AVE. Jersey City, NJ
6      665 LIBERTY AVE. Jersey City, NJ
7      663 LIBERTY AVE. Jersey City, NJ
8      661 LIBERTY AVE. Jersey City, NJ
9      659 LIBERTY AVE. Jersey City, NJ
10     657 LIBERTY AVE. Jersey City, NJ
11     655 LIBERTY AVE. Jersey City, NJ
12      653 LIBERTY ST. Jersey City, NJ
13     651 LIBERTY AVE. Jersey City, NJ
14     649 LIBERTY AVE. Jersey City, NJ
15     647 LIBERTY AVE. Jersey City, NJ
16     645 LIBERTY AVE. Jersey City, NJ
17     643 LIBERTY AVE. Jersey City, NJ
18     641 LIBERTY AVE. Jersey City, NJ
19     639 LIBERTY AVE. Jersey City, NJ
20      637 LIBERTY AVE Jersey City, NJ
21     337 COLUMBIA AVE Jersey City, NJ
22    335 COLUMBIA AVE. Jersey City, NJ
23    333 COLUMBIA AVE. Jersey City, NJ
24    331 COLUMBIA AVE. Jersey City, NJ


In [72]:
# save the enriched frame, including the geocoding results, to a pickle file
df.to_pickle('JC_residential.pkl')