In [28]:
import pandas as pd
import geopandas as gpd
from fuzzywuzzy import fuzz, process
import numpy as np
from geopy.geocoders import Nominatim
import re

In [29]:
#reading in master data set, splitting it into residences and people
herve = pd.read_csv('../data/HERVE PLUS.csv', dtype={'PropZip':'object'})
for col in herve.columns:
    herve[col] = herve[col].replace('�',np.nan)
    herve[col] = herve[col].replace('	','')

residences = pd.DataFrame()
residences['address'] = herve['PropHouse'].str.strip().str.upper()+ ' ' +herve['PropStreet'].str.strip().str.upper()
residences['zip'] = herve['PropZip'].str.strip()

people = pd.DataFrame()
names = herve['Resident First'].str.strip().str.upper() + ' ' + herve['Resident Last'].str.strip().str.upper()
names = names.dropna().unique().tolist()
people['name'] = names

In [30]:
#using fuzzywuzzy process matching to find duplicates based on minimum-edit distance, then dropping these extras
matches = []

for name in people['name'].tolist():
    match = process.extract(name, people['name'].tolist())
    
    for each in match:
        if each[1] >= 90 and each[1] < 100:
            matches.append(each[0])
        else:
            continue

duplicates = [matches[i] for i in range(len(matches)) if i%2 == 1] 
people = people.query("name not in @duplicates")

In [4]:

residences.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1708 entries, 0 to 1707
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   address  1707 non-null   object
 1   zip      1245 non-null   object
dtypes: object(2)
memory usage: 26.8+ KB


In [5]:
#using nominatum to find missing zipcodes

residences['zip'].fillna('0',inplace=True)
residences = residences.drop_duplicates().dropna()
residences.loc[residences['zip']=='x','zip'] = '0'
residences['zip'] = residences['zip'].astype('int64')
geolocator = Nominatim(user_agent="capstone")

for ind, row in residences[residences['zip']==0].iterrows():
    partial_address = row['address']
    try:
        geo = geolocator.geocode(partial_address + ', Nashville')
        full_address = geo.address
        zipcode = re.search('(?<=, )[0-9]{5}', full_address)[0]
        residences.loc[residences['address']==row['address'], 'zip'] = zipcode
        residences.loc[residences['address']==row['address'], 'lat'] = geo.latitude
        residences.loc[residences['address']==row['address'], 'lon'] = geo.longitude
        
    except:
        continue
    

        
    

In [6]:

residences.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1679 entries, 0 to 1707
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   address  1679 non-null   object 
 1   zip      1679 non-null   object 
 2   lat      345 non-null    float64
 3   lon      345 non-null    float64
dtypes: float64(2), object(2)
memory usage: 65.6+ KB


In [7]:
residences['zip'].value_counts()

37211    443
37214    164
37211    133
0        107
37204    105
        ... 
37143      1
37208      1
54541      1
38475      1
37115      1
Name: zip, Length: 65, dtype: int64

In [8]:
residences.to_csv('../data/residences.csv')

In [2]:
res = pd.read_csv('../data/residences.csv', index_col = 0)

In [3]:
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1679 entries, 0 to 1707
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   address  1679 non-null   object 
 1   zip      1679 non-null   int64  
 2   lat      345 non-null    float64
 3   lon      345 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.6+ KB


In [None]:
#using nominatim to find coordinates

geolocator = Nominatim(user_agent="capstone")

for ind, row in res[res['lat'].isna()].iterrows():
    try:
        geo = geolocator.geocode(row['address'] + ', ' + str(row['zip']))
        res.loc[res['address']==row['address'], 'lat'] = geo.latitude
        res.loc[res['address']==row['address'], 'lon'] = geo.longitude
    except:
        continue

In [5]:
#still missing ~300 coordinates
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1679 entries, 0 to 1707
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   address  1679 non-null   object 
 1   zip      1679 non-null   int64  
 2   lat      1383 non-null   float64
 3   lon      1383 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.6+ KB


In [6]:
res.to_csv('../data/residences.csv')

In [7]:
res = pd.read_csv('../data/residences.csv')

In [8]:
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1674 entries, 0 to 1673
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1674 non-null   int64  
 1   address     1674 non-null   object 
 2   zip         1674 non-null   int64  
 3   lat         1417 non-null   float64
 4   lon         1417 non-null   float64
dtypes: float64(2), int64(2), object(1)
memory usage: 65.5+ KB


In [10]:
res = res.drop(columns = 'Unnamed: 0')

In [11]:
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1674 entries, 0 to 1673
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   address  1674 non-null   object 
 1   zip      1674 non-null   int64  
 2   lat      1417 non-null   float64
 3   lon      1417 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 52.4+ KB


In [20]:
keys = pd.read_csv('../data/API.txt')

In [21]:
import googlemaps

gmap_key = keys.loc[keys['API']=='Google Maps', 'Key'].values[0]
gmaps = googlemaps.Client(key=gmap_key)

In [24]:
#using google maps API to find remaining coordinates
for ind, row in res[res['lat'].isna()].iterrows():
    address = row['address'] + ', ' + str(row['zip'])
    geo = gmaps.geocode(address)
    lat = geo[0]['geometry']['location']['lat']
    lon = geo[0]['geometry']['location']['lng']
    res.loc[res['address']==row['address'], 'lat'] = lat
    res.loc[res['address']==row['address'], 'lon'] = lon

In [25]:
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1674 entries, 0 to 1673
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   address  1674 non-null   object 
 1   zip      1674 non-null   int64  
 2   lat      1674 non-null   float64
 3   lon      1674 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 52.4+ KB


In [26]:
res.to_csv('../data/residences.csv')

In [31]:
people.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 537 entries, 0 to 538
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    537 non-null    object
dtypes: object(1)
memory usage: 8.4+ KB


In [32]:
people.to_csv('../data/people.csv')