Match target data to features and save merged table to SQL.
* Load feature table (full address).
* Load target table (partial address).
* Create name_addr column (business name + address) for each table.
* Match partial name_addr to full name_addr with fuzzywuzzy.
* Map full name_addr to target table.
* Save address mapping table to SQL.

In [2]:
import numpy as np
import pandas as pd
import psycopg2

from fuzzywuzzy import fuzz
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sqlalchemy import create_engine, inspect

PSQL = 'postgres://{}@localhost:5432/rpred'

In [3]:
def match(full_addr, addr, threshold=70):
    """Determine if addresses probably match, i.e. 123 Spring St --> 123 Spring Street Seattle WA 98108"""
    return fuzz.partial_ratio(full_addr, addr) > threshold;

In [4]:
cred = ""
with open("../credentials/localhost/jessica.txt") as credfile:
    cred = credfile.read().strip("\n")
    
cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')

# Load target data
y = pd.read_sql_query(
    '''SELECT name, address, is_open FROM yelp_11 ORDER BY name''', cnx)

# Load feature data
X = pd.read_sql_query(
    '''SELECT name, full_address FROM yelp_8 ORDER BY name''', cnx)

assert len(X) > 100, "features table suspiciously small..."
assert len(y) > 100, "target table suspiciously small..."

In [5]:
# Strip newline from full address.
X['full_address'] = X['full_address'].apply(lambda s: s.replace("\n", " "))

# Create Dataframe columns for name+address keys
X['name_addr'] = X['name'] + " " + X['full_address']
y['name_addr'] = y['name'] + " " + y['address']

# Make dicts mapping keys to addresses. 
full_dict = X.set_index('name_addr').to_dict(orient='dict')['full_address']
partial_dict = y.set_index('name_addr').to_dict(orient='dict')['address']
print("full:", list(full_dict.items())[0])
print("partial:", list(partial_dict.items())[0])

full: ('100℃ 5600 Spring Mountain Rd Ste B Chinatown Las Vegas, NV 89146', '5600 Spring Mountain Rd Ste B Chinatown Las Vegas, NV 89146')
partial: ('00 Gelato 370 King Street W', '370 King Street W')


In [9]:
matched = {}
partial_keys = sorted(list(partial_dict.keys()))
full_keys = sorted(list(full_dict.keys()))

partial = partial_keys.pop()
full = full_keys.pop()
# Iterate until run out of addresses to match.
while partial_keys and full_keys:
    if match(full, partial, 90):
        matched[partial] = full
        partial = partial_keys.pop()
        full = full_keys.pop()
    elif partial < full:
        full = full_keys.pop()
    else:
        partial = partial_keys.pop()
print(len(matched), "matched")
print(len(full_dict), "full addresses remaining")
print(len(partial_dict), "partial addresses remaining")

27091 matched
32934 full addresses remaining
65683 partial addresses remaining


In [7]:
str1 = "\n".join(["{},{}".format(i[0], i[1]) for i in matched.items()])
with open("data/target_matching/address_map.csv", "w") as file:
    file.write(str1)

FileNotFoundError: [Errno 2] No such file or directory: 'data/target_matching/address_map.csv'

In [10]:
y['full_name_addr'] = y['name_addr'].map(matched)
y.dropna(inplace=True)
y.head()

Unnamed: 0,name,address,is_open,name_addr,full_name_addr
2,100℃,"5600 Spring Mountain Rd, Ste B",0,"100℃ 5600 Spring Mountain Rd, Ste B",100℃ 5600 Spring Mountain Rd Ste B Chinatown L...
5,1000 Degrees Neapolitan Pizzeria,7000 E Mayo Blvd,1,1000 Degrees Neapolitan Pizzeria 7000 E Mayo Blvd,1000 Degrees Neapolitan Pizzeria 7000 E Mayo B...
6,1000 Grammes,1495 Rue Sainte-Catherine East,1,1000 Grammes 1495 Rue Sainte-Catherine East,1000 Grammes 1495 Rue Sainte-Catherine East Vi...
10,100% Natural Mexican Grill,7455 Eastern Ave,0,100% Natural Mexican Grill 7455 Eastern Ave,100% Natural Mexican Grill 7455 Eastern Ave So...
13,101 Asian Buffet,20440 N 27th Ave,0,101 Asian Buffet 20440 N 27th Ave,"101 Asian Buffet 20440 N 27th Ave Phoenix, AZ ..."


In [11]:
columns = ['full_address', 'name_y', 'is_open']
y_final = pd.merge(X, y, how='inner', left_on='name_addr', right_on='full_name_addr')[columns]
y_final.rename(columns={"name_y":"name"}, inplace=True)
y_final.head()

Unnamed: 0,full_address,name,is_open
0,5600 Spring Mountain Rd Ste B Chinatown Las Ve...,100℃,0
1,"7000 E Mayo Blvd Phoenix, AZ 85054",1000 Degrees Neapolitan Pizzeria,1
2,1495 Rue Sainte-Catherine East Ville-Marie Mon...,1000 Grammes,1
3,"7455 Eastern Ave Southeast Las Vegas, NV 89123",100% Natural Mexican Grill,0
4,"20440 N 27th Ave Phoenix, AZ 85027",101 Asian Buffet,0


In [12]:
y_final.to_sql('address_map', cnx, if_exists='replace', index=False)