In [64]:
import glob
import numpy as np
import pandas as pd
import psycopg2

from fuzzywuzzy import fuzz
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sqlalchemy import create_engine, inspect

PSQL = 'postgres://{}@localhost:5432/rpred'

files = glob.glob("data/target_matching/*.csv")

Extract target data:
* Load feature table (full address).
* Load target table (partial address).
* Create name_addr column (business name + address) for each table.
* Match partial name_addr to full name_addr with fuzzywuzzy.
* Map full name_addr to target table.
* Save to Postgres and CSV.

In [66]:
def match(full_addr, addr, threshold=70):
    return fuzz.partial_ratio(full_addr, addr) > threshold;

In [67]:
cred = ""
with open("credentials/localhost/jessica.txt") as credfile:
    cred = credfile.read().strip("\n")
    
cnx = create_engine(PSQL.format(cred), isolation_level='AUTOCOMMIT')

y = pd.read_sql_query(
    '''SELECT name, address, is_open FROM yelp_dataset_11 ORDER BY name''', cnx)
X = pd.read_sql_query(
    '''SELECT name, full_address FROM yelp_dataset_8 ORDER BY name''', cnx)

In [68]:
X['full_address'] = X['full_address'].apply(lambda s: s.replace("\n", " "))
X['name_addr'] = X['name'] + " " + X['full_address']
X.head()

Unnamed: 0,name,full_address,name_addr
0,1000 Grammes,1495 Rue Sainte-Catherine East Ville-Marie Mon...,1000 Grammes 1495 Rue Sainte-Catherine East Vi...
1,108 Chinese Take Away,108 Portobello High Street Edinburgh EH15 1AL,108 Chinese Take Away 108 Portobello High Stre...
2,10-to-10 In Delhi,67 Nicolson Street Newington Edinburgh EH8 9BZ,10-to-10 In Delhi 67 Nicolson Street Newington...
3,12 East Cafe,1153 E Jefferson St Phoenix AZ 85034,12 East Cafe 1153 E Jefferson St Phoenix AZ 8...
4,168 Market,3459 S Jones Blvd Chinatown Las Vegas NV 89146,168 Market 3459 S Jones Blvd Chinatown Las Veg...


In [69]:
y['name_addr'] = y['name'] + " " + y['address']
y.head()

Unnamed: 0,name,address,is_open,name_addr
0,00 Gelato,370 King Street W,1,00 Gelato 370 King Street W
1,0109 Dessert & Chocolate,2190 McNicoll Avenue Unit 109,1,0109 Dessert & Chocolate 2190 McNicoll Avenue ...
2,1000 Grammes,1495 Rue Sainte-Catherine East,1,1000 Grammes 1495 Rue Sainte-Catherine East
3,101 Bottles of Beer On the Wall,115 N Willow St,1,101 Bottles of Beer On the Wall 115 N Willow St
4,108 Chinese Take Away,108 Portobello High Street,1,108 Chinese Take Away 108 Portobello High Street


In [74]:
full_dict = X.set_index('name_addr').to_dict(orient='dict')['full_address']
partial_dict = y.set_index('name_addr').to_dict(orient='dict')['address']
print("full:", list(full_dict.items())[0])
print("partial:", list(partial_dict.items())[0])

full: ('1000 Grammes 1495 Rue Sainte-Catherine East Ville-Marie Montréal  QC H2L 2H9', '1495 Rue Sainte-Catherine East Ville-Marie Montréal  QC H2L 2H9')
partial: ('00 Gelato 370 King Street W', '370 King Street W')


In [76]:
matched = {}
partial_keys = sorted(list(partial.keys()))
full_keys = sorted(list(full.keys()))

partial = partial_keys.pop()
full = full_keys.pop()
# Iterate until run out of addresses to match.
while partial_keys and full_keys:
    if match(full, partial, 90):
        matched[partial] = full
        partial = partial_keys.pop()
        full = full_keys.pop()
    elif partial < full:
        full = full_keys.pop()
    else:
        partial = partial_keys.pop()
print(len(matched), "matched")
print(len(full_dict), "full addresses remaining")
print(len(partial_dict), "partial addresses remaining")

12806 matched
15341 full addresses remaining
33967 partial addresses remaining


In [85]:
str1 = "\n".join(["{},{}".format(i[0], i[1]) for i in matched.items()])
with open("data/target_matching/address_map.csv", "w") as file:
    file.write(str1)

In [81]:
y['full_name_addr'] = y['name_addr'].map(matched)
y.dropna(inplace=True)
y.head()

Unnamed: 0,name,address,is_open,name_addr,full_name_addr
2,1000 Grammes,1495 Rue Sainte-Catherine East,1,1000 Grammes 1495 Rue Sainte-Catherine East,1000 Grammes 1495 Rue Sainte-Catherine East Vi...
4,108 Chinese Take Away,108 Portobello High Street,1,108 Chinese Take Away 108 Portobello High Street,108 Chinese Take Away 108 Portobello High Stre...
8,12 East Cafe,1153 E Jefferson St,0,12 East Cafe 1153 E Jefferson St,12 East Cafe 1153 E Jefferson St Phoenix AZ 8...
11,168 Market,3459 S Jones Blvd,1,168 Market 3459 S Jones Blvd,168 Market 3459 S Jones Blvd Chinatown Las Veg...
12,17th Street Cafe,75 S 17th St,0,17th Street Cafe 75 S 17th St,17th Street Cafe 75 S 17th St South Side Pitts...


In [87]:
y_final = pd.merge(X, y, how='inner', left_on='name_addr', right_on='full_name_addr')
y_final.drop(columns=['name_x', 'name_addr_y', 'name_addr_x', 'address', 'full_name_addr'], inplace=True)
y_final.rename(columns={"name_y":"name"}, inplace=True)
y_final.head()

Unnamed: 0,full_address,name,is_open
0,1495 Rue Sainte-Catherine East Ville-Marie Mon...,1000 Grammes,1
1,108 Portobello High Street Edinburgh EH15 1AL,108 Chinese Take Away,1
2,1153 E Jefferson St Phoenix AZ 85034,12 East Cafe,0
3,3459 S Jones Blvd Chinatown Las Vegas NV 89146,168 Market,1
4,75 S 17th St South Side Pittsburgh PA 15203,17th Street Cafe,0


In [88]:
y_final.to_sql('target', conn, if_exists='replace', index=False)

NameError: name 'conn' is not defined