## Weighted Rank Model v2
#### - Input data is Tier 1 matching outputps
- gov_esri_for_tier2.csv
- gov_yelp_for_tier2.csv
- yelp_esri_for_tier2.csv

### Load libs

In [1]:
import pandas as pd
import numpy as np
import logging
import csv
from tqdm.notebook import tqdm
from geopy.distance import great_circle
import Levenshtein as lev
from metaphone import doublemetaphone

import textdistance

pd.set_option('display.width', 800)

---
# 1. Execute Weighted Rank Model to get One-Directional Mathcing results for { Base->Target }
1. The model return non-symmetric matching 
1. Results are only for Base->Target
1. E.g.  Yelp->Gov is not equal to Gov->Yelp

### 1.a. Define Functions for Matching

In [2]:
### Filter results by ditance from base business (r in meters)
def filter_byDist(row, lat_P0, lon_P0, LAT_name, LON_name, r):
    P0 = (lat_P0, lon_P0)
    Pi = (row[LAT_name],row[LON_name])    
    if great_circle(P0, Pi).meters < r:
        return True
    else:
        return False

    
### Function to calculate the levenshtein distance of doublemetaphone between business
def get_DM_DIST(name_P0, name_Pi):
    Code_P0 = doublemetaphone(name_P0)
    Code_Pi = doublemetaphone(name_Pi)
    DM_dist = 0
    for i in range(len(Code_P0)):
        for j in range(len(Code_Pi)):
            if (len(Code_P0[i]) > 0) and (len(Code_Pi[j]) > 0):
                DM_dist += textdistance.levenshtein.distance(Code_P0[i],Code_Pi[j])
    return DM_dist


### Weighted rank model to get ranked results of input business 
def matching_weighted_3_methods (df_base, feature_index, df_target, search_radius,
                         column_LAT_base, column_LON_base, column_BizName_base,
                         column_LAT_target, column_LON_target, column_BizName_target):    
    #### Calculate DIST & RANKING
    lat_P0 = df_base.iloc[feature_index][column_LAT_base]
    lon_P0 = df_base.iloc[feature_index][column_LON_base]
    df_match = df_target.copy()[df_target.apply(filter_byDist, args = (lat_P0, lon_P0, column_LAT_target, column_LON_target, search_radius), axis=1)]
    
    if len(df_match) > 0:
        df_match['DIST'] = df_match.copy().apply(lambda x: round(great_circle((lat_P0,lon_P0),(x[column_LAT_target],x[column_LON_target])).meters,2), axis = 1)
        df_match['DIST_rank'] = df_match.copy()['DIST'].rank(ascending=True, method='min')
        #### Calculate LEV & RANKING
        Name_P0 = df_base.iloc[feature_index][column_BizName_base]
        df_match['LEV'] = df_match.copy().apply(lambda x: lev.distance(Name_P0,  x[column_BizName_target]), axis=1)
        df_match['LEV_rank'] = df_match.copy()['LEV'].rank(ascending=True, method='min')
        #### Calculate DM & RANKING
        df_match['DM'] = df_match.copy().apply(lambda x: get_DM_DIST(Name_P0, x[column_BizName_target]), axis=1)
        df_match['DM_rank'] = df_match.copy()['DM'].rank(ascending=True, method='min')
        #### Calculate Weighted-Model & Ranking
        # < Model = 0.680 Lev + 0.112 DM+ 0.208 Dist >
        df_match['Model_score'] = df_match.copy().apply(lambda x: (0.680*x['LEV_rank']) + (0.112*x['DM_rank']) + (0.208*x['DIST_rank']), axis=1)
        df_match['Model_rank'] = df_match.copy()['Model_score'].rank(ascending=True, method='min')
    return df_match

### 1.b. Read and review tier-1 matching results

In [23]:
### Read tier 1 matching results
df_tier1_results_gov = pd.read_csv("gov_for_tier2_0512.csv")
df_tier1_results_esri = pd.read_csv("esri_for_tier2_0512.csv")
df_tier1_results_yelp = pd.read_csv("yelp_for_tier2_0512.csv")

In [24]:
df_tier1_results_gov.info()
df_tier1_results_esri.info()
df_tier1_results_yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4650 entries, 0 to 4649
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   G_RECORDID  4650 non-null   object 
 1   G_NAME      4650 non-null   object 
 2   G_LAT       4650 non-null   float64
 3   G_LONG      4650 non-null   float64
dtypes: float64(2), object(2)
memory usage: 145.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5966 entries, 0 to 5965
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   E_RECORDID  5966 non-null   int64  
 1   E_NAME      5966 non-null   object 
 2   E_LAT       5966 non-null   float64
 3   E_LONG      5966 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 186.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7288 entries, 0 to 7287
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------

In [25]:
# ### Drop some unused columns
# df_tier1_results_gov.drop(df_tier1_results_gov.iloc[:, 0:1], inplace=True, axis=1)
# df_tier1_results_esri.drop(df_tier1_results_esri.iloc[:, 0:1], inplace=True, axis=1)
# df_tier1_results_yelp.drop(df_tier1_results_yelp.iloc[:, 0:1], inplace=True, axis=1)

In [26]:
### Check input business sizes
print (df_tier1_results_gov.shape)
print (df_tier1_results_esri.shape)
print (df_tier1_results_yelp.shape)

(4650, 4)
(5966, 4)
(7288, 4)


In [27]:
### Set search distance from the base business
search_radius = 500     ### Change Me !!

### Set base and target dataset name 
# -- 'G' for Gov dataset
# -- 'E' for Esri dataset
# -- 'Y' for Yelp
base_dataset = 'Y'      ### Change Me !!
target_dataset = 'G'    ### Change Me !!
target_dataset_2 = 'E'  ### Change Me !!

### Define column names according to the base & target datasets
column_ID_base = base_dataset + '_RECORDID'
column_BizName_base = base_dataset + '_NAME'
column_LAT_base = base_dataset + '_LAT'
column_LON_base = base_dataset + '_LONG'

column_ID_target = target_dataset + '_RECORDID'
column_BizName_target = target_dataset + '_NAME'
column_LAT_target = target_dataset + '_LAT'
column_LON_target =  target_dataset + '_LONG'

column_ID_target_2 = target_dataset_2 + '_RECORDID'
column_BizName_target_2 = target_dataset_2 + '_NAME'
column_LAT_target_2 = target_dataset_2 + '_LAT'
column_LON_target_2 =  target_dataset_2 + '_LONG'

In [28]:
### Set datasets 
if base_dataset == 'G':
    df_tier1_results = df_tier1_results_gov
    df_target_dataset = df_tier1_results_esri
    df_target_dataset_2 = df_tier1_results_yelp
elif base_dataset == 'E':
    df_tier1_results = df_tier1_results_esri
    df_target_dataset = df_tier1_results_gov
    df_target_dataset_2 = df_tier1_results_yelp
elif base_dataset == 'Y':
    df_tier1_results = df_tier1_results_yelp
    df_target_dataset = df_tier1_results_gov
    df_target_dataset_2 = df_tier1_results_esri

### Get a list of unique business from base dataset
base_ID_list = pd.unique(df_tier1_results[column_ID_base]).tolist()
print (f'Total of ({len(base_ID_list)}) unique businesses from the base dataset ({base_dataset}) to match.')

Total of (7288) unique businesses from the base dataset (Y) to match.


In [29]:
### Get base and target sizes
print (f'Base:{base_dataset}/{len(df_tier1_results)} --- Target:   {target_dataset}/{len(df_target_dataset)}')
print (f'Base:{base_dataset}/{len(df_tier1_results)} --- Target_2: {target_dataset_2}/{len(df_target_dataset_2)}')
print ('--'*20)

### Check if any datasets (base, target, target_2) has NaN rows
print(len(df_tier1_results.loc[df_tier1_results[column_ID_base].isna()]))
print(len(df_target_dataset.loc[df_target_dataset[column_ID_target].isna()]))
print(len(df_target_dataset_2.loc[df_target_dataset_2[column_ID_target_2].isna()]))

Base:Y/7288 --- Target:   G/4650
Base:Y/7288 --- Target_2: E/5966
----------------------------------------
0
0
0


### 1.c. Get Ranked Results from Model

In [30]:
frames_matched_target   = []
frames_matched_target_2 = []

# for biz in tqdm(base_ID_list[:2]):
for biz in tqdm(base_ID_list):
    ### Get a subset of the tier 1 matched results 
    business_to_rank = df_tier1_results.copy().loc[df_tier1_results[column_ID_base] == biz]
    
    ######################################
    ##### ---- Base to Target ---- #######
    ######################################
    ## Get highest ranked business as tier 2 reults  (base->target)
    results_with_target = matching_weighted_3_methods (business_to_rank, 0, df_target_dataset, search_radius,
                                                       column_LAT_base, column_LON_base, column_BizName_base,
                                                       column_LAT_target, column_LON_target, column_BizName_target)
    
    ### Get highest rank results (target)
    if (results_with_target is not None) & (len(results_with_target) > 0):
        top_rank = results_with_target[results_with_target.Model_rank == 1]
        
        if len(top_rank)>1:
            ### Drop results with the same model ranking (Model_rank == 1), based on "target", keep record(s) by the lowest values of the following order:
            # 1. weighted rank model score ('Model_score')
            # 2. levenshtein distance between the business names ('LEV')
            # 3. distance between business locations ('DIST')
            # 4. levenshtein distance between the double metaphone of the business names ('DM')            
            # 5. Smallest business ID            
            top_rank = top_rank.sort_values(by = ['Model_score', 'LEV','DIST','DM',column_ID_target]).head(1)
#             top_rank = top_rank.loc[top_rank.sort_values(['Model_score', 'LEV','DIST','DM',column_ID_target]).groupby(['Model_rank'])[column_ID_target].idxmin()]
        
        temp_output = pd.concat([business_to_rank.reset_index(drop=True), top_rank.reset_index(drop=True)], axis=1)
        frames_matched_target.append(temp_output)
    
    ######################################
    ##### ---- Base to Target_2 ---- #####
    ######################################
    
    ## Get highest ranked business as tier 2 reults  (base->target_2)
    results_with_target_2 = matching_weighted_3_methods (business_to_rank, 0, df_target_dataset_2, search_radius,
                                                       column_LAT_base, column_LON_base, column_BizName_base,
                                                       column_LAT_target_2, column_LON_target_2, column_BizName_target_2)
    ### Get highest rank results (target_2)
    if (results_with_target_2 is not None) & (len(results_with_target_2) > 0):
        top_rank_2 = results_with_target_2[results_with_target_2.Model_rank == 1]
        
        if len(top_rank_2)>1:
            ### Drop results with the same model ranking (Model_rank == 1), based on "target", keep record(s) by the lowest values of the following order:
            # 1. weighted rank model score ('Model_score')
            # 2. levenshtein distance between the business names ('LEV')
            # 3. distance between business locations ('DIST')
            # 4. levenshtein distance between the double metaphone of the business names ('DM')            
            # 5. Smallest business ID            
            top_rank_2 = top_rank_2.sort_values(by = ['Model_score', 'LEV','DIST','DM',column_ID_target_2]).head(1)
#             top_rank_2 = top_rank_2.loc[top_rank_2.sort_values(['Model_score', 'LEV','DIST','DM',column_ID_target_2]).groupby(['Model_rank'])[column_ID_target_2].idxmin()]
        
        temp_output_2 = pd.concat([business_to_rank.reset_index(drop=True), top_rank_2.reset_index(drop=True)], axis=1)
        frames_matched_target_2.append(temp_output_2)

  0%|          | 0/7288 [00:00<?, ?it/s]

In [None]:
# match_results_base_target   = pd.concat(frames_matched_target)
# match_results_base_target.head()

In [31]:
match_results_base_target   = pd.concat(frames_matched_target)

print ('Base --> Target')
print (len(match_results_base_target))
print (match_results_base_target[column_ID_base].nunique())

match_results_base_target_2 = pd.concat(frames_matched_target_2)

print('--'*20)

print ('Base --> Target_2') 
print (len(match_results_base_target_2))
print (match_results_base_target_2[column_ID_base].nunique())

Base --> Target
6514
6514
----------------------------------------
Base --> Target_2
6767
6767


In [32]:
### Get Matched Business IDs

### Write matching output (Business IDs only) to file - Target 1
outputFileName = f'Tier2_WeightedRankModel_v4_{base_dataset}_to_{target_dataset}.csv'
match_results_base_target.to_csv(outputFileName, index=False)

### Write matching output (Business IDs only) to file - Target 2
outputFileName = f'Tier2_WeightedRankModel_v4_{base_dataset}_to_{target_dataset_2}.csv'
match_results_base_target_2.to_csv(outputFileName, index=False)

---
# 2. Getting best-available 3-way match from Weighted Rank Model outcomes 

### 2.a. Load single-directional results from weighted rank model

In [33]:
cols_GE = ['G_NAME','G_RECORDID','E_NAME','E_RECORDID']
cols_EG = ['E_NAME','E_RECORDID','G_NAME','G_RECORDID']
cols_GY = ['G_NAME','G_RECORDID','Y_NAME','Y_RECORDID']
cols_YG = ['Y_NAME','Y_RECORDID','G_NAME','G_RECORDID']
cols_EY = ['E_NAME','E_RECORDID','Y_NAME','Y_RECORDID']
cols_YE = ['Y_NAME','Y_RECORDID','E_NAME','E_RECORDID']

gov_esri_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_G_to_E.csv')
esri_gov_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_E_to_G.csv')

gov_yelp_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_G_to_Y.csv')
yelp_gov_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_Y_to_G.csv')

esri_yelp_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_E_to_Y.csv')
yelp_esri_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_Y_to_E.csv')

In [9]:
gov_esri_match_final

Unnamed: 0,G_RECORDID,G_NAME,G_LAT,G_LONG,E_RECORDID,E_NAME,E_LAT,E_LONG,DIST,DIST_rank,LEV,LEV_rank,DM,DM_rank,Model_score,Model_rank
0,DEH2014-FFPP-004032,wetzels pretzels,32.544055,-117.040592,619417843,wetzel's pretzels,32.5454,-117.0405,149.75,2.0,1,1.0,6,1.0,1.208,1.0
1,DEH2017-FFPN-001194,dollar tree,32.550069,-117.035942,722351465,food shop,32.5518,-117.0394,376.95,6.0,9,1.0,5,1.0,2.040,1.0
2,DEH2014-FFPP-003695,panda express,32.544620,-117.045357,657532354,mongus grill,32.5454,-117.0405,463.49,3.0,12,1.0,4,1.0,1.416,1.0
3,DEH2016-FFMP-001030,la crepe,32.546405,-117.041865,707802271,tokyosan,32.5454,-117.0405,169.85,2.0,8,3.0,3,1.0,2.568,1.0
4,DEH2013-FFPN-000457,snack stop 2,32.545093,-117.042201,210751962,rice garden,32.5454,-117.0405,163.01,2.0,10,2.0,10,2.0,2.000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4249,DEH2005-FFPP-415323,alpine fitness,32.838306,-116.780027,719121922,phillips 66,32.8381,-116.7787,126.04,2.0,12,2.0,5,3.0,2.112,1.0
4250,DEH2008-FFPP-431365,panda machi chinese & japanese cuisine,32.838382,-116.777007,495705014,mananas no 1 mexican food,32.8384,-116.7770,2.08,1.0,28,1.0,22,2.0,1.112,1.0
4251,DEH2014-FFPP-003381,francos flapjack breakfast house,32.838382,-116.777007,495705014,mananas no 1 mexican food,32.8384,-116.7770,2.08,1.0,24,1.0,12,2.0,1.112,1.0
4252,DEH2016-FFPP-006945,ayres lodge alpine,32.838677,-116.777946,416044582,greek village grill,32.8384,-116.7770,93.59,1.0,13,1.0,13,9.0,1.896,1.0


In [10]:
esri_gov_match_final

Unnamed: 0,E_RECORDID,E_NAME,E_LAT,E_LONG,G_RECORDID,G_NAME,G_LAT,G_LONG,DIST,DIST_rank,LEV,LEV_rank,DM,DM_rank,Model_score,Model_rank
0,210751962,rice garden,32.5454,-117.0405,DEH2014-FFPP-003704,godiva,32.544055,-117.040592,149.75,3.0,10,2.0,4,1.0,2.096,1.0
1,400287952,duty free americas,32.5439,-117.0374,DEH2013-FFPN-000298,dollar tree 5182,32.544465,-117.036618,96.57,2.0,14,1.0,5,1.0,1.208,1.0
2,400642806,cvs/pharmacy,32.5499,-117.0371,DEH2017-FFPN-001194,dollar tree,32.550069,-117.035942,110.17,2.0,11,2.0,7,2.0,2.000,1.0
3,401770221,subway,32.5458,-117.0398,DEH2014-FFPP-003704,godiva,32.544055,-117.040592,207.70,7.0,6,2.0,3,2.0,3.040,1.0
4,402942621,villa fresh italian kitchen,32.5454,-117.0405,DEH2003-FFPN-408283,marshalls,32.545783,-117.038106,228.41,9.0,20,2.0,8,1.0,3.344,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5464,172795940,gourmet outfitters,33.2565,-116.3807,DEH2002-FFPP-390807,kendalls cafe,33.257439,-116.381115,111.28,1.0,16,1.0,7,1.0,1.000,1.0
5465,245860176,red ocotilo,33.2685,-116.4006,DEH2002-FFPP-391359,the palms at indian head,33.268540,-116.400575,4.97,1.0,21,1.0,16,1.0,1.000,1.0
5466,503139107,carmelita's mexican grill,33.2565,-116.3796,DEH2007-FFPP-424498,carmelitas mexican grill & cantina,33.255550,-116.379450,106.60,1.0,11,1.0,5,1.0,1.000,1.0
5467,589274091,krazy coyote saloon & grille,33.2685,-116.4006,DEH2002-FFPP-391359,the palms at indian head,33.268540,-116.400575,4.97,1.0,24,1.0,41,1.0,1.000,1.0


In [8]:
print (f"Size of G->E weighted model outcome:  {gov_esri_match_final.shape[0]}")
print (f"Unique G in G->E match: {gov_esri_match_final.G_RECORDID.unique().shape[0]}")
print (f"Unique E in G->E match: {gov_esri_match_final.E_RECORDID.unique().shape[0]}")
print ("~~")
print (f"Size of E->G weighted model outcome:  {esri_gov_match_final.shape[0]}")
print (f"Unique E in E->G match: {esri_gov_match_final.E_RECORDID.unique().shape[0]}")
print (f"Unique G in E->G match: {esri_gov_match_final.G_RECORDID.unique().shape[0]}")

print ('-'*50)

print (f"Size of G->Y weighted model outcome:  {gov_yelp_match_final.shape[0]}") 
print (f"Unique G in G->Y match: {gov_yelp_match_final.G_RECORDID.unique().shape[0]}")
print (f"Unique Y in G->Y match: {gov_yelp_match_final.Y_RECORDID.unique().shape[0]}")
print ("~~")
print (f"Size of Y->G weighted model outcome:  {yelp_gov_match_final.shape[0]}") 
print (f"Unique Y in Y->G match: {yelp_gov_match_final.Y_RECORDID.unique().shape[0]}")
print (f"Unique G in Y->G match: {yelp_gov_match_final.G_RECORDID.unique().shape[0]}")

print ('-'*50)

print (f"Size of E->Y weighted model outcome:  {esri_yelp_match_final.shape[0]}") 
print (f"Unique E in E->Y match: {esri_yelp_match_final.E_RECORDID.unique().shape[0]}")
print (f"Unique Y in E->Y match: {esri_yelp_match_final.Y_RECORDID.unique().shape[0]}")
print ("~~")
print (f"Size of Y->E weighted model outcome:  {yelp_esri_match_final.shape[0]}") 
print (f"Unique Y in Y->G match: {yelp_esri_match_final.Y_RECORDID.unique().shape[0]}")
print (f"Unique E in Y->G match: {yelp_esri_match_final.E_RECORDID.unique().shape[0]}")

Size of G->E weighted model outcome:  4254
Unique G in G->E match: 4254
Unique E in G->E match: 2542
~~
Size of E->G weighted model outcome:  5469
Unique E in E->G match: 5469
Unique G in E->G match: 2361
--------------------------------------------------
Size of G->Y weighted model outcome:  4382
Unique G in G->Y match: 4382
Unique Y in G->Y match: 2705
~~
Size of Y->G weighted model outcome:  6514
Unique Y in Y->G match: 6514
Unique G in Y->G match: 2647
--------------------------------------------------
Size of E->Y weighted model outcome:  5643
Unique E in E->Y match: 5643
Unique Y in E->Y match: 3128
~~
Size of Y->E weighted model outcome:  6767
Unique Y in Y->G match: 6767
Unique E in Y->G match: 3346


---
#### optional: briefly check results

In [None]:
#### 
gov_esri_match_final.loc[gov_esri_match_final.duplicated('E_RECORDID',keep=False)].sort_values('E_RECORDID').head(10)

In [None]:
esri_gov_match_final.loc[esri_gov_match_final.E_RECORDID == 172795940]

In [None]:
gov_esri_match_final.loc[gov_esri_match_final.G_RECORDID == 'DEH2007-FFPP-423835'][cols_GE].sort_values('E_RECORDID')

In [None]:
esri_gov_match_final.loc[esri_gov_match_final.E_RECORDID == 172795940][cols_GE].sort_values('G_RECORDID')

### 2.b. Merging bi-directional combinations ( G->E and E->G) to get the TRUE 2-way match (G<->E)

In [34]:
print ('-'*15, ' G<->E ', '-'*15)

# Get true G<->E match:  G-E  +  E-G
G_E_both_ways = gov_esri_match_final[cols_GE].merge(esri_gov_match_final[cols_EG],on=cols_GE, how='inner')
print (G_E_both_ways.shape[0])
print (len(G_E_both_ways.G_RECORDID.unique()))
print (len(G_E_both_ways.E_RECORDID.unique()))

print ('-'*15, ' G<->Y ', '-'*15)

# Get true G<->Y match:  G-Y  +  Y-G
G_Y_both_ways = gov_yelp_match_final[cols_GY].merge(yelp_gov_match_final[cols_YG],on=cols_GY, how='inner')
print (G_Y_both_ways.shape[0])
print (len(G_Y_both_ways.G_RECORDID.unique()))
print (len(G_Y_both_ways.Y_RECORDID.unique()))

print ('-'*15, ' E<->Y ', '-'*15)

# Get true E<->Y match:  E-Y  +  Y-E
E_Y_both_ways = esri_yelp_match_final[cols_EY].merge(yelp_esri_match_final[cols_YE],on=cols_EY, how='inner')
print (E_Y_both_ways.shape[0])
print (len(E_Y_both_ways.E_RECORDID.unique()))
print (len(E_Y_both_ways.Y_RECORDID.unique()))

---------------  G<->E  ---------------
1418
1418
1418
---------------  G<->Y  ---------------
1616
1616
1616
---------------  E<->Y  ---------------
1810
1810
1810


---
#### optional: briefly check results

In [17]:
gov_esri_match_final.loc[gov_esri_match_final.G_RECORDID == 'DEH2008-FFPP-431365']

Unnamed: 0,G_RECORDID,G_NAME,G_LAT,G_LONG,E_RECORDID,E_NAME,E_LAT,E_LONG,DIST,DIST_rank,LEV,LEV_rank,DM,DM_rank,Model_score,Model_rank
4250,DEH2008-FFPP-431365,panda machi chinese & japanese cuisine,32.838382,-116.777007,495705014,mananas no 1 mexican food,32.8384,-116.777,2.08,1.0,28,1.0,22,2.0,1.112,1.0


In [19]:
esri_gov_match_final.loc[esri_gov_match_final.E_RECORDID == 495705014]

Unnamed: 0,E_RECORDID,E_NAME,E_LAT,E_LONG,G_RECORDID,G_NAME,G_LAT,G_LONG,DIST,DIST_rank,LEV,LEV_rank,DM,DM_rank,Model_score,Model_rank
4414,495705014,mananas no 1 mexican food,32.8384,-116.777,DEH2016-FFPP-006945,ayres lodge alpine,32.838677,-116.777946,93.59,3.0,19,1.0,10,4.0,1.752,1.0


In [16]:
G_E_both_ways.loc[G_E_both_ways.G_RECORDID == 'DEH2008-FFPP-431365']

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID


In [13]:
esri_gov_match_final.loc[esri_gov_match_final.E_RECORDID == 619417843]

Unnamed: 0,E_RECORDID,E_NAME,E_LAT,E_LONG,G_RECORDID,G_NAME,G_LAT,G_LONG,DIST,DIST_rank,LEV,LEV_rank,DM,DM_rank,Model_score,Model_rank
12,619417843,wetzel's pretzels,32.5454,-117.0405,DEH2014-FFPP-004032,wetzels pretzels,32.544055,-117.040592,149.75,3.0,1,1.0,6,1.0,1.416,1.0


In [21]:
G_E_both_ways.head()

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID
0,wetzels pretzels,DEH2014-FFPP-004032,wetzel's pretzels,619417843
1,dollar tree,DEH2017-FFPN-001194,food shop,722351465
2,fruiteria el tigre,DEH2016-FFPP-007027,extramile,722351942
3,smart and final,DEH2016-FFPP-006277,baja mex inc,421718245
4,best western americana inn,DEH2014-FFPP-003911,robertacos mexican food,495750507


### 2.c. Merging 3 set to obtain 3-way match results, and drop duplicates

In [35]:
### gov-esri-yelp  (gov_esri + gov_yelp)
tier2_3way_v1 = G_E_both_ways[cols_GE].merge(G_Y_both_ways[cols_GY],on=['G_RECORDID','G_NAME'],how='inner')
print (len(tier2_3way_v1))

### gov_esri_yelp  (gov_esri + esri_yelp)
tier2_3way_v2 = G_E_both_ways[cols_GE].merge(E_Y_both_ways[cols_EY],on=['E_RECORDID','E_NAME'],how='inner')
print (len(tier2_3way_v2))

### gov_esri_yelp  (gov_yelp + esri_yelp)
tier2_3way_v3 = G_Y_both_ways[cols_GY].merge(E_Y_both_ways[cols_EY],on=['Y_RECORDID','Y_NAME'],how='inner')
print (len(tier2_3way_v3))



#merge all three combinations
tier2_3way_all=pd.concat([tier2_3way_v1,tier2_3way_v2, tier2_3way_v3], ignore_index=True)

### Drop duplicates if all columns are the same
tier2_3way_final=tier2_3way_all.drop_duplicates()
print (len(tier2_3way_final))

792
670
681
1487


---
#### optional: briefly check results

In [22]:
### Check if there are duplicates based on G_RECORDID
tier2_3way_final[tier2_3way_final.duplicated(['G_RECORDID'], keep=False)].sort_values(['G_RECORDID'])

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID
2028,texas liquor,DEH2002-FFPN-301509,nozomi,648825255,el tejate,yelp11244
669,texas liquor,DEH2002-FFPN-301509,el puerto,420979312,el tejate,yelp11244
557,la jolla discount pharmacy,DEH2002-FFPN-302039,la jolla discount pharmacy,469133714,la jolla discount pharmacy,yelp11340
1258,la jolla discount pharmacy,DEH2002-FFPN-302039,la jolla discount pharmacy,469133714,colors cafe,yelp487
536,dales liquor,DEH2002-FFPN-302691,dale's liquor store,173024506,balboa liquor store,yelp5375
...,...,...,...,...,...,...
1243,the mandarin,DEH2017-FFPP-008889,mandarin,972671572,the mandarin,yelp2916
713,d sotos taco shop,DEH2017-FFPP-008904,amigos mexican food,707770261,d' sotos,yelp7289
2072,d sotos taco shop,DEH2017-FFPP-008904,7-eleven,353672264,d' sotos,yelp7289
202,annapurna restaurant,DEH2017-FFPP-009003,pho ca dao restaurant,241916899,annapurna indian cuisine,yelp11211


In [23]:
tier2_3way_final.loc[tier2_3way_final.E_RECORDID == 469133714]
# tier2_3way_final.loc[tier2_3way_final.Y_RECORDID == 'yelp487']

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID
557,la jolla discount pharmacy,DEH2002-FFPN-302039,la jolla discount pharmacy,469133714,la jolla discount pharmacy,yelp11340
1258,la jolla discount pharmacy,DEH2002-FFPN-302039,la jolla discount pharmacy,469133714,colors cafe,yelp487


### 2.d. Further drop duplicates coming from different sets, process the 3-way match reulsts -->  Step 1: Using a combined lev_dist_score

In [36]:
### Condition 1:  calculate a combined lev_dist_score 
tier2_3way_final['lev_dist_GE'] = tier2_3way_final.copy().apply(lambda x: lev.distance(x['G_NAME'],  str(x['E_NAME'])), axis=1)
tier2_3way_final['lev_dist_GY'] = tier2_3way_final.copy().apply(lambda x: lev.distance(x['G_NAME'],  x['Y_NAME']), axis=1)
tier2_3way_final['lev_dist_EY'] = tier2_3way_final.copy().apply(lambda x: lev.distance(str(x['E_NAME']),  x['Y_NAME']), axis=1)
tier2_3way_final['lev_dist_score'] = tier2_3way_final['lev_dist_GE'] + tier2_3way_final['lev_dist_GY'] + tier2_3way_final['lev_dist_EY']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tier2_3way_final['lev_dist_GE'] = tier2_3way_final.copy().apply(lambda x: lev.distance(x['G_NAME'],  str(x['E_NAME'])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tier2_3way_final['lev_dist_GY'] = tier2_3way_final.copy().apply(lambda x: lev.distance(x['G_NAME'],  x['Y_NAME']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

In [37]:
### Use the lev_dist_score to select the lowest one(s)

### Use G-E to clean Y
Y_lev = tier2_3way_final.groupby(['G_RECORDID','E_RECORDID']).lev_dist_score.transform(min)
tier2_3way_final_clean = tier2_3way_final.loc[tier2_3way_final.lev_dist_score == Y_lev]

### Use G-Y to clean E
E_lev = tier2_3way_final_clean.groupby(['G_RECORDID','Y_RECORDID']).lev_dist_score.transform(min)
tier2_3way_final_clean = tier2_3way_final_clean.loc[tier2_3way_final_clean.lev_dist_score == E_lev]

### Use E-Y to clean G
G_lev = tier2_3way_final_clean.groupby(['E_RECORDID','Y_RECORDID']).lev_dist_score.transform(min)
tier2_3way_final_clean = tier2_3way_final_clean.loc[tier2_3way_final_clean.lev_dist_score == G_lev]


print (tier2_3way_final_clean.shape[0])

1177


---
#### optional: briefly check results

In [None]:
tier2_3way_final_clean

In [None]:
tier2_3way_final_clean.loc[tier2_3way_final_clean.E_RECORDID == 469133714]

In [26]:
tier2_3way_final_clean[tier2_3way_final_clean.G_RECORDID.duplicated(keep=False)].sort_values(['G_RECORDID','lev_dist_score'])

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score
280,garys arco,DEH2002-FFPN-312851,arco,437250026,mi casita,yelp4870,6,10,8,24
1700,garys arco,DEH2002-FFPN-312851,five star,301274304,mi casita,yelp4870,8,10,6,24
212,fifty9fifty,DEH2002-FFPP-301084,subway,432176328,edamami,yelp11869,10,10,6,26
968,fifty9fifty,DEH2002-FFPP-301084,subway,432176328,shell,yelp11489,10,11,5,26
1383,rodeos meat market,DEH2002-FFPP-313739,rodeo's meat,855088985,pepita's cafe,yelp782,8,14,10,32
...,...,...,...,...,...,...,...,...,...,...
1744,dennys,DEH2017-FFPP-007805,souplantation,173282682,kona kakes,yelp5715,12,8,10,30
338,coaster saloon and bar and grill,DEH2017-FFPP-008464,coaster bar & grill,500995618,sandbar sports bar & grill,yelp9323,14,17,11,42
1067,coaster saloon and bar and grill,DEH2017-FFPP-008464,coaster bar & grill,500995618,coaster saloon,yelp291,14,18,10,42
412,tabac,DEH2017-FFPP-008759,subway,700624002,taka,yelp7292,4,2,5,11


In [31]:
tier2_3way_final_clean[tier2_3way_final_clean.E_RECORDID.duplicated(keep=False)].sort_values(['E_RECORDID','lev_dist_score'])

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score
475,starbucks coffee,DEH2003-FFPP-405596,starbucks,216866350,starbucks,yelp3507,8,8,0,16
1192,starbucks coffee,DEH2003-FFPP-405596,starbucks,216866350,starbucks,yelp10781,8,8,0,16
333,usa,DEH2012-FFPP-448496,shell,301847471,caffe vicino,yelp10233,5,11,11,27
1685,shell food mart,DEH2005-FFPN-417607,shell,301847471,clairemont shell,yelp7940,10,14,11,35
459,five guys,DEH2011-FFPP-441761,subway,402545095,five guys,yelp11531,8,0,8,16
1171,five guys,DEH2011-FFPP-441761,subway,402545095,subway,yelp11846,8,8,0,16
105,philz coffee,DEH2016-FFPP-007010,kanaya fine tea,414664401,philz coffee,yelp8239,13,0,13,26
1557,goodonya deli,DEH2014-FFPP-004520,kanaya fine tea,414664401,carin de ria,yelp1444,13,10,10,33
673,marshalls,DEH2003-FFPN-403435,domino's,423553409,shell,yelp4045,9,6,8,23
2034,dominos pizza 8548,DEH2012-FFPP-450084,domino's,423553409,domino's pizza,yelp5057,12,6,6,24


### 2.e. Further drop duplicates coming from different sets, process the 3-way match reulsts -->  Step 2: Taking the one with smallest IDs

In [38]:
print (tier2_3way_final_clean.shape[0])

##### G -->  E --> Y
print ("cleaning order:  G -->  E --> Y")

tier2_3way_final_clean_GEY = tier2_3way_final_clean.sort_values(['lev_dist_score', 'E_RECORDID', 'Y_RECORDID', 'G_RECORDID'],ascending=True).groupby('G_RECORDID').head(1)
print (tier2_3way_final_clean_GEY.shape[0])

tier2_3way_final_clean_GEY = tier2_3way_final_clean_GEY.sort_values(['lev_dist_score', 'G_RECORDID', 'Y_RECORDID','E_RECORDID'],ascending=True).groupby('E_RECORDID').head(1)
print (tier2_3way_final_clean_GEY.shape[0])

tier2_3way_final_clean_GEY = tier2_3way_final_clean_GEY.sort_values(['lev_dist_score', 'G_RECORDID', 'E_RECORDID','Y_RECORDID'],ascending=True).groupby('Y_RECORDID').head(1)
print (tier2_3way_final_clean_GEY.shape[0])


print ('-'*30)
print ('Checking duplicates:')
print (tier2_3way_final_clean_GEY[tier2_3way_final_clean_GEY.G_RECORDID.duplicated(keep=False)].sort_values(['G_RECORDID','lev_dist_score']).shape[0])
print (tier2_3way_final_clean_GEY[tier2_3way_final_clean_GEY.E_RECORDID.duplicated(keep=False)].sort_values(['E_RECORDID','lev_dist_score']).shape[0])
print (tier2_3way_final_clean_GEY[tier2_3way_final_clean_GEY.Y_RECORDID.duplicated(keep=False)].sort_values(['Y_RECORDID','lev_dist_score']).shape[0])

# tier2_3way_final_clean.to_csv('tier2_3match_20230512.csv',index=False)

1177
cleaning order:  G -->  E --> Y
1142
1125
1114
------------------------------
Checking duplicates:
0
0
0


In [41]:
print (tier2_3way_final_clean.shape[0])

##### E -->  Y --> G
print ("cleaning order:  E -->  Y --> G")

tier2_3way_final_clean_EYG = tier2_3way_final_clean.sort_values(['lev_dist_score', 'G_RECORDID', 'Y_RECORDID','E_RECORDID'],ascending=True).groupby('E_RECORDID').head(1)
print (tier2_3way_final_clean_EYG.shape[0])

tier2_3way_final_clean_EYG = tier2_3way_final_clean_EYG.sort_values(['lev_dist_score', 'G_RECORDID', 'E_RECORDID','Y_RECORDID'],ascending=True).groupby('Y_RECORDID').head(1)
print (tier2_3way_final_clean_EYG.shape[0])

tier2_3way_final_clean_EYG = tier2_3way_final_clean_EYG.sort_values(['lev_dist_score', 'E_RECORDID', 'Y_RECORDID','G_RECORDID'],ascending=True).groupby('G_RECORDID').head(1)
print (tier2_3way_final_clean_EYG.shape[0])


print ('-'*30)
print ('Checking duplicates:')
print (tier2_3way_final_clean_EYG[tier2_3way_final_clean_EYG.G_RECORDID.duplicated(keep=False)].sort_values(['G_RECORDID','lev_dist_score']).shape[0])
print (tier2_3way_final_clean_EYG[tier2_3way_final_clean_EYG.E_RECORDID.duplicated(keep=False)].sort_values(['E_RECORDID','lev_dist_score']).shape[0])
print (tier2_3way_final_clean_EYG[tier2_3way_final_clean_EYG.Y_RECORDID.duplicated(keep=False)].sort_values(['Y_RECORDID','lev_dist_score']).shape[0])

# tier2_3way_final_clean.to_csv('tier2_3match_20230512.csv',index=False)

1177
cleaning order:  E -->  Y --> G
1149
1123
1113
------------------------------
Checking duplicates:
0
0
0


In [42]:
print (tier2_3way_final_clean.shape[0])

##### Y -->  G --> E
print ("cleaning order:  Y -->  G --> E")

tier2_3way_final_clean_YGE = tier2_3way_final_clean.sort_values(['lev_dist_score', 'G_RECORDID', 'E_RECORDID','Y_RECORDID'],ascending=True).groupby('Y_RECORDID').head(1)
print (tier2_3way_final_clean_YGE.shape[0])

tier2_3way_final_clean_YGE = tier2_3way_final_clean_YGE.sort_values(['lev_dist_score', 'E_RECORDID', 'Y_RECORDID','G_RECORDID'],ascending=True).groupby('G_RECORDID').head(1)
print (tier2_3way_final_clean_YGE.shape[0])

tier2_3way_final_clean_YGE = tier2_3way_final_clean_YGE.sort_values(['lev_dist_score', 'G_RECORDID', 'Y_RECORDID','E_RECORDID'],ascending=True).groupby('E_RECORDID').head(1)
print (tier2_3way_final_clean_YGE.shape[0])


print ('-'*30)
print ('Checking duplicates:')
print (tier2_3way_final_clean_YGE[tier2_3way_final_clean_YGE.G_RECORDID.duplicated(keep=False)].sort_values(['G_RECORDID','lev_dist_score']).shape[0])
print (tier2_3way_final_clean_YGE[tier2_3way_final_clean_YGE.E_RECORDID.duplicated(keep=False)].sort_values(['E_RECORDID','lev_dist_score']).shape[0])
print (tier2_3way_final_clean_YGE[tier2_3way_final_clean_YGE.Y_RECORDID.duplicated(keep=False)].sort_values(['Y_RECORDID','lev_dist_score']).shape[0])

# tier2_3way_final_clean.to_csv('tier2_3match_20230512.csv',index=False)

1177
cleaning order:  Y -->  G --> E
1139
1120
1113
------------------------------
Checking duplicates:
0
0
0


In [43]:
tier2_3way_final_clean_YGE.to_csv('tier2_3match_20230515.csv',index=False)

In [51]:
#Take 3 match from each of the original data, prep for 2 match
#tier1_3way_final_clean=pd.read_csv('tier2_3match_20230515.csv')

#read original data for tier 2
gov_all=pd.read_csv("gov_for_tier2_0512.csv")
esri_all=pd.read_csv("esri_for_tier2_0512.csv")
yelp_all=pd.read_csv("yelp_for_tier2_0512.csv")

gov_tier2_for_2match=gov_all.loc[~gov_all['G_RECORDID'].isin(tier2_3way_final_clean_YGE['G_RECORDID'])]
esri_tier2_for_2match=esri_all.loc[~esri_all['E_RECORDID'].isin(tier2_3way_final_clean_YGE['E_RECORDID'])]
yelp_tier2_for_2match=yelp_all.loc[~yelp_all['Y_RECORDID'].isin(tier2_3way_final_clean_YGE['Y_RECORDID'])]

gov_tier2_for_2match.to_csv('gov_for_tier2_2match_0515.csv')
esri_tier2_for_2match.to_csv('esri_for_tier2_2match_0515.csv')
yelp_tier2_for_2match.to_csv('yelp_for_tier2_2match_0515.csv')

In [52]:
print(gov_tier2_for_2match.shape[0])
print(esri_tier2_for_2match.shape[0])
print(yelp_tier2_for_2match.shape[0])

3537
4853
6175


---
### Results check

In [44]:
#### Find difference among three set

biz_GEY = list(zip(tier2_3way_final_clean_GEY['G_RECORDID'],tier2_3way_final_clean_GEY['E_RECORDID'],tier2_3way_final_clean_GEY['Y_RECORDID']))
biz_EYG = list(zip(tier2_3way_final_clean_EYG['G_RECORDID'],tier2_3way_final_clean_EYG['E_RECORDID'],tier2_3way_final_clean_EYG['Y_RECORDID']))
biz_YGE = list(zip(tier2_3way_final_clean_YGE['G_RECORDID'],tier2_3way_final_clean_YGE['E_RECORDID'],tier2_3way_final_clean_YGE['Y_RECORDID']))

print (set(biz_GEY) - set(biz_EYG))
print (set(biz_GEY) - set(biz_YGE))
print (set(biz_EYG) - set(biz_YGE))

{('DEH2017-FFPP-008456', 706321690, 'yelp5341')}
{('DEH2017-FFPP-008456', 706321690, 'yelp5341')}
set()


In [45]:
tier2_3way_final_clean.loc[tier2_3way_final_clean.E_RECORDID==706321690]

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score
1187,riverwalk grill,DEH2017-FFPP-008456,rita's water ice,706321690,the wrapshack,yelp5341,12,13,12,37
1859,santee girls asa,DEH2005-FFPP-417406,rita's water ice,706321690,the wrapshack,yelp5341,14,11,12,37


In [72]:
# 'DEH2017-FFPP-008456', 706321690, 'yelp5341'

test_E = tier2_3way_final_clean.sort_values(['lev_dist_score', 'G_RECORDID', 'Y_RECORDID','E_RECORDID'],ascending=True).groupby('E_RECORDID').head(1)

test_E_Y = test_E.sort_values(['lev_dist_score', 'G_RECORDID', 'E_RECORDID','Y_RECORDID'],ascending=True).groupby('Y_RECORDID').head(1)

test_E_Y_G = test_E_Y.sort_values(['lev_dist_score', 'E_RECORDID', 'Y_RECORDID','G_RECORDID'],ascending=True).groupby('G_RECORDID').head(1)

In [77]:
test_E_Y_G.loc[test_E_Y_G.G_RECORDID=='DEH2005-FFPP-417406']

# test_E.loc[test_E.G_RECORDID=='DEH2017-FFPP-008456']
# test_E.loc[test_E.E_RECORDID==706321690]
test_E_Y_G.loc[test_E_Y_G.Y_RECORDID=='yelp5341']

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score
1186,santee girls asa,DEH2005-FFPP-417406,food 4 less,894704576,food 4 less,yelp6900,13,13,0,26


In [50]:
tier2_3way_final_clean.loc[tier2_3way_final_clean.G_RECORDID == 'DEH2005-FFPP-417406']

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score
1186,santee girls asa,DEH2005-FFPP-417406,food 4 less,894704576,food 4 less,yelp6900,13,13,0,26
1859,santee girls asa,DEH2005-FFPP-417406,rita's water ice,706321690,the wrapshack,yelp5341,14,11,12,37


In [47]:
tier2_3way_final_clean_YGE.loc[tier2_3way_final_clean_YGE.E_RECORDID==706321690]

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score


In [69]:
tier2_3way_final_clean_EYG.loc[tier2_3way_final_clean_EYG.E_RECORDID==706321690]

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score


---
#### optional: briefly check results

In [86]:
tier2_3way_final_clean_GEY.loc[tier2_3way_final_clean_GEY.lev_dist_score == 0]

Unnamed: 0,G_NAME,G_RECORDID,E_NAME,E_RECORDID,Y_NAME,Y_RECORDID,lev_dist_GE,lev_dist_GY,lev_dist_EY,lev_dist_score
557,la jolla discount pharmacy,DEH2002-FFPN-302039,la jolla discount pharmacy,469133714,la jolla discount pharmacy,yelp11340,0,0,0,0
410,edgewater grill,DEH2002-FFPP-300663,edgewater grill,173162512,edgewater grill,yelp10716,0,0,0,0
1807,seaport village deli,DEH2002-FFPP-309766,seaport village deli,886337955,seaport village deli,yelp10101,0,0,0,0
1571,mexico viejo,DEH2004-FFPP-413398,mexico viejo,228272159,mexico viejo,yelp3684,0,0,0,0
1394,casa de pico,DEH2005-FFPP-416740,casa de pico,503104937,casa de pico,yelp3943,0,0,0,0
1528,sprouts farmers market,DEH2007-FFPP-423677,sprouts farmers market,639571579,sprouts farmers market,yelp2936,0,0,0,0
751,sprouts farmers market,DEH2007-FFPP-425473,sprouts farmers market,180563033,sprouts farmers market,yelp10463,0,0,0,0
664,campus cafe,DEH2008-FFPP-428732,campus cafe,974375321,campus cafe,yelp180,0,0,0,0
249,los primos mexican food,DEH2008-FFPP-428975,los primos mexican food,666616677,los primos mexican food,yelp2755,0,0,0,0
1572,hooked on sushi,DEH2009-FFPP-435880,hooked on sushi,414662901,hooked on sushi,yelp5486,0,0,0,0


---

## Previous way of Merging 3 set for 3-way match results

In [None]:

### gov-esri-yelp  (gov_esri + gov_yelp)
tier2_3way_v1_old = gov_esri_match_final[cols_GE].merge(gov_yelp_match_final[cols_GY],on='G_RECORDID',how='inner').drop('G_NAME_y',axis=1).rename({'G_NAME_x': 'G_NAME'}, axis=1)
print (len(tier2_3way_v1_old ))

### gov_esri_yelp  (gov_esri + esri_yelp)
tier2_3way_v2_old  = gov_esri_match_final[cols_GE].merge(esri_yelp_match_final[cols_EY],on='E_RECORDID',how='inner').drop('E_NAME_y',axis=1).rename({'E_NAME_x':'E_NAME'},axis=1)
print (len(tier2_3way_v2_old ))

### gov_esri_yelp  (gov_yelp + esri_yelp)
tier2_3way_v3_old  = gov_yelp_match_final[cols_GY].merge(esri_yelp_match_final[cols_EY],on='Y_RECORDID',how='inner').drop('Y_NAME_y',axis=1).rename({'Y_NAME_x':'Y_NAME'},axis=1)
print (len(tier2_3way_v3_old ))

In [None]:
#merge all three combinations
tier2_3way_all_old=pd.concat([tier2_3way_v1_old ,tier2_3way_v2_old , tier2_3way_v3_old ], ignore_index=True)

### Drop duplicates if all columns are the same
tier2_3way_final_old =tier2_3way_all_old .drop_duplicates()
print (len(tier2_3way_final_old ))

In [None]:
### Check if there are duplicates based on G_RECORDID
tier2_3way_final_old [tier2_3way_final_old.duplicated(['G_RECORDID'], keep=False)].sort_values(['G_RECORDID'])

### Further drop duplicates coming from different sets

In [None]:
### Calculate 
tier2_3way_final_old['lev_dist_GE'] = tier2_3way_final_old.copy().apply(lambda x: lev.distance(x['G_NAME'],  str(x['E_NAME'])), axis=1)
tier2_3way_final_old['lev_dist_GY'] = tier2_3way_final_old.copy().apply(lambda x: lev.distance(x['G_NAME'],  x['Y_NAME']), axis=1)
tier2_3way_final_old['lev_dist_EY'] = tier2_3way_final_old.copy().apply(lambda x: lev.distance(str(x['E_NAME']),  x['Y_NAME']), axis=1)
tier2_3way_final_old['lev_dist_score'] = tier2_3way_final_old['lev_dist_GE'] + tier2_3way_final_old['lev_dist_GY'] + tier2_3way_final_old['lev_dist_EY']

In [None]:
### Use G-E to clean Y
Y_lev = tier2_3way_final_old.groupby(['G_RECORDID','E_RECORDID']).lev_dist_score.transform(min)
tier2_3way_final_clean_old = tier2_3way_final_old.loc[tier2_3way_final_old.lev_dist_score == Y_lev]

### Use G-Y to clean E
E_lev = tier2_3way_final_clean_old.groupby(['G_RECORDID','Y_RECORDID']).lev_dist_score.transform(min)
tier2_3way_final_clean_old = tier2_3way_final_clean_old.loc[tier2_3way_final_clean_old.lev_dist_score == E_lev]

### Use E-Y to clean G
G_lev = tier2_3way_final_clean_old.groupby(['E_RECORDID','Y_RECORDID']).lev_dist_score.transform(min)
tier2_3way_final_clean_old = tier2_3way_final_clean_old.loc[tier2_3way_final_clean_old.lev_dist_score == G_lev]


print (tier2_3way_final_clean_old.shape[0])
# tier2_3way_final_clean_old.to_csv('tier2_3match_20230510.csv',index=False)

In [None]:
tier2_3way_final_clean_old[tier2_3way_final_clean_old.G_RECORDID.duplicated(keep=False)].sort_values(['G_RECORDID','lev_dist_score'])

In [None]:
tier2_3way_final_clean_old[tier2_3way_final_clean_old.E_RECORDID.duplicated(keep=False)].sort_values(['E_RECORDID','lev_dist_score'])

In [None]:
tier2_3way_final_clean_old[tier2_3way_final_clean_old.Y_RECORDID.duplicated(keep=False)].sort_values(['Y_RECORDID','lev_dist_score'])

---
# Tier 2 Matching 2 datasets (2match)

In [None]:
GE_pairs_from2match = list(zip(tier2_3way_final_clean.G_RECORDID, tier2_3way_final_clean.E_RECORDID	))
GY_pairs_from2match = list(zip(tier2_3way_final_clean.G_RECORDID, tier2_3way_final_clean.Y_RECORDID	))
EY_pairs_from2match = list(zip(tier2_3way_final_clean.E_RECORDID, tier2_3way_final_clean.Y_RECORDID	))

print (len(GE_pairs_from2match))
print (len(GY_pairs_from2match))
print (len(EY_pairs_from2match))

In [None]:
# gov_esri_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_G_to_E.csv')
# gov_yelp_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_G_to_Y.csv')
# esri_yelp_match_final=pd.read_csv('Tier2_WeightedRankModel_v4_E_to_Y.csv')

In [None]:
### Take out the 3way match results from each of the above 2 match [G-E]
gov_esri_match_final['GE_ID'] = list(zip(gov_esri_match_final.G_RECORDID, gov_esri_match_final.E_RECORDID))
gov_esri_2match = gov_esri_match_final.copy().loc[~gov_esri_match_final['GE_ID'].isin(GE_pairs_from2match)]
gov_esri_2match[cols_GE].to_csv('gov_esri_tier2_2match_20230510.csv',index=False)

print (len(gov_esri_2match))
#The result below is not equal to 4254-3552=702 (<1642), meaning some of the 3match come from the other two 2match results


### Take out the 3way match results from each of the above 2 match [G-Y]
gov_yelp_match_final['GY_ID'] = list(zip(gov_yelp_match_final.G_RECORDID, gov_yelp_match_final.Y_RECORDID))
gov_yelp_2match = gov_yelp_match_final.copy().loc[~gov_yelp_match_final['GY_ID'].isin(GY_pairs_from2match)]
gov_yelp_2match[cols_GY].to_csv('gov_yelp_tier2_2match_20230510.csv',index=False)

print (len(gov_yelp_2match))
#The result below is not equal to 4382-3552=830 (<1852), meaning some of the 3match come from the other two 2match results


### Take out the 3way match results from each of the above 2 match [E-Y]
esri_yelp_match_final['EY_ID'] = list(zip(esri_yelp_match_final.E_RECORDID, esri_yelp_match_final.Y_RECORDID))
esri_yelp_2match=esri_yelp_match_final.copy().loc[~esri_yelp_match_final['EY_ID'].isin(EY_pairs_from2match)]

esri_yelp_2match[cols_EY].to_csv('esri_yelp_tier2_2match_20230510.csv',index=False)

print (len(esri_yelp_2match))
#The result below is not equal to 5643-3552=2091 (<3171), meaning some of the 3match come from the other two 2match results

# Tier 2 Matching

### 1. Data prep

In [None]:
ID_G_3match = list(tier2_3way_final_clean['G_RECORDID'])
ID_E_3match = list(tier2_3way_final_clean['E_RECORDID'])
ID_Y_3match = list(tier2_3way_final_clean['Y_RECORDID'])

ID_G_2match = [*set(list(gov_esri_2match['G_RECORDID'])  +  list(gov_yelp_2match['G_RECORDID']))] 
ID_E_2match = [*set(list(gov_esri_2match['E_RECORDID'])  +  list(esri_yelp_2match['E_RECORDID']))]  
ID_Y_2match = [*set(list(gov_yelp_2match['Y_RECORDID'])  +  list(esri_yelp_2match['Y_RECORDID']))]

print (f"G --> 3match: {len(ID_G_3match)}  ||  2match: {len(ID_G_2match)}")
print (f"E --> 3match: {len(ID_E_3match)}  ||  2match: {len(ID_E_2match)}")
print (f"Y --> 3match: {len(ID_Y_3match)}  ||  2match: {len(ID_Y_2match)}")

In [None]:
#read original data
gov_for_tier2=pd.read_csv("gov_for_tier2.csv")
esri_for_tier2=pd.read_csv("esri_for_tier2.csv")
yelp_for_tier2=pd.read_csv("yelp_for_tier2.csv")

In [None]:
#remove the tier 1 "for 3 match" (this contains everything from tier 1,both 3 match and 2 match) from each of the original data
#these data are:esri_yelp_match_final, gov_esri_match_final,gov_yelp_match_final

gov_nonMatch  = gov_for_tier2.loc[(~gov_for_tier2.G_RECORDID.isin(ID_G_3match)) & (~gov_for_tier2.G_RECORDID.isin(ID_G_2match))]
esri_nonMatch = esri_for_tier2.loc[(~esri_for_tier2.E_RECORDID.isin(ID_E_3match)) & (~esri_for_tier2.E_RECORDID.isin(ID_E_2match))]
yelp_nonMatch = yelp_for_tier2.loc[(~yelp_for_tier2.Y_RECORDID.isin(ID_Y_3match)) & (~yelp_for_tier2.Y_RECORDID.isin(ID_Y_2match))]

print('Original Data for Tier 2:',len(gov_for_tier2),len(esri_for_tier2),len(yelp_for_tier2))
print('Non-Match after Tier 2:',len(gov_nonMatch),len(esri_nonMatch),len(yelp_nonMatch))

gov_nonMatch[['G_RECORDID','G_NAME','G_LAT','G_LONG']].to_csv("gov_nonMatch.csv", index=False)
esri_nonMatch[['E_RECORDID','E_NAME','E_LAT','E_LONG']].to_csv("esri_nonMatch.csv", index=False)
yelp_nonMatch[['Y_RECORDID','Y_NAME','Y_LAT','Y_LONG']].to_csv("yelp_nonMatch.csv", index=False)

---
# Get Non-Match Business from BASE

In [None]:
### Get Non-Match Business from BASE

### a list of all the business IDs (BASE) that got matched by the weighted rank model
biz_matched_base = list(match_results_base_target[column_ID_base].unique()) + list(match_results_base_target_2[column_ID_base].unique())
biz_matched_base_clean = list(set(biz_matched_base))

pd.DataFrame(biz_matched_base_clean, columns=[column_BizName_base]).to_csv(f'Tier2_WeightedRankModel_matched_BusinessID_{base_dataset}.csv',index=False)
print (f'{base_dataset}: Matched --> {len(biz_matched_base_clean)}')

### get a list of all the unique business ID (BASE) that didn't get matched by the weighted rank model
biz_notMatched_base_df = df_tier1_results.loc[~df_tier1_results[column_ID_base].isin(biz_matched_base_clean)]
biz_notMatched_base_df.to_csv(f'Tier2_WeightedRankModel_noMatched_BusinessID_{base_dataset}.csv',index=False)

print (f'{base_dataset}: NotMatched --> {len(biz_notMatched_base_df)}')