In [37]:
import pandas as pd
import numpy as np
import os

from thefuzz import fuzz #new fuzzywuzzy project https://github.com/seatgeek/thefuzz
import textdistance

# Config

In [2]:
#config files
configPath = os.path.join(os.getcwd(), 'config')
dictionariesPath = os.path.join(os.getcwd(), 'dicts')

In [3]:
#input
inputPath = os.path.join(os.getcwd(), 'example/instances')

base1 = pd.read_csv(f"{inputPath}/inst1/base_inst1.csv")
base2 = pd.read_csv(f"{inputPath}/inst2/base_inst2.csv")

#output
outputPath = os.path.join(inputPath, 'matches')
if not os.path.exists(outputPath):
    os.makedirs(outputPath)
    
outputFileName = 'candidateList'

In [4]:
variableFields = pd.read_csv(f"{configPath}/variableFields.csv").set_index('variable')
compatibleData = pd.read_csv(f"{configPath}/compatible_data.csv").set_index('variable')

comparisonSettings = compatibleData.join(variableFields).to_dict(orient='index')

compatibleData.join(variableFields) #just to visualize

Unnamed: 0_level_0,type,parameter,consider,na.action,fields
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Case.ID,Case.ID,,no,,Case.ID
cod,cod,,no,,cod
nationality,categorical,natEquivalences.csv,yes,all,Nat_PROC Nat_2_PROC
sex,categorical,,yes,all,Sex_PROC
age,range,6,yes,all,Age_PROC Age_2_PROC


In [71]:
schemes = pd.read_csv(f"{configPath}/select_schemes.csv", names=['schemes']) #TODO add option for "fast scheme"

schemesConfig = pd.read_csv(f"{configPath}/info_scheme.csv")
schemesConfig.rename(columns = {'listA.column': 'key1', 'listB.column': 'key2'}, inplace=True)
schemesConfig

Unnamed: 0,scheme,key1,key2,threshold
0,A_A,Name_A,Name_A,0.3
1,F_F,Father_name_A,Father_name_A,0.0
2,M_M,Mother_name_A,Mother_name_A,0.0
3,AD_AD,Adress_A,Adress_A,0.0
4,Ph_Ph,Phone_1_PROC Phone_2_PROC Phone_3_PROC,Phone_1_PROC Phone_2_PROC Phone_3_PROC,0.0


In [19]:
schemes

Unnamed: 0,schemes
0,A_A
1,F_F


In [75]:
schemesMeaning = {
    'A_A': textdistance.levenshtein.normalized_similarity, 
    'F_F': fuzz.ratio
                 }

# "Library"

In [6]:
def isNaN(value):
    if type(value)==str:
        return (value.upper() == 'NAN')
    else:
        return np.isnan(value)
    
def areEquivalentValues(val1, val2, equivalences):
    return (val1==val2 or (val1,val2) in equivalences) or ((val2,val1) in equivalences)

In [7]:
def compatibleRanges(row1, row2, options, verbose=False):
    compatible = True #need to be compatible on all available fields (can be changed for 'or')
    allowNa = options['na.action'] == 'all'
    
    #for debugging
    val1 = val2 = 'NAN'
    
    for field in options['fields'].split(' '):        
        if field not in row1 or field not in row2:
            continue
        
        if isNaN(row1[field]) or isNaN(row2[field]):
            compatible &= allowNa
        else:
            compatible &= abs(int(row1[field]) - int(row2[field])) <= int(options['parameter'])
            val1 = int(row1[field])
            val2 = int(row2[field])
            
    #for debugging
    if verbose: 
        print(f"\tval1:{val1}, val2:{val2}, allowNa: {allowNa}, range:{int(options['parameter'])}")
    
    return compatible

def compatibleCategory(row1, row2, options, verbose=False):
    compatible = True #need to be compatible on all available fields (can be changed for 'or')
    allowNa = options['na.action'].upper() == 'ALL'
    equivalences = None
    
    #for debugging
    val1 = val2 = 'NAN'
    
    if not isNaN(options['parameter']):
        equivalencesDF = pd.read_csv(f"{configPath}/{options['parameter']}")
        equivalences = list(equivalencesDF.itertuples(index=False, name=None))
        # WARNING, there may be a better way to do this other than creating tuples but we should have a scheme
        # to follow, in order to grab the columns without having to specify their names
    
    for field in options['fields'].split(' '):
        if field not in row1 or field not in row2:
            continue
        
        if isNaN(row1[field]) or isNaN(row2[field]):
            compatible &= allowNa
        elif equivalences:
            compatible &= areEquivalentValues(row1[field], row2[field], equivalences)
            val1 = row1[field]
            val2 = row2[field]
    
    #for debugging
    if verbose: 
        print(f"\tval1:{val1}, val2:{val2}, allowNa: {allowNa}")
    
    return compatible

In [8]:
def areCompatibles(row1, row2, verbose=False):
    compatible = True
        
    for variable, options in comparisonSettings.items():    
        if options['consider'].lower() == 'yes':
            if verbose:
                print(f"{variable} compatible?")
            varCompatible = True

            if options['type'] == 'range':
                varCompatible = compatibleRanges(row1, row2, options, verbose)
            elif options['type'] == 'categorical':
                varCompatible = compatibleCategory(row1, row2, options, verbose)

            compatible &= varCompatible
            if verbose:
                print(f"\tresult: {varCompatible}")
    if verbose:
        print(f"Candidates? {compatible} \n\n")
    return compatible

# Script

## Find compatible rows

In [9]:
compatiblesDF = base1.apply(lambda row1: base2.apply(lambda row2: areCompatibles(row1, row2), axis=1), axis=1)
i1, i2 = (compatiblesDF.values).nonzero()
compatibles = list(zip(i1, i2))

compatibles

[(2, 1), (2, 7), (4, 12), (4, 23), (5, 24)]

## Divide into compatible groups (aka equivalence clases)

In [13]:
compatibleGroups = {} #keys are index of base1, values are compatibles rows of base2

for i1, i2 in compatibles:
    if i1 not in compatibleGroups:
        compatibleGroups[i1] = []
    compatibleGroups[i1].append(i2)

compatibleGroups

{2: [1, 7], 4: [12, 23], 5: [24]}

## Use the schemes configured on the compatible groups

In [79]:
candidatesList = []

for i1, compatiblesBase2 in compatibleGroups.items():
    for i2 in compatiblesBase2:
        candidates = True
        
        for scheme, key1, key2, threshold in schemesConfig.itertuples(index=False):
            #check for valid scheme and keys
            if scheme not in schemesMeaning.keys() or key1 not in base1.columns or key2 not in base2.columns:
                print(f"invalid scheme: {scheme} or keys, key1: {key1}, key2: {key2}")
                continue
            
            value1 = base1.iloc[i1][key1]
            value2 = base2.iloc[i2][key2]
            
            candidates &= (schemesMeaning[scheme](value1, value2) < threshold)
            
            #TODO debug
        
        if candidates:
            candidatesList.append((i1,i2))

candidatesList            

invalid scheme: F_F or keys, key1: Father_name_A, key2: Father_name_A
invalid scheme: M_M or keys, key1: Mother_name_A, key2: Mother_name_A
invalid scheme: AD_AD or keys, key1: Adress_A, key2: Adress_A
invalid scheme: Ph_Ph or keys, key1: Phone_1_PROC Phone_2_PROC Phone_3_PROC, key2: Phone_1_PROC Phone_2_PROC Phone_3_PROC
invalid scheme: F_F or keys, key1: Father_name_A, key2: Father_name_A
invalid scheme: M_M or keys, key1: Mother_name_A, key2: Mother_name_A
invalid scheme: AD_AD or keys, key1: Adress_A, key2: Adress_A
invalid scheme: Ph_Ph or keys, key1: Phone_1_PROC Phone_2_PROC Phone_3_PROC, key2: Phone_1_PROC Phone_2_PROC Phone_3_PROC
invalid scheme: F_F or keys, key1: Father_name_A, key2: Father_name_A
invalid scheme: M_M or keys, key1: Mother_name_A, key2: Mother_name_A
invalid scheme: AD_AD or keys, key1: Adress_A, key2: Adress_A
invalid scheme: Ph_Ph or keys, key1: Phone_1_PROC Phone_2_PROC Phone_3_PROC, key2: Phone_1_PROC Phone_2_PROC Phone_3_PROC
invalid scheme: F_F or keys,

[]

In [74]:
    print(scheme, key1, key2, threshold)

A_A Name_A Name_A 0.3
F_F Father_name_A Father_name_A 0.0
M_M Mother_name_A Mother_name_A 0.0
AD_AD Adress_A Adress_A 0.0
Ph_Ph Phone_1_PROC Phone_2_PROC Phone_3_PROC Phone_1_PROC Phone_2_PROC Phone_3_PROC 0.0


In [72]:
schemesConfig

Unnamed: 0,scheme,key1,key2,threshold
0,A_A,Name_A,Name_A,0.3
1,F_F,Father_name_A,Father_name_A,0.0
2,M_M,Mother_name_A,Mother_name_A,0.0
3,AD_AD,Adress_A,Adress_A,0.0
4,Ph_Ph,Phone_1_PROC Phone_2_PROC Phone_3_PROC,Phone_1_PROC Phone_2_PROC Phone_3_PROC,0.0


# Only for checking

In [56]:
exBase1 = base1.iloc[2]
exBase2 = base2.iloc[1]
schemesMeaning['A_A'](exBase1.Name_1, exBase2.Name_1)

0.9

In [50]:
#TEST

#Stringdist (R) Functions
    #Levenshtein distance
print("levenshtein normalized similarity", textdistance.levenshtein.normalized_similarity('test', 'tast'))
    
    #D-L 
        #The restricted Damerau-Levenshtein distance is not a true distance metric because it does not 
        #satisfy the triangle inequality. This makes it a poor choice for applications that involve evaluating 
        #the similarity of more than two strings, such as clustering.
print("damerau-levenshtein normalized", textdistance.damerau_levenshtein.normalized_similarity('test', 'tast'))

    #D-L-FULL ???
    
    #LongestCommonSubstring #falta encontrar uno que haga tal cuál se quiere, sino se puede calcular, pero con cuidado
ex1 = 'Jonh Smit'
ex2 = 'John Smith'
substr = textdistance.lcsstr(ex1, ex2)
print(substr)
print("longest common substring", 1-len(substr)/(len(ex1)+len(ex2)))
    
    #GramX

#FuzzyWuzzyFunctions
    #Ratio
print("fuzz.ratio('test', 'tast')", fuzz.ratio('test', 'tast'))
    #partial_ratio
print("fuzz.partial_ratio('test', 'tast!')", fuzz.partial_ratio('test', 'tast!'))
print("fuzz.partial_ratio('test', 'test!')", fuzz.partial_ratio('test', 'test!'))
    #token_sort_ratio
print("fuzz.ratio('fuzzy wuzzy was a bear', 'wuzzy fuzzy was a bear')", fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))
print("fuzz.token_sort_ratio('fuzzy wuzzy was a bear', 'wuzzy fuzzy was a bear')", fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))
    #WRATIO == Set Ratio?
print("fuzz.token_sort_ratio('fuzzy was a bear', 'fuzzy fuzzy was a bear')", fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
print("fuzz.token_set_ratio('fuzzy was a bear', 'fuzzy fuzzy was a bear')", fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))

levenshtein normalized similarity 0.75
damerau-levenshtein normalized 0.75
 Smit
longest common substring 0.736842105263158
fuzz.ratio('test', 'tast') 75
fuzz.partial_ratio('test', 'tast!') 75
fuzz.partial_ratio('test', 'test!') 100
fuzz.ratio('fuzzy wuzzy was a bear', 'wuzzy fuzzy was a bear') 91
fuzz.token_sort_ratio('fuzzy wuzzy was a bear', 'wuzzy fuzzy was a bear') 100
fuzz.token_sort_ratio('fuzzy was a bear', 'fuzzy fuzzy was a bear') 84
fuzz.token_set_ratio('fuzzy was a bear', 'fuzzy fuzzy was a bear') 100


In [11]:
print(base1.shape)
base1

(6, 15)


Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA1,,BA1-01,Jorge Perez,Denmark,M,11,Missing,jorge perez,DENMARK,M,11,missing,jorge perez,jorge perez
1,BA1,,BA1-05,Oscar Alvarez,Jordan,M,44,Missing,oscar alvarez,JORDAN,M,44,missing,oscar alvarez,oscar alvarez
2,BA1,,BA1-06,Juan Perez,Mali,,15,Missing,juan perez,MALI,,15,missing,juan perez,juan perez
3,BA1,,BA1-07,Maria Urcupiña,Peru,F,18,Missing,maria urcupiña,PERU,F,18,missing,maria urcupiña,maria urcupiña
4,BA1,,BA1-11,Juana Morales,Yemen,F,44,Missing,juana morales,YEMEN,F,44,missing,juana morales,juana morales
5,BA1,,BA1-12,Esteban Quito,Seychelles,M,9,Missing,esteban quito,SEYCHELLES,M,9,missing,esteban quito,esteban quito


In [12]:
print(base2.shape)
base2.head(14)

(25, 15)


Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA2,,BA2-01,Aarón Ramirez,Morocco,M,20.0,,aaron ramirez,MOROCCO,M,20.0,,aaron ramirez,aaron ramirez
1,BA2,,BA2-02,Joan Perez,Mauritania,M,18.0,,joan perez,MAURITANIA,M,18.0,,joan perez,joan perez
2,BA2,,BA2-03,Marina Ruana,Nepal,F,44.0,,marina ruana,NEPAL,F,44.0,,marina ruana,marina ruana
3,BA2,,BA2-04,Carl Sorlovsky,Netherlands,M,50.0,,carl sorlovsky,NETHERLANDS,M,50.0,,carl sorlovsky,carl sorlovsky
4,BA2,,BA2-05,Mariano Hernandez,holand,M,78.0,,mariano hernandez,NETHERLANDS,M,78.0,,mariano hernandez,mariano hernandez
5,BA2,,BA2-06,Carla,Norway,F,10.0,,carla,NORWAY,F,10.0,,carla,carla
6,BA2,,BA2-07,John Buffalo,romania (stateless),,30.0,,john buffalo,ROMANIA,,30.0,,john buffalo,john buffalo
7,BA2,,BA2-02,Jose Perez,Mali,M,15.0,,jose perez,MALI,M,15.0,,jose perez,jose perez
8,BA2,,BA2-09,Alina Alvez,siera leone,F,,,alina alvez,SIERRA LEONE,F,,,alina alvez,alina alvez
9,BA2,,BA2-10,Inés Martinez,slovenia,F,42.0,,ines martinez,SLOVENIA,F,42.0,,ines martinez,ines martinez
