In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directories holding the configuration files and the dictionaries,
# both resolved against the current working directory.
configPath, dictionariesPath = (
    os.path.join(os.getcwd(), folder) for folder in ('config', 'dicts')
)

In [29]:
# Input: the two instance tables to be compared, read from
# <working dir>/example/instances/inst{1,2}/.
inputPath = os.path.join(os.getcwd(), 'example/instances')

base1 = pd.read_csv(f"{inputPath}/inst1/base_inst1.csv")
base2 = pd.read_csv(f"{inputPath}/inst2/base_inst2.csv")

# Output: results go to a 'matches' folder next to the inputs.
outputPath = os.path.join(inputPath, 'matches')
# exist_ok=True creates the folder only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outputPath, exist_ok=True)

outputFileName = 'candidateList'

In [30]:
# Per-variable configuration tables, both indexed by the 'variable' column.
variableFields = (
    pd.read_csv(f"{configPath}/variableFields.csv")
    .set_index('variable')
)
compatibleData = (
    pd.read_csv(f"{configPath}/compatible_data.csv")
    .set_index('variable')
)

# One options dict per variable: {variable: {column: value, ...}}.
comparisonSettings = (
    compatibleData
    .join(variableFields)
    .to_dict(orient='index')
)

# Display the joined configuration table.
compatibleData.join(variableFields)

Unnamed: 0_level_0,type,parameter,consider,na.action,fields
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Case.ID,Case.ID,,no,,Case.ID
cod,cod,,no,,cod
nationality,categorical,natEquivalences.csv,yes,all,Nat_PROC Nat_2_PROC
sex,categorical,,yes,all,Sex_PROC
age,range,6,yes,all,Age_PROC Age_2_PROC


In [31]:
# Load the equivalence table from the config directory; equivalentNationalities
# reads its NatFROM / NatTO columns. The bare name on the last line displays it.
natEquivalences = pd.read_csv(f"{configPath}/natEquivalences.csv")
natEquivalences

Unnamed: 0,NatFROM,NatTO
0,MALI,MAURITANIA


In [75]:
# Pick one record from each table (by index label) to use as a worked example.
base1rowExample = base1.loc[2]
base2rowExample = base2.loc[1]

# Print both records, separated by an 'other' marker line.
print(base1rowExample, '\nother\n', base2rowExample, sep='\n')

cod                   BA1
orden.base            NaN
Case.ID            BA1-06
Name_1         Juan Perez
Nat                  Mali
Sex                   NaN
Age                    15
Rol               Missing
Name_1_PROC    juan perez
Nat_PROC             MALI
Sex_PROC              NaN
Age_PROC               15
Rol_PROC          missing
Name_A         juan perez
Name_B         juan perez
Name: 2, dtype: object

other

cod                    BA2
orden.base             NaN
Case.ID             BA2-02
Name_1         Jprgw Perez
Nat             Mauritania
Sex                      M
Age                     18
Rol                    NaN
Name_1_PROC    jprgw perez
Nat_PROC        MAURITANIA
Sex_PROC                 M
Age_PROC                18
Rol_PROC               NaN
Name_A         jprgw perez
Name_B         jprgw perez
Name: 1, dtype: object


In [54]:
def equivalentNationalities(nat1, nat2):
    """Tell whether two values are equal or listed as equivalent.

    Two values are equivalent when they compare equal, or when the
    module-level ``natEquivalences`` table has a row pairing them through its
    ``NatFROM`` / ``NatTO`` columns, in either direction.
    """
    identical = nat1 == nat2
    if identical:
        return identical

    origins = natEquivalences.NatFROM
    targets = natEquivalences.NatTO

    # Pair listed as nat1 -> nat2.
    forward = ((origins == nat1) & (targets == nat2)).any()
    if forward:
        return forward

    # Pair listed as nat2 -> nat1.
    return ((origins == nat2) & (targets == nat1)).any()

def isNaN(value):
    """Return True when ``value`` represents a missing value.

    Parameters
    ----------
    value : object
        A string is considered missing when it spells 'nan' in any letter
        case. Any other value is checked with ``pd.isna``, which also accepts
        ``None`` and non-numeric objects (``np.isnan`` raises ``TypeError``
        on those).

    Returns
    -------
    bool
        For scalar input.
    """
    if isinstance(value, str):
        return value.upper() == 'NAN'
    return pd.isna(value)

In [70]:
def compatibleRanges(row1, row2, options):
    """Check that two records agree, within a tolerance, on every shared numeric field.

    Parameters
    ----------
    row1, row2 : mapping-like (e.g. pandas Series or dict)
        Records to compare; must support ``field in row`` and ``row[field]``.
    options : dict
        Settings for the variable. Reads:
        ``'fields'``    - whitespace-separated field names to compare;
        ``'parameter'`` - maximum allowed absolute difference (converted with ``int``);
        ``'na.action'`` - when it equals 'all' (any letter case), a missing
        value on either side counts as compatible.

    Returns
    -------
    bool
        True when every field present in both records is compatible.
    """
    compatible = True  # need to be compatible on all available fields (can be changed for 'or')
    # str() keeps this safe when the setting itself is missing (NaN float).
    allowNa = str(options['na.action']).upper() == 'ALL'

    for field in options['fields'].split():
        # A field can only be compared when BOTH records carry it; indexing a
        # record that lacks it would raise KeyError.
        if field not in row1 or field not in row2:
            continue

        value1, value2 = row1[field], row2[field]
        if pd.isna(value1) or pd.isna(value2):
            compatible = compatible and allowNa
        else:
            difference = abs(int(value1) - int(value2))
            compatible = compatible and difference <= int(options['parameter'])

    return compatible

def compatibleCategory(row1, row2, options):
    """Check that two records agree on every shared categorical field.

    Parameters
    ----------
    row1, row2 : mapping-like (e.g. pandas Series)
        Records to compare; must support ``field in row`` and ``row[field]``.
    options : dict
        Settings for the variable. Reads:
        ``'fields'``    - whitespace-separated field names to compare;
        ``'na.action'`` - when it equals 'all' (any letter case), a missing
        value on either side counts as compatible.

    Returns
    -------
    bool
        True when every field present in both records is compatible.

    Notes
    -----
    Values are compared with ``equivalentNationalities``, which uses the
    module-level equivalence table.
    NOTE(review): ``options['parameter']`` is not consulted here. The previous
    code tried to load it as a CSV, but only when it was the literal string
    'NaN', and never used the result; that dead load was removed. Confirm
    whether per-variable equivalence files are intended.
    """
    compatible = True  # need to be compatible on all available fields (can be changed for 'or')
    # str() keeps this safe when the setting itself is missing (NaN float).
    allowNa = str(options['na.action']).upper() == 'ALL'

    for field in options['fields'].split():
        # A field can only be compared when BOTH records carry it; indexing a
        # record that lacks it would raise KeyError.
        if field not in row1 or field not in row2:
            continue

        if isNaN(row1[field]) or isNaN(row2[field]):
            compatible = compatible and allowNa
        else:
            compatible = compatible and bool(equivalentNationalities(row1[field], row2[field]))

    return compatible

In [71]:
candidates = []

# Overall verdict for the example pair: every considered variable must agree.
compatible = True

for variable, options in comparisonSettings.items():
    # Only variables flagged with consider == 'yes' take part in the comparison.
    if options['consider'] != 'yes':
        continue

    if options['type'] == 'range':
        varCompatible = compatibleRanges(base1rowExample, base2rowExample, options)
    elif options['type'] == 'categorical':
        varCompatible = compatibleCategory(base1rowExample, base2rowExample, options)
    else:
        # Any other type is treated as compatible.
        varCompatible = True

    compatible &= varCompatible
    print(f"{variable} compatible? {varCompatible}")

print(f"\nCandidates? {compatible}")


nationality compatible? True
sex compatible? True
age compatible? True

Candidates? True


## ---

In [73]:
# Preview the first two rows of base1.
base1.head(2)

Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA1,,BA1-01,Jorge Perez,Denmark,M,11,Missing,jorge perez,DENMARK,M,11,missing,jorge perez,jorge perez
1,BA1,,BA1-05,Oscar Alvarez,Jordan,M,44,Missing,oscar alvarez,JORDAN,M,44,missing,oscar alvarez,oscar alvarez


In [74]:
# Preview the first three rows of base2.
base2.head(3)

Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA2,,BA2-01,Aarón Ramirez,Morocco,M,20,,aaron ramirez,MOROCCO,M,20.0,,aaron ramirez,aaron ramirez
1,BA2,,BA2-02,Jprgw Perez,Mauritania,M,18,,jprgw perez,MAURITANIA,M,18.0,,jprgw perez,jprgw perez
2,BA2,,BA2-03,Marina Ruana,Nepal,F,44,,marina ruana,NEPAL,F,44.0,,marina ruana,marina ruana
