In [21]:
import pandas as pd
import numpy as np
import os

In [2]:
# Folders holding the configuration files and the dictionaries, both resolved
# against the current working directory.
configPath, dictionariesPath = (
    os.path.join(os.getcwd(), folderName) for folderName in ('config', 'dicts')
)

In [3]:
# Input: the two instance tables that will be compared against each other.
inputPath = os.path.join(os.getcwd(), 'example/instances')

base1 = pd.read_csv(os.path.join(inputPath, 'inst1', 'base_inst1.csv'))
base2 = pd.read_csv(os.path.join(inputPath, 'inst2', 'base_inst2.csv'))

# Output: results go to a 'matches' folder next to the inputs.
outputPath = os.path.join(inputPath, 'matches')
# exist_ok=True creates the folder only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outputPath, exist_ok=True)

outputFileName = 'candidateList'

In [85]:
# Load both per-variable configuration tables and key each one by its
# 'variable' column so they can be joined on it.
variableFields = pd.read_csv(f"{configPath}/variableFields.csv")
variableFields = variableFields.set_index('variable')

compatibleData = pd.read_csv(f"{configPath}/compatible_data.csv")
compatibleData = compatibleData.set_index('variable')

# One entry per variable: {variable: {column: value, ...}} built from the joined table.
comparisonSettings = compatibleData.join(variableFields).to_dict(orient='index')

# Display the joined table the settings were built from.
compatibleData.join(variableFields)

Unnamed: 0_level_0,type,parameter,consider,na.action,fields
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Case.ID,Case.ID,,no,,Case.ID
cod,cod,,no,,cod
nationality,categorical,natEquivalences.csv,yes,all,Nat_PROC Nat_2_PROC
sex,categorical,,yes,all,Sex_PROC
age,range,6,yes,all,Age_PROC Age_2_PROC


In [88]:
# Load the nationality equivalence table from the config folder and display it.
natEquivalences = pd.read_csv(f"{configPath}/natEquivalences.csv")
natEquivalences

Unnamed: 0,NatFROM,NatTO
0,MALI,MAURITANIA


In [75]:
# Pick one record from each table (index label 1 of base1, label 2 of base2)
# to try the comparison functions on.
base1rowExample, base2rowExample = base1.loc[1], base2.loc[2]

# Single call that emits the same text as three consecutive prints:
# first record, the '\nother\n' separator, second record.
print(base1rowExample, '\nother\n', base2rowExample, sep='\n')

cod                      BA1
orden.base               NaN
Case.ID               BA1-05
Name_1         Oscar Alvarez
Nat                   Jordan
Sex                        M
Age                       44
Rol                  Missing
Name_1_PROC    oscar alvarez
Nat_PROC              JORDAN
Sex_PROC                   M
Age_PROC                  44
Rol_PROC             missing
Name_A         oscar alvarez
Name_B         oscar alvarez
Name: 1, dtype: object

other

cod                     BA2
orden.base              NaN
Case.ID              BA2-03
Name_1         Marina Ruana
Nat                   Nepal
Sex                       F
Age                      44
Rol                     NaN
Name_1_PROC    marina ruana
Nat_PROC              NEPAL
Sex_PROC                  F
Age_PROC                 44
Rol_PROC                NaN
Name_A         marina ruana
Name_B         marina ruana
Name: 2, dtype: object


In [99]:
def compatibleRanges(row1, row2, options):
    """Check that two records agree, within a tolerance, on the numeric fields of a variable.

    Parameters
    ----------
    row1, row2 : pandas.Series or other mapping
        The two records to compare; values are looked up by field name.
    options : dict
        Settings of the variable. Keys read here:

        * ``'fields'``: whitespace-separated names of the fields to compare.
        * ``'parameter'``: maximum allowed absolute difference (anything
          ``float()`` accepts, e.g. ``6`` or ``'6'``).
        * ``'na.action'``: when equal to ``'all'``, a missing value on either
          side counts as compatible; any other value makes it incompatible.

    Returns
    -------
    bool
        True when every field that exists in both records is compatible.
        Fields absent from either record are reported and skipped.
    """
    compatible = True  # need to be compatible on all available fields (can be changed for 'or')
    allowNa = options['na.action'] == 'all'
    tolerance = float(options['parameter'])

    for field in options['fields'].split():
        print(f"\tchecking field {field}")

        # The field must exist in BOTH records to be comparable; testing only
        # "missing from both" would raise KeyError when one record lacks it.
        if field not in row1 or field not in row2:
            print(f"\t{field} not found")
            continue

        value1, value2 = row1[field], row2[field]

        # pd.isna also copes with None and non-float values, unlike np.isnan.
        if pd.isna(value1) or pd.isna(value2):
            print("\tna.action")
            compatible = compatible and allowNa
        else:
            print(f"\tboth have field {field}")
            # float() keeps fractional values instead of truncating them.
            withinRange = abs(float(value1) - float(value2)) <= tolerance
            compatible = compatible and withinRange

    return compatible

def _isMissingCategory(value):
    """Return True when a categorical value is absent (None/NaN or the literal string 'NaN')."""
    return bool(pd.isna(value)) or value == 'NaN'


def compatibleCategory(row1, row2, options):
    """Check that two records agree on the categorical fields of a variable.

    Two values are compatible when they are equal or when they are listed as
    an equivalent pair in the optional equivalence file.

    Parameters
    ----------
    row1, row2 : pandas.Series or other mapping
        The two records to compare; values are looked up by field name.
    options : dict
        Settings of the variable. Keys read here:

        * ``'fields'``: whitespace-separated names of the fields to compare.
        * ``'parameter'``: name of a CSV file of equivalent values, read from
          the module-level ``configPath`` folder; missing (NaN) when the
          variable has no equivalences.
        * ``'na.action'``: when equal to ``'all'``, a missing value on either
          side counts as compatible; any other value makes it incompatible.

    Returns
    -------
    bool
        True when every field that exists in both records is compatible.
        Fields absent from either record are reported and skipped.
    """
    compatible = True  # need to be compatible on all available fields (can be changed for 'or')
    allowNa = options['na.action'] == 'all'

    # Load the equivalence pairs only when a file name was actually configured.
    equivalentPairs = set()
    equivalenceFile = options['parameter']
    if not _isMissingCategory(equivalenceFile):
        equivalences = pd.read_csv(f"{configPath}/{equivalenceFile}")
        # NOTE(review): assumes the first two columns hold the equivalent
        # values (NatFROM / NatTO in natEquivalences.csv) and that the
        # relation is symmetric - confirm against the config format.
        for left, right in equivalences.iloc[:, :2].itertuples(index=False, name=None):
            equivalentPairs.add((left, right))
            equivalentPairs.add((right, left))

    for field in options['fields'].split():
        print(f"\tchecking field {field}")

        # The field must exist in BOTH records to be comparable; testing only
        # "missing from both" would raise KeyError when one record lacks it.
        if field not in row1 or field not in row2:
            print(f"\t{field} not found")
            continue

        value1, value2 = row1[field], row2[field]

        # Real missing values arrive as NaN/None, not as the string 'NaN'.
        if _isMissingCategory(value1) or _isMissingCategory(value2):
            print("\tna.action")
            compatible = compatible and allowNa
        else:
            print(f"\tboth have field {field}")
            sameCategory = bool(value1 == value2) or (value1, value2) in equivalentPairs
            compatible = compatible and sameCategory

    return compatible
    

In [101]:
candidates = []

compatible = True

# Comparison function to use for each supported variable type.
comparators = {'range': compatibleRanges, 'categorical': compatibleCategory}

for variable, options in comparisonSettings.items():
    # Variables not flagged for consideration take no part in the decision.
    if options['consider'] != 'yes':
        continue

    comparator = comparators.get(options['type'])
    # A type without a comparison function is treated as compatible.
    if comparator is None:
        varCompatible = True
    else:
        varCompatible = comparator(base1rowExample, base2rowExample, options)

    compatible &= varCompatible
    print(f"{variable} compatible? {varCompatible}")

print(f"\nCandidates? {compatible}")


	checking field Nat_PROC
	both have field Nat_PROC
	checking field Nat_2_PROC
	Nat_2_PROC not found
nationality compatible? False
	checking field Sex_PROC
	both have field Sex_PROC
sex compatible? False
	checking field Age_PROC
	both have field Age_PROC
	checking field Age_2_PROC
	Age_2_PROC not found
age compatible? True

Candidates? False


## ---

In [26]:
# Preview the first four rows of the first input table.
base1.head(4)

Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA1,,BA1-01,Jorge Perez,Denmark,M,11,Missing,jorge perez,DENMARK,M,11,missing,jorge perez,jorge perez
1,BA1,,BA1-05,Oscar Alvarez,Jordan,M,44,Missing,oscar alvarez,JORDAN,M,44,missing,oscar alvarez,oscar alvarez
2,BA1,,BA1-06,Juan Perez,Mali,,15,Missing,juan perez,MALI,,15,missing,juan perez,juan perez
3,BA1,,BA1-07,Maria Urcupiña,Peru,F,18,Missing,maria urcupiña,PERU,F,18,missing,maria urcupiña,maria urcupiña


In [27]:
# Preview the first four rows of the second input table.
base2.head(4)

Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA2,,BA2-01,Aarón Ramirez,Morocco,M,20,,aaron ramirez,MOROCCO,M,20.0,,aaron ramirez,aaron ramirez
1,BA2,,BA2-02,Juan Bocacalle,marocco,M,23,,juan bocacalle,MOROCCO,M,23.0,,juan bocacalle,juan bocacalle
2,BA2,,BA2-03,Marina Ruana,Nepal,F,44,,marina ruana,NEPAL,F,44.0,,marina ruana,marina ruana
3,BA2,,BA2-04,Carl Sorlovsky,Netherlands,M,50,,carl sorlovsky,NETHERLANDS,M,50.0,,carl sorlovsky,carl sorlovsky
