In [1]:
import pandas as pd
import numpy as np
import os

# Config

In [2]:
# Folders, relative to the current working directory, that hold the
# configuration tables and the dictionaries.
configPath, dictionariesPath = (
    os.path.join(os.getcwd(), folderName) for folderName in ('config', 'dicts')
)

In [3]:
# Input: the two instance tables that will be compared against each other.
inputPath = os.path.join(os.getcwd(), 'example/instances')

base1 = pd.read_csv(f"{inputPath}/inst1/base_inst1.csv")
base2 = pd.read_csv(f"{inputPath}/inst2/base_inst2.csv")

# Output: folder and base file name for the results.
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test, and it raises
# (instead of silently continuing) when the path exists but is not a folder.
outputPath = os.path.join(inputPath, 'matches')
os.makedirs(outputPath, exist_ok=True)

outputFileName = 'candidateList'

In [4]:
# Both configuration tables are indexed by the name of the variable they describe.
variableFields = pd.read_csv(f"{configPath}/variableFields.csv")
variableFields = variableFields.set_index('variable')

compatibleData = pd.read_csv(f"{configPath}/compatible_data.csv")
compatibleData = compatibleData.set_index('variable')

# One dict of options per variable, keyed by the variable name.
comparisonSettings = compatibleData.join(variableFields).to_dict(orient='index')

# Display the joined table for inspection.
compatibleData.join(variableFields)

Unnamed: 0_level_0,type,parameter,consider,na.action,fields
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Case.ID,Case.ID,,no,,Case.ID
cod,cod,,no,,cod
nationality,categorical,natEquivalences.csv,yes,all,Nat_PROC Nat_2_PROC
sex,categorical,,yes,all,Sex_PROC
age,range,6,yes,all,Age_PROC Age_2_PROC


In [28]:
# Scheme configuration tables.
schemesConfig = pd.read_csv(f"{configPath}/info_scheme.csv")
# TODO add option for "fast scheme"
schemes = pd.read_csv(f"{configPath}/select_schemes.csv")

schemesConfig

Unnamed: 0,scheme,listA.column,listB.column,threshold
0,A_A,Name_A,Name_A,0.3
1,F_F,Father_name_A,Father_name_A,0.0
2,M_M,Mother_name_A,Mother_name_A,0.0
3,AD_AD,Adress_A,Adress_A,0.0
4,Ph_Ph,Phone_1_PROC Phone_2_PROC Phone_3_PROC,Phone_1_PROC Phone_2_PROC Phone_3_PROC,0.0


# "Library"

In [48]:
# def equivalentNationalities(nat1, nat2):
#     return (nat1==nat2) or ((natEquivalences.NatFROM == nat1) & (natEquivalences.NatTO == nat2)).any() or ((natEquivalences.NatFROM == nat2) & (natEquivalences.NatTO == nat1)).any()

def isNaN(value):
    """Tell whether `value` represents a missing entry.

    Strings (including `str` subclasses such as `numpy.str_`) are missing
    only when they spell "nan" in any letter case. Every other value is
    delegated to `pandas.isna`, which recognises float NaN as well as
    `None`, `pd.NA` and `NaT` instead of raising `TypeError` the way
    `numpy.isnan` does for non-numeric objects.

    Parameters
    ----------
    value : object
        Scalar to test.

    Returns
    -------
    bool
        True when `value` is considered missing.
    """
    if isinstance(value, str):
        return value.upper() == 'NAN'
    return pd.isna(value)
    
def areEquivalentValues(val1, val2, equivalences):
    """Report whether the two values are listed as a pair in `equivalences`.

    The pair is looked up in both orders, so the container only needs to
    hold each equivalence once. `equivalences` is any container of
    2-tuples that supports the `in` operator.
    """
    forwardPair = (val1, val2)
    if forwardPair in equivalences:
        return True

    backwardPair = (val2, val1)
    return backwardPair in equivalences

In [59]:
def compatibleRanges(row1, row2, options):
    """Check that two rows agree, within a tolerance, on a numeric variable.

    Every field listed in `options['fields']` that is present in both rows
    must be compatible (logical AND over the fields).

    Parameters
    ----------
    row1, row2 : mapping-like (e.g. pandas.Series or dict)
        Records to compare. Fields absent from either record are skipped.
    options : dict
        'fields'    -- whitespace-separated names of the fields to compare.
        'parameter' -- maximum allowed absolute difference, converted with int().
        'na.action' -- 'all' (any letter case) makes a missing value on
                       either side count as compatible; anything else makes
                       it count as incompatible.

    Returns
    -------
    bool
        True when all the compared fields are compatible.
    """
    # Case-insensitive, like compatibleCategory; str() keeps a NaN setting
    # from crashing and simply reads it as "do not allow missing values".
    allowNa = str(options['na.action']).upper() == 'ALL'
    compatible = True

    for field in options['fields'].split():
        if field not in row1 or field not in row2:
            continue

        val1, val2 = row1[field], row2[field]
        # pd.isna also accepts None / object-typed values, where np.isnan raises.
        if pd.isna(val1) or pd.isna(val2):
            compatible &= allowNa
        else:
            # Both values are truncated to integers before being compared.
            compatible &= abs(int(val1) - int(val2)) <= int(options['parameter'])

    return compatible

# Equivalence tables already read from disk, keyed by file path, so that the
# same CSV is not parsed again for every pair of rows being compared.
_equivalencesCache = {}


def _loadEquivalences(fileName):
    """Return the rows of `configPath/fileName` as a list of tuples (cached)."""
    path = f"{configPath}/{fileName}"
    if path not in _equivalencesCache:
        equivalencesDF = pd.read_csv(path)
        # WARNING, there may be a better way to do this other than creating tuples but we should have a scheme
        # to follow, in order to grab the columns without having to specify their names.
        # NOTE(review): pairs are matched as 2-tuples, so this presumably
        # expects a file with exactly two columns -- confirm.
        _equivalencesCache[path] = list(equivalencesDF.itertuples(index=False, name=None))
    return _equivalencesCache[path]


def compatibleCategory(row1, row2, options):
    """Check that two rows agree on every field of a categorical variable.

    Two non-missing values are compatible when they are equal or, if the
    variable has an equivalence table, when the table lists them as a pair
    (in either order). Every field listed in `options['fields']` that is
    present in both rows must be compatible (logical AND over the fields).

    Parameters
    ----------
    row1, row2 : mapping-like (e.g. pandas.Series or dict)
        Records to compare. Fields absent from either record are skipped.
    options : dict
        'fields'    -- space-separated names of the fields to compare.
        'parameter' -- file name (inside `configPath`) of the equivalence
                       table, or a missing value when there is none.
        'na.action' -- 'all' (any letter case) makes a missing value on
                       either side count as compatible; anything else makes
                       it count as incompatible.

    Returns
    -------
    bool
        True when all the compared fields are compatible.
    """
    # str() keeps a NaN setting from crashing on .upper().
    allowNa = str(options['na.action']).upper() == 'ALL'

    equivalences = None
    if not isNaN(options['parameter']):
        equivalences = _loadEquivalences(options['parameter'])

    compatible = True
    for field in options['fields'].split(' '):
        if field not in row1 or field not in row2:
            continue

        val1, val2 = row1[field], row2[field]
        if isNaN(val1) or isNaN(val2):
            compatible &= allowNa
        elif val1 != val2:
            # Different values are only acceptable through the equivalence
            # table; without a table (or with an empty one) they are a mismatch.
            compatible &= bool(equivalences) and areEquivalentValues(val1, val2, equivalences)

    return compatible

# Script

## Find compatible rows

In [74]:
def areCompatibles(row1, row2):
    """Decide whether two rows are compatible on every considered variable.

    Walks `comparisonSettings` and, for each variable whose 'consider'
    option is "yes" (any letter case), applies the comparison that matches
    its 'type': `compatibleRanges` for 'range' and `compatibleCategory` for
    'categorical'. Variables of any other type do not restrict the result.
    The individual verdicts are combined with a logical AND.
    """
    verdict = True

    for _variable, settings in comparisonSettings.items():
        if settings['consider'].lower() != 'yes':
            continue

        kind = settings['type']
        if kind == 'categorical':
            variableMatches = compatibleCategory(row1, row2, settings)
        elif kind == 'range':
            variableMatches = compatibleRanges(row1, row2, settings)
        else:
            variableMatches = True

        verdict = verdict & variableMatches

    return verdict

# Compatibility matrix: one row per base1 record, one column per base2 record.
compatibles = base1.apply(
    lambda row1: base2.apply(
        lambda row2: areCompatibles(row1, row2),
        axis='columns',
    ),
    axis='columns',
)
# TODO this compares every row against every other row; we could keep the
# indexes and check only against the rows not checked yet, but for that we may
# need iterrows, which is more expensive and usually frowned upon.

# Number of compatible base2 rows for each base1 row.
compatibles.sum(axis='columns')

0    0
1    0
2    1
3    0
4    0
5    0
dtype: int64

In [79]:
# Create compatible groups from the `compatibles` matrix

## Use the schemes configured on the compatible groups

# Only for checking

In [76]:
# Quick look at the first instance: its dimensions, then its first two rows.
print(base1.shape)
base1.head(n=2)

(6, 15)


Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA1,,BA1-01,Jorge Perez,Denmark,M,11,Missing,jorge perez,DENMARK,M,11,missing,jorge perez,jorge perez
1,BA1,,BA1-05,Oscar Alvarez,Jordan,M,44,Missing,oscar alvarez,JORDAN,M,44,missing,oscar alvarez,oscar alvarez


In [78]:
# Quick look at the second instance: its dimensions, then its first three rows.
print(base2.shape)
base2.head(n=3)

(25, 15)


Unnamed: 0,cod,orden.base,Case.ID,Name_1,Nat,Sex,Age,Rol,Name_1_PROC,Nat_PROC,Sex_PROC,Age_PROC,Rol_PROC,Name_A,Name_B
0,BA2,,BA2-01,Aarón Ramirez,Morocco,M,20,,aaron ramirez,MOROCCO,M,20.0,,aaron ramirez,aaron ramirez
1,BA2,,BA2-02,Jprgw Perez,Mauritania,M,18,,jprgw perez,MAURITANIA,M,18.0,,jprgw perez,jprgw perez
2,BA2,,BA2-03,Marina Ruana,Nepal,F,44,,marina ruana,NEPAL,F,44.0,,marina ruana,marina ruana
