# **Automatic Deduplication**

Deduplication of the SIVEP-GRIPE records stored in the warehouse, using trained ML models to classify candidate pairs.<br> 

# **Lib**

In [2]:
import os
import sys
# Make the parent of the current working directory importable
# (presumably where the `epimonitor` package lives — confirm).
parent_dir = os.path.dirname(os.path.abspath(''))
sys.path.append(parent_dir)

In [3]:
import json
import glob
import shutil
import joblib
import zipfile
from tqdm import tqdm
import numpy as np
import pandas as pd
import datetime as dt
from simpledbf import Dbf5

import matplotlib.pyplot as plt

PyTables is not installed. No support for HDF output.


In [4]:
from epimonitor import WarehouseSUS
from epimonitor.process_layer import ProcessSivep
from epimonitor.data_matching import Deduple

# **Database Connection**

In [8]:
# Resolve the user's home directory. HOMEPATH only exists on Windows, so fall back to
# os.path.expanduser("~") when it is missing instead of failing with a KeyError.
home_dir = os.environ.get("HOMEPATH") or os.path.expanduser("~")

# Root of the local data folder, and the SIVEP-GRIPE sub-folder used for the trained models.
datapath = os.path.join(home_dir, "Documents", "data")
basepath = os.path.join(datapath, "SIVEP-GRIPE")

# SQLite file backing the warehouse.
dbpath = os.path.join(datapath, "DATASUS_WAREHOUSE", "datasus_pessoas.db") # it shouldn't be here in this script
engine_url = f"sqlite:///{dbpath}"

# Open the warehouse; `engine` is whatever db_init() returns (presumably a SQLAlchemy engine — confirm).
warehouse = WarehouseSUS(engine_url)
engine = warehouse.db_init()

In [9]:
# List the table names known to the warehouse object.
available_tables = list(warehouse.tables.keys())
print(f"Tables: \n{available_tables}")

Tables: 
['sivep_gripe', 'label_sivep_gripe']


In [10]:
# Row count of the SIVEP-GRIPE records table.
sivep_table = 'sivep_gripe'
nrecords = warehouse.number_of_records(sivep_table)
print(f"Total of records stored: {nrecords}")

Total of records stored: 100362


In [34]:
# Row count of the table holding the already-classified pairs.
label_table = 'label_sivep_gripe'
nrecords = warehouse.number_of_records(label_table)
print(f"Total of records stored: {nrecords}")

Total of records stored: 3631743


In [8]:
# -- delete table
# NOTE(review): destructive call, kept commented out. If it is ever re-enabled, the authkey
# should presumably be read from the environment rather than written as a literal here.
#warehouse.delete_table('label_sivep_gripe', is_sure=True, authkey="###!Y!.")

In [35]:
# Count the stored pairs whose model-3 "negative" probability is below 0.50.
below_half_negative = warehouse.query_where(
    'label_sivep_gripe',
    value=0.50,
    colname="PROBA_NEGATIVO_MODELO_3",
    condition='smaller',
)
len(below_half_negative)

8685

# **Deduplication**

In [12]:
# Pairs that already carry a label in the warehouse; only the two id columns are kept.
query_pairs = pd.DataFrame(warehouse.query_all(table_name='label_sivep_gripe'))
# Fall back to an empty list when nothing has been labelled yet.
query_pairs = query_pairs.loc[:, ["ID1", "ID2"]].copy() if len(query_pairs) > 0 else []

In [13]:
# Peek at the first five already-labelled pairs. Plain slicing is used because
# `query_pairs` is either a DataFrame or the empty-list fallback at this point.
query_pairs[:5]

Unnamed: 0,ID1,ID2
0,315968354539,31644621350086
1,316361280151,31649265573775
2,315894600446,315913747766
3,316179890440,316197135821
4,31655020136248,31659449463012


In [14]:
# Turn the (ID1, ID2) frame into plain tuples; the empty-list fallback is left untouched.
if len(query_pairs) > 0:
    query_pairs = [*query_pairs.itertuples(index=False, name=None)]

In [15]:
# Notification window: from 2020-01-01 up to the moment this cell is executed.
period = (dt.datetime(2020, 1, 1), dt.datetime.today())

# Sampling knobs, named so they can be tuned in one place.
# sample_seed=None keeps the original behaviour (a different random half on every run);
# set an integer to reproduce a given run exactly.
sample_frac = 0.5
sample_seed = None

period_records = pd.DataFrame(
    warehouse.query_period(table_name='sivep_gripe', date_col="DATA_NOTIFICACAO", period=period)
)
query_data = period_records.sample(frac=sample_frac, random_state=sample_seed)

In [16]:
# DataFrame.info() writes its summary to stdout and returns None, so it must not be
# wrapped in print() (that appends a stray "None" line to the output).
query_data.info()
# NOTE(review): the sampled rows contain identifiable patient data — clear this output before sharing.
query_data.sample(n=4)

<class 'pandas.core.frame.DataFrame'>
Index: 49748 entries, 5769 to 67633
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   ID_SIVEP              49748 non-null  object        
 1   DATA_NOTIFICACAO      49748 non-null  datetime64[ns]
 2   NOME_PACIENTE         49748 non-null  object        
 3   DATA_NASCIMENTO       49702 non-null  datetime64[ns]
 4   SEXO                  49748 non-null  object        
 5   NOME_MAE              48415 non-null  object        
 6   LOGRADOURO            48600 non-null  object        
 7   LOGRADOURO_NUMERO     46079 non-null  object        
 8   BAIRRO_RESIDENCIA     46975 non-null  object        
 9   MUNICIPIO_RESIDENCIA  49747 non-null  object        
 10  CEP                   29513 non-null  object        
 11  CNS                   12319 non-null  object        
 12  CPF                   26597 non-null  object        
 13  CNES              

Unnamed: 0,ID_SIVEP,DATA_NOTIFICACAO,NOME_PACIENTE,DATA_NASCIMENTO,SEXO,NOME_MAE,LOGRADOURO,LOGRADOURO_NUMERO,BAIRRO_RESIDENCIA,MUNICIPIO_RESIDENCIA,CEP,CNS,CPF,CNES,DATA_SINTOMAS,CLASSIFICACAO_FINAL,CRITERIO,EVOLUCAO,CRIADO_EM,ATUALIZADO_EM
41053,316167898058,2021-03-26,GILBERTO GOMES DA SILVA,1966-03-24,M,ROSA GONCALVES DA SILVA,TATIANA,38,VILA VELHA,230440,60349385.0,706008311728244.0,,2529149,2021-03-23,5.0,1.0,1.0,2023-09-05 13:16:15.183919,2023-09-05 13:16:15.183919
28654,316118549218,2021-01-12,KEVIN ERIK SILVA LOIOLA PINHEIRO,2012-12-13,M,MARIA ERIDAM SILVA LOIOLA,RUA FRANCISCO PASTOR,136,CARAPIOCA,230625,,,,2563681,2021-01-11,4.0,1.0,1.0,2023-09-05 13:16:12.960028,2023-09-05 13:16:12.960028
38711,316160338422,2021-03-17,JOSE WELTON ANGELO SOUSA,1983-04-21,M,ANTONIA MARIA ANGELO SOUSA,RUA CARNEIRO DA CUNHA,477,,230440,,,,7434308,2021-03-11,,,,2023-09-05 13:16:14.668067,2023-09-05 13:16:14.668067
84566,31654476667626,2022-06-05,DEIVID LEVY DA SILVA BEZERRA,2022-02-13,M,SAMIRA DA SILVA BRITO,RUA ESTUDANTE JUCA,2,SERINHA,230440,,,12417265311.0,2526638,2022-06-04,,,,2023-09-05 13:16:55.338362,2023-09-05 13:16:55.338362


## **Process Sivep**

In [17]:
# Wrap the queried records in the SIVEP processor; 'ID_SIVEP' is passed as the second
# argument (presumably the record-id column — confirm against ProcessSivep).
processor = ProcessSivep(query_data, 'ID_SIVEP')

In [18]:
# Run both standardization steps through method chaining; the result is read back
# from `processor.data` in the next cell.
processor.basic_standardize().specific_standardize()

In [19]:
# Take a copy of the processor's frame so later cells work on their own object.
processed_data = processor.data.copy()
# NOTE(review): the sampled rows contain identifiable patient data — clear this output before sharing.
processed_data.sample(n=4)

Unnamed: 0,ID_SIVEP,NOME_PACIENTE,NOME_MAE,DATA_NASCIMENTO,primeiro_nome,complemento_nome,primeiro_nome_mae,complemento_nome_mae,nascimento_dia,nascimento_mes,...,FONETICA_N,norm_primeiro_nome,rank_primeiro_nome,norm_primeiro_nome_mae,rank_primeiro_nome_mae,sexo,cpf,cns,bairro,cep
25633,316415792398,BENONY CUNHA GALVAO,MARIANA ALVES DA CUNHA,1948-08-14,BENONY,CUNHA GALVAO,MARIANA,ALVES DA CUNHA,14.0,8.0,...,BENONYGALVAO,4e-05,2,0.001648,4,M,,708200129598745.0,PANAMERICANO,
42338,315980078574,FRANCISCO CLECIO DE SOUZA,MARIA CILEIDE DE SOUZA,1977-02-10,FRANCISCO,CLECIO DE SOUZA,MARIA,CILEIDE DE SOUZA,10.0,2.0,...,FRANCISCOSOUZA,0.068385,6,0.320576,7,M,75642786334.0,,SAO BENTO,60875135.0
14084,316315394449,MARIA EDILMA DA SILVA CASTRO,MARIA PEDRO DA SILVA,1936-08-15,MARIA,EDILMA DA SILVA CASTRO,MARIA,PEDRO DA SILVA,15.0,8.0,...,MARIACASTRO,0.148468,7,0.320576,7,F,46965610387.0,,ITAPERI,60714390.0
19646,316269814434,OSMARINO RODRIGUES DE SOUSA,QUERINA LOIOLA MAIA SOUSA,1942-03-07,OSMARINO,RODRIGUES DE SOUSA,QUERINA,LOIOLA MAIA SOUSA,7.0,3.0,...,OSMARINOSOUSA,4e-05,2,4e-05,2,M,26631512387.0,700004398455803.0,CENTRO,63660000.0


**Top 10 most frequent first names (patient and mother)**

In [20]:
# First ten rows of the first-name frequency table built by the processor.
processor.freq_names[:10]

Unnamed: 0,primeiro_nome,norm_primeiro_nome,rank_primeiro_nome,primeiro_nome_mae,norm_primeiro_nome_mae,rank_primeiro_nome_mae
0,MARIA,0.148468,7,MARIA,0.320576,7
1,FRANCISCO,0.068385,6,FRANCISCA,0.076727,6
2,JOSE,0.06354,6,RAIMUNDA,0.033408,6
3,ANTONIO,0.032142,6,ANTONIA,0.029489,6
4,JOAO,0.025991,6,ANA,0.029227,6
5,ANA,0.02587,6,RITA,0.010191,5
6,FRANCISCA,0.025147,6,LUIZA,0.008322,5
7,RAIMUNDO,0.015357,5,JOANA,0.007739,5
8,PEDRO,0.012201,5,JOSEFA,0.007116,5
9,ANTONIA,0.011016,5,ROSA,0.00603,5


## **Deduple**

**Load classifiers**

In [21]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted location.
models_dir = os.path.join(basepath, "TRAINED_MODELS")

gbt_path = os.path.join(models_dir, "GRADBOOST_SIVEP04SET2023.joblib")
rnf_path = os.path.join(models_dir, "RANDFOREST_SIVEP04SET2023.joblib")
lgt_path = os.path.join(models_dir, "LOGITREG_SIVEP04SET2023.joblib")

gbt_model = joblib.load(gbt_path)
rnf_model = joblib.load(rnf_path)
lgt_model = joblib.load(lgt_path)

**Prepare similarity matrix**

In [22]:
#dedupe_path = os.path.join(datapath, "SIVEP-GRIPE", "TRAINING_DATA_CLASSIFIER", "DEDUPE_FILES")

deduple = Deduple(processed_data, left_id="ID_SIVEP", env_folder=None)

# Field -> comparison spec. Key order is preserved on purpose: it presumably fixes the
# column order of the comparison matrix that is later fed positionally to the models.
exact_fields = (
    "cns", "cep", "cpf", "sexo",
    "nascimento_dia", "nascimento_mes", "nascimento_ano",
)
string_fields = (
    "primeiro_nome_mae", "complemento_nome_mae",
    "primeiro_nome",
    "complemento_nome", "bairro",
)
map_compare = {field: ["exact"] for field in exact_fields}
map_compare.update((field, ["string", None]) for field in string_fields)

# Configure the comparisons, then build the candidate pairs using "FONETICA_N" with window=3.
# Left as the last expression so the notebook still displays the returned object.
(
    deduple
    .set_linkage(map_compare, string_method="damerau_levenshtein")
    .define_pairs("FONETICA_N", window=3)
)

Number of pairs: 1665947


<epimonitor.data_matching.matching_data.Deduple at 0x1dcd5215490>

In [23]:
#index0 = [(1,2), (1,3), (3,1)]
#pd.MultiIndex.from_tuples(index0, names=["ID_SIVEP_1", "ID_SIVEP_2"])

#list({*map(tuple, map(sorted, index0))})

#self.candidate_pairs = pd.MultiIndex.from_tuples(  )
#list(deduple.candidate_pairs)

In [24]:
# -- remove pairs that were already classified before
deduple.candidate_pairs = deduple.candidate_pairs.drop(query_pairs, errors='ignore')
print(deduple.candidate_pairs.shape[0])
deduple.candidate_pairs[:10]

1053613


MultiIndex([(  '316420850322', '31665084311594'),
            (  '316158191999',   '316333573427'),
            (  '316192848973', '31657550865734'),
            (  '315912761636', '31642618704284'),
            (  '315881342251',   '316309277782'),
            ('31670592168274', '31679951223873'),
            (  '316179110346',   '316420696523'),
            (  '315886168941',   '315960305534'),
            (  '315862891614',   '316268880129'),
            (  '315978330035',   '316182446462')],
           names=['ID_SIVEP_1', 'ID_SIVEP_2'])

In [25]:
# Compute the comparison matrix for the remaining candidate pairs, with the same
# "FONETICA_N" / window=3 arguments used for define_pairs and a 0.60 threshold
# (presumably the string-similarity cut-off — confirm against Deduple.perform_linkage).
deduple.perform_linkage("FONETICA_N", window=3, threshold=0.60)

In [26]:
# Attach the name-frequency ranks of the first record of each pair to the comparison matrix.
rank_columns = ["rank_primeiro_nome", "rank_primeiro_nome_mae"]
ranks = processed_data.set_index('ID_SIVEP')[rank_columns]

# Drop rank columns left by a previous execution of this cell. Without this, re-running
# the cell merges them a second time (creating `_x`/`_y` suffixed columns) and the
# fillna line below raises a KeyError.
similarity = deduple.comparison_matrix.drop(columns=rank_columns, errors="ignore")
similarity = similarity.merge(ranks, left_on=["ID_SIVEP_1"], right_index=True, how="left")

# Pairs whose first record has no mother-name rank get 7
# (presumably the top rank bucket, as seen for "MARIA" — confirm).
similarity["rank_primeiro_nome_mae"] = similarity["rank_primeiro_nome_mae"].fillna(7)

deduple._comparison_matrix = similarity

In [27]:
# Spot-check five random rows of the comparison matrix, including the rank columns added above.
deduple.comparison_matrix.sample(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,cns,cep,cpf,sexo,nascimento_dia,nascimento_mes,nascimento_ano,primeiro_nome_mae,complemento_nome_mae,primeiro_nome,complemento_nome,bairro,rank_primeiro_nome,rank_primeiro_nome_mae
ID_SIVEP_1,ID_SIVEP_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
316021772672,316111706330,0,0,0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,7,4
316178895123,31663867763817,0,0,0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,5,4
315889495649,31682804060221,0,0,0,1,0,0,0,1.0,0.0,1.0,0.0,0.0,7,7
315910401539,316178919079,0,0,0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,7,6
316139193334,31643032112118,0,0,0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,7,6


**Classify pairs**

In [28]:
# Flatten the (ID_SIVEP_1, ID_SIVEP_2) index into ordinary columns.
matrix_pairs = deduple.comparison_matrix.reset_index().copy()

# First two columns: the pair ids. Taken as an explicit copy because new columns are
# assigned to `pair_ids` later; a bare iloc slice would trigger SettingWithCopyWarning.
pair_ids = matrix_pairs.iloc[:, :2].copy()

# Remaining columns: the feature matrix, selected by position — the column order must
# match the order the classifiers were trained with.
X_sel = matrix_pairs.iloc[:, 2:].values

In [29]:
batchsize = 6000
Y_neg1, Y_neg2, Y_neg3 = [], [], []

# Split points strictly inside the matrix. Stopping at n_pairs (not n_pairs + 1) avoids an
# empty trailing batch when n_pairs is an exact multiple of batchsize; predict_proba
# would fail on a batch with zero rows.
n_pairs = X_sel.shape[0]
split_points = np.arange(batchsize, n_pairs, batchsize)
# With no pairs left to classify there is nothing to predict.
batches = np.array_split(X_sel, split_points) if n_pairs > 0 else []

for batch in tqdm(batches):
    # Column 0 of predict_proba is the probability of the first class
    # (presumably the "not a duplicate" class — confirm against the models' classes_).
    Y_neg1.extend(row[0] for row in gbt_model.predict_proba(batch))
    Y_neg2.extend(row[0] for row in rnf_model.predict_proba(batch))
    Y_neg3.extend(row[0] for row in lgt_model.predict_proba(batch))

100%|████████████████████████████████████████████████████████████████████████████████| 176/176 [00:23<00:00,  7.44it/s]


In [31]:
# Key for the stored pair: "<id 1>-<id 2>".
pair_ids["FMT_PKEY"] = pair_ids["ID_SIVEP_1"] + "-" + pair_ids["ID_SIVEP_2"]

# One "negative" probability column per classifier, in the order they were predicted.
negative_probabilities = {
    "PROBA_NEGATIVO_MODELO_1": Y_neg1,
    "PROBA_NEGATIVO_MODELO_2": Y_neg2,
    "PROBA_NEGATIVO_MODELO_3": Y_neg3,
}
for column_name, column_values in negative_probabilities.items():
    pair_ids[column_name] = column_values

# Hard labels are intentionally not derived here (kept for reference):
#pair_ids["LABEL_MODELO_1"] = pair_ids["PROBA_NEGATIVO_MODELO_1"].apply(lambda x: 0 if x>=0.50 else 1)
#pair_ids["LABEL_MODELO_2"] = pair_ids["PROBA_NEGATIVO_MODELO_2"].apply(lambda x: 0 if x>=0.50 else 1)
#pair_ids["LABEL_MODELO_3"] = pair_ids["PROBA_NEGATIVO_MODELO_3"].apply(lambda x: 0 if x>=0.50 else 1)

In [32]:
# Display the classified pairs with their three probability columns
# (the notebook truncates the rendering of long frames).
pair_ids

Unnamed: 0,ID_SIVEP_1,ID_SIVEP_2,FMT_PKEY,PROBA_NEGATIVO_MODELO_1,PROBA_NEGATIVO_MODELO_2,PROBA_NEGATIVO_MODELO_3
0,316420850322,31665084311594,316420850322-31665084311594,0.999954,0.999714,1.000000
1,316158191999,316333573427,316158191999-316333573427,0.999954,0.999742,1.000000
2,316192848973,31657550865734,316192848973-31657550865734,0.999928,0.998082,1.000000
3,315912761636,31642618704284,315912761636-31642618704284,0.999954,0.995613,1.000000
4,315881342251,316309277782,315881342251-316309277782,0.999954,0.999742,1.000000
...,...,...,...,...,...,...
1053608,315924127454,31655183873441,315924127454-31655183873441,0.999928,0.998082,1.000000
1053609,315977588012,316184083012,315977588012-316184083012,0.999928,0.995841,0.999999
1053610,316037215205,316158167791,316037215205-316158167791,0.999954,0.999745,1.000000
1053611,315925981683,316247091982,315925981683-316247091982,0.999954,0.999742,1.000000


## Insert the classified pairs into the warehouse

In [1]:
#warehouse.insert('label_sivep_gripe', pair_ids, batchsize=1000, verbose=True)

In [30]:
# NOTE(review): destructive call, kept commented out. If it is ever re-enabled, the authkey
# should presumably be read from the environment rather than written as a literal here.
#warehouse.delete_table('label_sivep_gripe', is_sure=True, authkey="###!Y!.")