In [1]:
from utils import setrootdir
setrootdir("ppgcc-coautorias")

'Directory ppgcc-coautorias successfully loaded as current working directory.'

In [59]:
import os
import xml.etree.ElementTree as et
import pickle
from pathlib import Path
from tqdm import tqdm
import re
import unicodedata

from dotenv import load_dotenv

import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on

from src.explattes.Pesquisador import Pesquisador

# 2. Preprocessing

In [3]:
load_dotenv()

DATASET_DIRECTORY = os.getenv("DATASET_DIRECTORY")

## 2.1. DataFrames construction

In [4]:
curriculum_dataset_directory = Path(DATASET_DIRECTORY, "01-selection", "curriculolattesprofessoresppgccbrasil")

resumes = {}

for subdir, dirs, files in os.walk(curriculum_dataset_directory):
    for file in files:
        
        filepath = Path(subdir, file)

        institution = filepath.parent.name

        if institution not in resumes:
            resumes[institution] = []

        root = et.parse(filepath).getroot()
        researcher = Pesquisador(root)
        resumes[institution].append(researcher)

In [5]:
data_resumes = {
    "name": [],
    "citation": [],
    "lattes_id": [],
    "institution": [],
    "production": [],
    "authors": [],
    "location": [],
    "type": [],
    "year": [],
    "issn": [],
}

for institution, researchers in resumes.items():
    for researcher in researchers:
        for producao in researcher.producoes:
            data_resumes["name"].append(researcher.nome)
            data_resumes["citation"].append(researcher.citacoes)
            data_resumes["lattes_id"].append(researcher.id)
            data_resumes["institution"].append(institution)
            data_resumes["production"].append(producao.titulo)
            data_resumes["authors"].append(producao.autores)
            data_resumes["location"].append(producao.local)
            data_resumes["type"].append(producao.tipo)
            data_resumes["year"].append(producao.ano)
            data_resumes["issn"].append(producao.issn)

df_productions = pd.DataFrame(data_resumes)
df_productions = df_productions.reset_index(names="production_id")
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,"[Jose Maria Nazar David, Marcos R. S. Borges]",International Journal of Computer Applications...,LocalTipo.PERIODICO,2004,09528091
1,1,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,"[Rita Suzana Pitangueira Maciel, Jose Maria Na...",JOURNAL OF UNIVERSAL COMPUTER SCIENCE,LocalTipo.PERIODICO,2009,0948695X
2,2,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Management of Scientific Experiments in Comput...,"[Regina Maria Maciel Braga, Fernanda Cláudia A...",iSys - Revista Brasileira de Sistemas de Infor...,LocalTipo.PERIODICO,2012,19842902
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,"[Wander Gaspar, Regina Maria Maciel Braga, Fer...","International Journal of Metadata, Semantics a...",LocalTipo.PERIODICO,2015,17442621
4,4,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,PERSONNA: proposta de ontologia de contexto e ...,"[Paulo Rezende, Crystiam Kelle Pereira, Fernan...",Revista Brasileira de Informática na Educação,LocalTipo.PERIODICO,2015,14145685
...,...,...,...,...,...,...,...,...,...,...,...
230926,230926,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Pruning Weightless Neural Networks,"[Zachary Susskind, Igor Dantas Dos Santos Mira...",30th European Symposium on Artificial Neural N...,LocalTipo.CONFERENCIA,2022,
230927,230927,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,GSink - A Runtime for Gamma Programs and its C...,"[MELLO, RUI R., PAILLARD, GABRIEL A. L., Leand...",2023 IEEE International Parallel and Distribut...,LocalTipo.CONFERENCIA,2023,
230928,230928,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Heuristics for t-admissibility with complex ne...,"[Carlos Thadeu Duarte Santos, Anderson Zudio, ...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,LocalTipo.CONFERENCIA,2023,
230929,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,"[Felipe Souza, Leandro Santiago de Araujo, Luí...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,LocalTipo.CONFERENCIA,2023,


## 2.3. Productions with no authors

In [47]:
df_productions[df_productions["authors"].map(len) < 1]

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
63068,63068,Adriano Alonso Veloso,"[Adriano Alonso Veloso, VELLOSO, ADRIANO, VELL...",9973021912226739,UFMG,"Parallel, Incremental and Interactive Mining f...",[],6th International Workshop on High Performance...,LocalTipo.CONFERENCIA,2003,
63070,63070,Adriano Alonso Veloso,"[Adriano Alonso Veloso, VELLOSO, ADRIANO, VELL...",9973021912226739,UFMG,Efficient Data Mining for Frequent Itemsets in...,[],17o Concurso de Teses e Dissertações,LocalTipo.CONFERENCIA,2004,
63071,63071,Adriano Alonso Veloso,"[Adriano Alonso Veloso, VELLOSO, ADRIANO, VELL...",9973021912226739,UFMG,Rule Generation and Rule Selection Techniques ...,[],Brazilian Simposyum on Databases,LocalTipo.CONFERENCIA,2005,
150950,150950,Glauber Jose Ferreira Tomaz da Silva,"[Glauber Jose Ferreira Tomaz da Silva, GLAUBER...",8964946246608547,UFAL,Dynamic radiation force of acoustic waves on s...,[],Physical Review E. (Cessou em 2000. Cont. 1539...,LocalTipo.PERIODICO,2006,1063651X
150956,150956,Glauber Jose Ferreira Tomaz da Silva,"[Glauber Jose Ferreira Tomaz da Silva, GLAUBER...",8964946246608547,UFAL,Dynamic radiation force of acoustic waves on a...,[],Brazilian Journal of Physics (Impresso),LocalTipo.PERIODICO,2010,01039733
151008,151008,Glauber Jose Ferreira Tomaz da Silva,"[Glauber Jose Ferreira Tomaz da Silva, GLAUBER...",8964946246608547,UFAL,Dynamic ultrasound radiation force in viscous ...,[],1st LNCC Meeting on Computational Modelling,LocalTipo.RESUMO,2004,
151016,151016,Glauber Jose Ferreira Tomaz da Silva,"[Glauber Jose Ferreira Tomaz da Silva, GLAUBER...",8964946246608547,UFAL,Finite-difference time-domain approach to acou...,[],150th Acoustical Society of America Meeting,LocalTipo.RESUMO,2005,
151018,151018,Glauber Jose Ferreira Tomaz da Silva,"[Glauber Jose Ferreira Tomaz da Silva, GLAUBER...",8964946246608547,UFAL,Dynamic radiation force of acoustic waves on s...,[],4th Joint Meeting of the Acoustical Society of...,LocalTipo.RESUMO,2006,
151026,151026,Glauber Jose Ferreira Tomaz da Silva,"[Glauber Jose Ferreira Tomaz da Silva, GLAUBER...",8964946246608547,UFAL,Dynamic radiation force of acoustic waves,[],Biomedical Applications of Vibration and Acous...,LocalTipo.CAPITULO_LIVRO,2008,
178287,178287,Sabrina dos Santos Marczak,"[Sabrina dos Santos Marczak, MARCZAK, SABRINA,...",9458496222461501,PUC-RS,Towards a Theory of Knowledge Management Proce...,[],International Requirements Engineering Confere...,LocalTipo.CONFERENCIA,2008,


In [48]:
df_productions = df_productions[df_productions["authors"].map(len) > 0].reset_index(drop=True)
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,"[Jose Maria Nazar David, Marcos Roberto da Sil...",International Journal of Computer Applications...,LocalTipo.PERIODICO,2004,09528091
1,1,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,"[Rita Suzana Pitangueira Maciel, Jose Maria Na...",JOURNAL OF UNIVERSAL COMPUTER SCIENCE,LocalTipo.PERIODICO,2009,0948695X
2,2,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Management of Scientific Experiments in Comput...,"[Regina Maria Maciel Braga, Fernanda Cláudia A...",iSys - Revista Brasileira de Sistemas de Infor...,LocalTipo.PERIODICO,2012,19842902
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,"[Wander Gaspar, Regina Maria Maciel Braga, Fer...","International Journal of Metadata, Semantics a...",LocalTipo.PERIODICO,2015,17442621
4,4,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,PERSONNA: proposta de ontologia de contexto e ...,"[Paulo Rezende, Crystiam Kelle Pereira, Fernan...",Revista Brasileira de Informática na Educação,LocalTipo.PERIODICO,2015,14145685
...,...,...,...,...,...,...,...,...,...,...,...
230912,230926,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Pruning Weightless Neural Networks,"[Zachary Susskind, Igor Dantas Dos Santos Mira...",30th European Symposium on Artificial Neural N...,LocalTipo.CONFERENCIA,2022,
230913,230927,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,GSink - A Runtime for Gamma Programs and its C...,"[MELLO, RUI R., Gabriel Antoine Louis Paillard...",2023 IEEE International Parallel and Distribut...,LocalTipo.CONFERENCIA,2023,
230914,230928,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Heuristics for t-admissibility with complex ne...,"[Carlos Thadeu Duarte Santos, Anderson Zudio, ...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,LocalTipo.CONFERENCIA,2023,
230915,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,"[Felipe Souza, Leandro Santiago de Araujo, Luí...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,LocalTipo.CONFERENCIA,2023,


## 2.3. Citation normalization

In [49]:
df_citations = df_productions.drop_duplicates(subset=["lattes_id"])[["name", "citation"]].reset_index(drop=True)
df_citations

Unnamed: 0,name,citation
0,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO..."
1,Saulo Moraes Villela,"[Saulo Moraes Villela, VILLELA, SAULO MORAES, ..."
2,Raul Fonseca Neto,"[Raul Fonseca Neto, NETO, RAUL FONSECA, R. FON..."
3,Heder Soares Bernardino,"[Heder Soares Bernardino, Bernardino, Heder S...."
4,Marcelo Bernardes Vieira,"[Marcelo Bernardes Vieira, M. B. Vieira, M.B. ..."
...,...,...
1752,Daniela Gorski Trevisan,"[Daniela Gorski Trevisan, D. G. TREVISAN, DANI..."
1753,Aura Conci,"[Aura Conci, CONCI, A., CONCI, AURA, Conci, A...."
1754,Flavia Cristina Bernardini,"[Flavia Cristina Bernardini, F. BERNARDINI, Fl..."
1755,Luciana Cardoso de Castro Salgado,"[Luciana Cardoso de Castro Salgado, LUCIANA CA..."


In [50]:
citation_variations = [name for citations in df_citations["citation"] for name in citations]

print(f"There is a total of {len(citation_variations)} citation variations.")

variations_counting = {}

for citation in citation_variations:
    if citation not in variations_counting.keys():
        variations_counting[citation] = 0
    variations_counting[citation] += 1

non_duplicated_variations = [citation for citation, count in variations_counting.items() if count == 1]
non_duplicated_variations = set(non_duplicated_variations)

print(f"There is a total of {len(non_duplicated_variations)} non-duplicated citation variations.")

There is a total of 34480 citation variations.
There is a total of 34274 non-duplicated citation variations.


In [51]:
citations_reference = {}

for i, row in df_citations.iterrows():
    for citation in row["citation"]:
        if citation in non_duplicated_variations:
            citations_reference[citation] = row["name"]

print(len(citations_reference))

34274


In [52]:
def normalize_citations(authors):
    normalized_authors = []
    for author in authors:
        if author in non_duplicated_variations:
            normalized_authors.append(citations_reference[author])
        else:
            normalized_authors.append(author)
    return normalized_authors

df_productions["authors"] = df_productions["authors"].apply(normalize_citations)
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,"[Jose Maria Nazar David, Marcos Roberto da Sil...",International Journal of Computer Applications...,LocalTipo.PERIODICO,2004,09528091
1,1,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,"[Rita Suzana Pitangueira Maciel, Jose Maria Na...",JOURNAL OF UNIVERSAL COMPUTER SCIENCE,LocalTipo.PERIODICO,2009,0948695X
2,2,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Management of Scientific Experiments in Comput...,"[Regina Maria Maciel Braga, Fernanda Cláudia A...",iSys - Revista Brasileira de Sistemas de Infor...,LocalTipo.PERIODICO,2012,19842902
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,"[Wander Gaspar, Regina Maria Maciel Braga, Fer...","International Journal of Metadata, Semantics a...",LocalTipo.PERIODICO,2015,17442621
4,4,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JÓSE MARIA, JO...",3640497501056163,UFJF,PERSONNA: proposta de ontologia de contexto e ...,"[Paulo Rezende, Crystiam Kelle Pereira, Fernan...",Revista Brasileira de Informática na Educação,LocalTipo.PERIODICO,2015,14145685
...,...,...,...,...,...,...,...,...,...,...,...
230912,230926,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Pruning Weightless Neural Networks,"[Zachary Susskind, Igor Dantas Dos Santos Mira...",30th European Symposium on Artificial Neural N...,LocalTipo.CONFERENCIA,2022,
230913,230927,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,GSink - A Runtime for Gamma Programs and its C...,"[MELLO, RUI R., Gabriel Antoine Louis Paillard...",2023 IEEE International Parallel and Distribut...,LocalTipo.CONFERENCIA,2023,
230914,230928,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Heuristics for t-admissibility with complex ne...,"[Carlos Thadeu Duarte Santos, Anderson Zudio, ...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,LocalTipo.CONFERENCIA,2023,
230915,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, SANTIAGO DE ARAUJ...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,"[Felipe Souza, Leandro Santiago de Araujo, Luí...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,LocalTipo.CONFERENCIA,2023,


In [93]:
new_df_exploded = df_productions.explode("authors").reset_index(drop=True)
new_df_exploded["authors"].unique()

array(['Jose Maria Nazar David', 'Marcos Roberto da Silva Borges',
       'Rita Suzana Pitangueira Maciel', ..., 'BIANCHINI, CALEBE',
       'Carlos Thadeu Duarte Santos', 'Thiago Lopes Nascimento'],
      shape=(186133,), dtype=object)

## 2.4. Ambiguous citations

In [53]:
df_citations_exploded = df_productions[["production_id", "authors"]].explode("authors").reset_index(drop=True)
df_citations_exploded

Unnamed: 0,production_id,authors
0,0,Jose Maria Nazar David
1,0,Marcos Roberto da Silva Borges
2,1,Rita Suzana Pitangueira Maciel
3,1,Jose Maria Nazar David
4,1,Michel Oei
...,...,...
936713,230929,Luís Felipe Ignácio Cunha
936714,230930,Thiago Lopes Nascimento
936715,230930,Fábio Protti
936716,230930,Luís Felipe Ignácio Cunha


In [54]:
df_ambiguous_citations = df_citations_exploded[~df_citations_exploded["authors"].isin(non_duplicated_variations)]
df_ambiguous_citations

Unnamed: 0,production_id,authors
4,1,Michel Oei
5,1,Adriano Bastos
6,1,Leandro Menezes
8,2,Fernanda Cláudia Alves Campos
11,2,Leonardo Guerreiro Azevedo
...,...,...
936705,230927,"DINIZ, PEDRO C."
936707,230928,Carlos Thadeu Duarte Santos
936708,230928,Anderson Zudio
936711,230929,Felipe Souza


In [87]:
name_patterns = {
    "Single": r"^[A-Z][a-z]+$",
    "First Last": r"^[A-Z][a-z]+ [A-Z][a-z]+$",
    "First Middle Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+$",
    "First Various Last": r"^[A-Z][a-z]+( [A-Z][a-z]+)+ [A-Z][a-z]+$",
    "First M. Last": r"^[A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+$",
    "First V. Last": r"^[A-Z][a-z]+( [A-Z]\.)+( [A-Z][a-z]+)$",
    "First M. Various Last": r"^[A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+( [A-Z][a-z]+)+$",
    "First V. Various Last": r"^[A-Z][a-z]+( [A-Z]\.)+ [A-Z][a-z]+( [A-Z][a-z]+)+$",
    "First Second M. Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+$",
    "First Second V. Last": r"^[A-Z][a-z]+ [A-Z][a-z]+( [A-Z]\.)+ [A-Z][a-z]+$",
    "First d? Last": r"^[A-Z][a-z]+ d[a-z]+ [A-Z][a-z]+$",
    "First d? Penultimate Last": r"^[A-Z][a-z]+ d[a-z]+ [A-Z][a-z]+ [A-Z][a-z]+$",
    "First d? Various Last": r"^[A-Z][a-z]+( [A-Z][a-z]+)+ d[a-z]+ [A-Z][a-z]+$",
    "First Various d? Last": r"^[A-Z][a-z]+( [A-Z][a-z]+)+ d[a-z]+ [A-Z][a-z]+$",
    "First Second d? Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ d[a-z]+ [A-Z][a-z]+$",
    "First Second d? Various Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ d[a-z]+( [A-Z][a-z]+)+ [A-Z][a-z]+$",
    "Last, First": r"^[A-Z][a-z]+, [A-Z][a-z]+$",
    "Last, First Middle": r"^[A-Z][a-z]+, [A-Z][a-z]+ [A-Z][a-z]+$",
    "Last, First M.": r"^[A-Z][a-z]+, [A-Z][a-z]+ [A-Z]\.$",
    "Last, First V.": r"^[A-Z][a-z]+, [A-Z][a-z]+( [A-Z]\.)+$",
    "Last Penultimate, First": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+$",
    "Last Penultimate, First Middle": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+ [A-Z][a-z]+$",
    "Last Penultimate, First Various": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+( [A-Z][a-z]+)+$",
    "Last Penultimate, First M.": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+ [A-Z]\.$",
    "Last Penultimate, First V.": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+( [A-Z]\.)+$",

    "SINGLE": r"^[A-Z]+$",
    "FIRST LAST": r"^[A-Z]+ [A-Z]+$",
    "FIRST MIDDLE LAST": r"^[A-Z]+ [A-Z]+ [A-Z]+$",
    "FIRST VARIOUS LAST": r"^[A-Z]+( [A-Z]+)+ [A-Z]+$",
    "FIRST M. LAST": r"^[A-Z]+ [A-Z]\. [A-Z]+$",
    "FIRST V. LAST": r"^[A-Z]+( [A-Z]\.)+( [A-Z]+)$",
    "FIRST M. VARIOUS LAST": r"^[A-Z]+ [A-Z]\. [A-Z]+( [A-Z]+)+$",
    "FIRST V. VARIOUS LAST": r"^[A-Z]+( [A-Z]\.)+ [A-Z]+( [A-Z]+)+$",
    "FIRST SECOND M. LAST": r"^[A-Z]+ [A-Z]+ [A-Z]\. [A-Z]+$",
    "FIRST SECOND V. LAST": r"^[A-Z]+ [A-Z]+( [A-Z]\.)+ [A-Z]+$",
    "FIRST D? LAST": r"^[A-Z]+ D[A-Z]+ [A-Z]+$",
    "FIRST D? PENULTIMATE LAST": r"^[A-Z]+ D[A-Z]+ [A-Z]+ [A-Z]+$",
    "FIRST D? VARIOUS LAST": r"^[A-Z]+( [A-Z]+)+ D[A-Z]+ [A-Z]+$",
    "FIRST VARIOUS D? LAST": r"^[A-Z]+( [A-Z]+)+ D[A-Z]+ [A-Z]+$",
    "FIRST SECOND D? LAST": r"^[A-Z]+ [A-Z]+ D[A-Z]+ [A-Z]+$",
    "FIRST SECOND D? VARIOUS LAST": r"^[A-Z]+ [A-Z]+ D[A-Z]+( [A-Z]+)+ [A-Z]+$",
    "LAST, FIRST": r"^[A-Z]+, [A-Z]+$",
    "LAST, FIRST MIDDLE": r"^[A-Z]+, [A-Z]+ [A-Z]+$",
    "LAST, FIRST M.": r"^[A-Z]+, [A-Z]+ [A-Z]\.$",
    "LAST, FIRST V.": r"^[A-Z]+, [A-Z]+( [A-Z]\.)+$",
    "LAST PENULTIMATE, FIRST": r"^[A-Z]+ [A-Z]+, [A-Z]+$",
    "LAST PENULTIMATE, FIRST MIDDLE": r"^[A-Z]+ [A-Z]+, [A-Z]+ [A-Z]+$",
    "LAST PENULTIMATE, FIRST VARIOUS": r"^[A-Z]+ [A-Z]+, [A-Z]+( [A-Z]+)+$",
    "LAST PENULTIMATE, FIRST M.": r"^[A-Z]+ [A-Z]+, [A-Z]+ [A-Z]\.$",
    "LAST PENULTIMATE, FIRST V.": r"^[A-Z]+ [A-Z]+, [A-Z]+( [A-Z]\.)+$",
}

def detect_name_pattern(name):
    normalized_name = unicodedata.normalize("NFKD", name).encode("ASCII", "ignore").decode("utf-8")
    for pattern_name, pattern in name_patterns.items():
        if re.match(pattern, normalized_name):
            return pattern_name
    return "unknown"

df_ambiguous_citations["name_pattern"] = df_ambiguous_citations["authors"].apply(detect_name_pattern)
df_ambiguous_citations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ambiguous_citations["name_pattern"] = df_ambiguous_citations["authors"].apply(detect_name_pattern)


Unnamed: 0,production_id,authors,name_pattern
4,1,Michel Oei,First Last
5,1,Adriano Bastos,First Last
6,1,Leandro Menezes,First Last
8,2,Fernanda Cláudia Alves Campos,First Various Last
11,2,Leonardo Guerreiro Azevedo,First Middle Last
...,...,...,...
936705,230927,"DINIZ, PEDRO C.","LAST, FIRST M."
936707,230928,Carlos Thadeu Duarte Santos,First Various Last
936708,230928,Anderson Zudio,First Last
936711,230929,Felipe Souza,First Last


In [91]:
df_ambiguous_citations.sample(frac=.1).to_csv(Path("data", "02-preprocessing", "ambiguous_citations.csv"), index=False, sep="\t")

In [88]:
df_ambiguous_citations[df_ambiguous_citations["name_pattern"] == "unknown"]

Unnamed: 0,production_id,authors,name_pattern
166,32,"SILVA, GREGSON BARROS DA",unknown
269,60,Lucas de Assis S. Ferreira,unknown
270,60,Lucas Miguez S. de Jesus,unknown
276,62,Lucas de Assis S. Ferreira,unknown
277,62,Lucas Miguez S. de Jesus,unknown
...,...,...,...
936666,230923,Mauricio Breternitz Jr.,unknown
936676,230924,Mauricio Breternitz Jr.,unknown
936679,230925,"MIRANDA, IGOR D.S.",unknown
936684,230925,"VILLON, LUIS A.Q.",unknown


## 2.3. Citation consistency

In [None]:
df_citations = df_productions_ufpa[["name", "production_id", "authors"]]
df_citations = df_citations.join(df_productions_ufpa["authors"].explode().rename("author"))
df_citations = df_citations.drop(columns=["authors"])
df_citations

In [None]:
data_researcher_citations = {
    "researcher": [],
    "production_id": [],
    "n_citations": []
}

for researcher in df_citations["name"].unique():
    df_researcher = df_citations[df_citations["name"] == researcher]
    for production_id in df_researcher["production_id"].unique():
        df_production = df_researcher[df_researcher["production_id"] == production_id]
        data_researcher_citations["researcher"].append(researcher)
        data_researcher_citations["production_id"].append(production_id)
        data_researcher_citations["n_citations"].append(df_production["author"].to_list().count(researcher))

df_researcher_citations = pd.DataFrame(data_researcher_citations)
df_researcher_citations

In [None]:
df_researcher_citations[df_researcher_citations["n_citations"] == 0]

In [None]:
production_researcher_multiple_citations = df_researcher_citations[df_researcher_citations["n_citations"] > 1]
production_researcher_multiple_citations

In [None]:
production_ids_multiple_citations = production_researcher_multiple_citations["production_id"].unique()

multiple_citations_instances = df_productions[df_productions["production_id"].isin(production_ids_multiple_citations)]["authors"]

multiple_citations_instances = multiple_citations_instances.map(lambda authors: list(set(authors)))

for authors in multiple_citations_instances:
    print(authors)

## 2.4. Citation linkage

In [None]:
df_citation_reference = df_productions[["name", "citation"]].drop_duplicates(subset=["name"])
df_citation_reference

In [None]:
citation_reference = {}

for i, row in df_citation_reference.iterrows():
    for citation in row["citation"]:
        if citation not in citation_reference:
            citation_reference[citation] = row["name"]

print(len(citation_reference))

citation_reference

In [None]:
df_citations["author"] = df_citations["author"].map(lambda author: citation_reference.get(author, author))
df_citations

In [None]:


# ---------------------------------------------------------
# 1) Carregar dados (TAB) e pequenos ajustes
# ---------------------------------------------------------

df_citations["unique_id"] = pd.RangeIndex(start=0, stop=len(df_citations), step=1, dtype="int64")

# Normalizações úteis (sem perder o original)
df_citations["author_norm"] = (
    df_citations["author"]
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

def split_name(s: str):
    parts = s.split()
    if not parts:
        return "", "", ""
    surname = parts[-1]
    forenames = " ".join(parts[:-1])
    initials = "".join([p[0] for p in parts[:-1] if p])
    return surname, forenames, initials

df_citations[["surname_norm", "forenames_norm", "given_initials"]] = (
    df_citations["author_norm"].apply(lambda s: pd.Series(split_name(s)))
)

# Se existir coluna de ano, padroniza; caso não exista, cria vazia
if "year" in df_citations.columns:
    df_citations["year_norm"] = df_citations["year"].str.extract(r"(\d{4})", expand=False)
else:
    df_citations["year_norm"] = ""

# Se existir título, cria uma versão simplificada (ajuda um pouco na evidência)
if "title" in df_citations.columns:
    df_citations["title_norm"] = (
        df_citations["title"]
        .astype(str)
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
        .str.lower()
        .str.replace(r"[^a-z0-9 ]", " ", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
else:
    df_citations["title_norm"] = ""

# ---------------------------------------------------------
# 2) Definir comparações e regras de bloqueio
#    (link_type='dedupe_only' pois é um único dataset)
# ---------------------------------------------------------
# Comparações:
# - author_norm: comparação de nome (fuzzy) com patamares de similaridade
# - year_norm: exato, mas opcional (se ausente, vira vazio e pouco pesa)
# - title_norm: similaridade Jaro para reforço (baixa/mediana evidência)
settings = SettingsCreator(
    link_type="dedupe_only",
    unique_id_column_name="unique_id",
    comparisons=[
        # Sobrenome exato com TFA ligado (sobrenomes raros ganham peso)
        cl.ExactMatch("surname_norm").configure(term_frequency_adjustments=True),

        # Prenomes com Jaro-Winkler em patamares, também com TFA
        cl.JaroWinklerAtThresholds(
            "forenames_norm", [0.99, 0.97, 0.94, 0.90]
        ).configure(term_frequency_adjustments=True),

        # Iniciais de apoio (útil quando prenomes variam)
        cl.ExactMatch("given_initials"),
    ],
    # Bloqueios para gerar pares candidatos (evita comparar tudo com tudo):
    # - mesma primeira letra de author_norm
    # - mesmo comprimento de author_norm (coarse) – ajuda performance
    blocking_rules_to_generate_predictions=[
        # pares com mesmo sobrenome
        block_on("surname_norm"),
        # pares com mesmas iniciais (cobre pequenas variações nos prenomes)
        block_on("given_initials"),
        # pares com prefixo de 4 letras do sobrenome (tolerante a pequenos typos/acentos)
        block_on("substr(surname_norm, 1, 4)"),
    ],
)

# Backend local
db_api = DuckDBAPI()

# Instancia o Linker (v4)
linker = Linker(df_citations, settings, db_api)

# ---------------------------------------------------------
# 3) Estimar parâmetros do modelo (Fellegi–Sunter)
#    - Probabilidade de dois registros aleatórios baterem
#    - u-probabilities por amostragem aleatória
#    - EM para m-probabilities (e refino)
# ---------------------------------------------------------
# Probabilidade base de match (lambda) – ajuda a calibrar o modelo
linker.training.estimate_probability_two_random_records_match(
    [block_on("surname_norm")], recall=0.85  # estime com alta recuperação
)

# Estima u (não-match) por amostragem
linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

# EM com bloqueios “fortes” (duas rodadas curtas ajudando a convergir)
linker.training.estimate_parameters_using_expectation_maximisation(block_on("surname_norm"))
linker.training.estimate_parameters_using_expectation_maximisation(block_on("given_initials"))

# ---------------------------------------------------------
# 4) Predizer probabilidades de match par-a-par
#    (pairwise_predictions contém prob_match e match_weight)
# ---------------------------------------------------------
pairwise_predictions = linker.inference.predict(threshold_match_probability=.2)

# Para pandas:
df_exploded_pairs = pairwise_predictions.as_pandas_dataframe()
print(df_exploded_pairs.columns.tolist()[:20])  # espiar colunas principais

# ---------------------------------------------------------
# 5) Clustering: gerar IDs estimados de autor (componentes conectados)
#    threshold_match_probability define quão “estrito” é ligar registros
#    (0.95 é conservador; ajuste conforme precisão/recall desejados)
# ---------------------------------------------------------
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    pairwise_predictions, 0.85
)
df_exploded_clusters = clusters.as_pandas_dataframe()

# ---------------------------------------------------------
# 6) Salvar resultados
# ---------------------------------------------------------
df_exploded_pairs.to_csv("author_pairwise_predictions.tsv", sep="\t", index=False)
df_exploded_clusters.to_csv("author_clusters.tsv", sep="\t", index=False)

print(
    "Arquivos gerados:\n"
    " - author_pairwise_predictions.tsv (pares com prob_match)\n"
    " - author_clusters.tsv (cluster_id estimado por registro)\n"
)


In [None]:
df_exploded_pairs

In [None]:
df_exploded_clusters

In [None]:
df_exploded_clusters["cluster_id"].nunique()

In [None]:
for cluster_id in df_exploded_clusters["cluster_id"].unique():
    cluster = df_exploded_clusters[df_exploded_clusters["cluster_id"] == cluster_id]
    variations = cluster["author"].unique()
    print(f"Cluster {cluster_id}: {' | '.join(variations)}")

In [None]:
with open(os.path.join(DATASET_DIRECTORY, "dataframe.pkl"), "wb") as f:
    pickle.dump(df, f)

In [None]:
df.to_csv(os.path.join(DATASET_DIRECTORY, "dataframe.csv"), index=False, sep="\t")

In [None]:
preprocessing_directory = Path(DATASET_DIRECTORY, "02-preprocessing")

os.makedirs(preprocessing_directory, exist_ok=True)

with open(Path(preprocessing_directory, "productions.pickle"), "wb") as f:
    pickle.dump(df_productions, f)

with open(Path(preprocessing_directory, "productions_ufpa.pickle"), "wb") as f:
    pickle.dump(df_productions_ufpa, f)