In [None]:
from utils import setrootdir
# NOTE(review): presumably switches the working directory to the project root
# folder named "ppgcc-coautorias" so that later project-local imports and
# relative paths resolve — confirm against utils.setrootdir.
setrootdir("ppgcc-coautorias")

In [None]:
import os
import xml.etree.ElementTree as et
import pickle
from pathlib import Path
import re
import unicodedata

from dotenv import load_dotenv

import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on

from src.explattes.Pesquisador import Pesquisador

# 2. Preprocessing

In [None]:
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Base folder of the datasets; the curriculum directory below is built from it.
DATASET_DIRECTORY = os.getenv("DATASET_DIRECTORY")

# Fail fast with an explicit message: otherwise an unset variable only surfaces
# later as an opaque TypeError when Path(None, ...) is constructed.
if DATASET_DIRECTORY is None:
    raise RuntimeError(
        "DATASET_DIRECTORY is not set; define it in the environment or in a .env file."
    )

## 2.1. DataFrames construction

In [None]:
curriculum_dataset_directory = Path(DATASET_DIRECTORY, "01-selection", "curriculolattesprofessoresppgccbrasil")

# Maps an institution name to the list of Pesquisador objects parsed from its files.
resumes = {}

for subdir, _subdirectories, files in os.walk(curriculum_dataset_directory):
    for file in files:
        filepath = Path(subdir, file)

        # The folder that directly contains the file is taken as the institution.
        institution = filepath.parent.name
        institution_resumes = resumes.setdefault(institution, [])

        # Each file is parsed as XML and wrapped in a Pesquisador.
        xml_root = et.parse(filepath).getroot()
        institution_resumes.append(Pesquisador(xml_root))

### 2.1.1. Productions

In [None]:
# Output columns, in the order they appear in the DataFrame.
production_columns = (
    "name",
    "citation",
    "lattes_id",
    "institution",
    "production",
    "authors",
    "location",
    "type",
    "year",
    "issn",
)

# One tuple per (institution, researcher, production), aligned with production_columns.
production_rows = [
    (
        researcher.nome,
        researcher.citacoes,
        researcher.id,
        institution,
        producao.titulo,
        producao.autores,
        producao.local,
        producao.tipo.name,
        producao.ano,
        producao.issn,
    )
    for institution, researchers in resumes.items()
    for researcher in researchers
    for producao in researcher.producoes
]

# Pivot the rows into a column -> values mapping (every column is present even
# when there are no rows).
data_events = {
    column: [row[position] for row in production_rows]
    for position, column in enumerate(production_columns)
}

# The positional index is exposed as an explicit "production_id" column.
df_productions = pd.DataFrame(data_events).reset_index(names="production_id")
df_productions

In [None]:
# Inspect the column dtypes before converting "year" in the next cell.
df_productions.dtypes

In [None]:
# Keep only values made exclusively of decimal digits; anything else becomes missing.
# str.isdecimal() is used rather than str.isnumeric() because the latter also
# accepts characters such as "²" or "½", which int() cannot parse and which would
# abort the cell with a ValueError.
df_productions["year"] = df_productions["year"].map(
    lambda value: int(value) if str(value).isdecimal() else None
)
# Nullable integer dtype: missing years are kept as <NA>.
df_productions = df_productions.astype({"year": "Int64"})
df_productions.dtypes

## 2.2. Productions with no authors

In [None]:
# Show the productions whose author list is empty.
df_productions.loc[df_productions["authors"].map(len).lt(1)]

In [None]:
# Keep only productions that list at least one author, then renumber the rows.
has_authors = df_productions["authors"].map(len).gt(0)
df_productions = df_productions.loc[has_authors].reset_index(drop=True)
df_productions

## 2.3. Citation normalization

In [None]:
# One row per researcher (first occurrence of each lattes_id), keeping only the
# name and the list of citation names.
df_citations = (
    df_productions
    .drop_duplicates(subset=["lattes_id"])
    .loc[:, ["name", "citation"]]
    .reset_index(drop=True)
)
df_citations

In [None]:
# Flatten every researcher's citation names into a single list.
citation_variations = []
for citations in df_citations["citation"]:
    citation_variations.extend(citations)

print(f"There is a total of {len(citation_variations)} citation variations.")

# Number of occurrences of each citation name across all researchers.
variations_counting = {}
for citation in citation_variations:
    variations_counting[citation] = variations_counting.get(citation, 0) + 1

# Citation names that occur exactly once overall.
non_duplicated_variations = {
    citation for citation, count in variations_counting.items() if count == 1
}

print(f"There is a total of {len(non_duplicated_variations)} non-duplicated citation variations.")

In [None]:
# Maps each citation name that occurs exactly once to the researcher who lists it.
citations_reference = {
    citation: researcher_name
    for researcher_name, citations in zip(df_citations["name"], df_citations["citation"])
    for citation in citations
    if citation in non_duplicated_variations
}

print(len(citations_reference))

In [None]:
def normalize_citations(authors):
    """Return a new list where unambiguous citation names become researcher names.

    Each entry of ``authors`` that is in ``non_duplicated_variations`` is
    replaced by its ``citations_reference`` value; every other entry is kept
    unchanged. Order and length are preserved.
    """
    return [
        citations_reference[author] if author in non_duplicated_variations else author
        for author in authors
    ]

# Run normalize_citations over every production's author list, overwriting the
# "authors" column with the result.
df_productions["authors"] = df_productions["authors"].apply(normalize_citations)
df_productions

In [None]:
# One row per (production, author) pair, then the distinct author strings.
new_df_exploded = (
    df_productions
    .explode("authors")
    .reset_index(drop=True)
)
new_df_exploded["authors"].unique()

## 2.4. Ambiguous citations

In [None]:
# One row per (production_id, author) pair.
df_citations_exploded = (
    df_productions
    .loc[:, ["production_id", "authors"]]
    .explode("authors")
    .reset_index(drop=True)
)
df_citations_exploded

In [None]:
# Author entries that are not one of the citation names occurring exactly once.
# NOTE(review): the "authors" column was already normalized earlier in the
# notebook, so entries that were replaced now hold researcher names; confirm that
# comparing them against non_duplicated_variations here is intended.
is_unique_variation = df_citations_exploded["authors"].isin(non_duplicated_variations)
df_ambiguous_citations = df_citations_exploded.loc[~is_unique_variation]
df_ambiguous_citations

In [None]:
# Label -> regular expression describing one way an author name can be written.
# detect_name_pattern returns the FIRST label whose expression matches, so the
# order of the entries matters: more specific shapes must precede general ones.
# "M." stands for one abbreviated name, "V." for one or more, "Various" for one or
# more full names, and "d?" for a lowercase particle starting with "d".
name_patterns = {
    # --- Capitalized names -------------------------------------------------
    "Single": r"^[A-Z][a-z]+$",
    "First Last": r"^[A-Z][a-z]+ [A-Z][a-z]+$",
    "First Middle Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ [A-Z][a-z]+$",
    "First Various Last": r"^[A-Z][a-z]+( [A-Z][a-z]+)+ [A-Z][a-z]+$",
    "First M. Last": r"^[A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+$",
    "First V. Last": r"^[A-Z][a-z]+( [A-Z]\.)+( [A-Z][a-z]+)$",
    "First M. Various Last": r"^[A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+( [A-Z][a-z]+)+$",
    "First V. Various Last": r"^[A-Z][a-z]+( [A-Z]\.)+ [A-Z][a-z]+( [A-Z][a-z]+)+$",
    "First Second M. Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ [A-Z]\. [A-Z][a-z]+$",
    "First Second V. Last": r"^[A-Z][a-z]+ [A-Z][a-z]+( [A-Z]\.)+ [A-Z][a-z]+$",
    "First d? Last": r"^[A-Z][a-z]+ d[a-z]+ [A-Z][a-z]+$",
    "First d? Penultimate Last": r"^[A-Z][a-z]+ d[a-z]+ [A-Z][a-z]+ [A-Z][a-z]+$",
    # The particle comes right after the first name and is followed by several
    # names. (This entry previously duplicated the "First Various d? Last"
    # expression, which mislabelled names and made that entry unreachable.)
    "First d? Various Last": r"^[A-Z][a-z]+ d[a-z]+( [A-Z][a-z]+)+ [A-Z][a-z]+$",
    # Listed before the "Various d?" entry, which would otherwise always win.
    "First Second d? Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ d[a-z]+ [A-Z][a-z]+$",
    "First Various d? Last": r"^[A-Z][a-z]+( [A-Z][a-z]+)+ d[a-z]+ [A-Z][a-z]+$",
    "First Second d? Various Last": r"^[A-Z][a-z]+ [A-Z][a-z]+ d[a-z]+( [A-Z][a-z]+)+ [A-Z][a-z]+$",
    "Last, First": r"^[A-Z][a-z]+, [A-Z][a-z]+$",
    "Last, First Middle": r"^[A-Z][a-z]+, [A-Z][a-z]+ [A-Z][a-z]+$",
    "Last, First M.": r"^[A-Z][a-z]+, [A-Z][a-z]+ [A-Z]\.$",
    "Last, First V.": r"^[A-Z][a-z]+, [A-Z][a-z]+( [A-Z]\.)+$",
    "Last Penultimate, First": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+$",
    "Last Penultimate, First Middle": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+ [A-Z][a-z]+$",
    "Last Penultimate, First Various": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+( [A-Z][a-z]+)+$",
    "Last Penultimate, First M.": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+ [A-Z]\.$",
    "Last Penultimate, First V.": r"^[A-Z][a-z]+ [A-Z][a-z]+, [A-Z][a-z]+( [A-Z]\.)+$",

    # --- ALL-UPPERCASE names -----------------------------------------------
    "SINGLE": r"^[A-Z]+$",
    "FIRST LAST": r"^[A-Z]+ [A-Z]+$",
    "FIRST MIDDLE LAST": r"^[A-Z]+ [A-Z]+ [A-Z]+$",
    "FIRST VARIOUS LAST": r"^[A-Z]+( [A-Z]+)+ [A-Z]+$",
    "FIRST M. LAST": r"^[A-Z]+ [A-Z]\. [A-Z]+$",
    "FIRST V. LAST": r"^[A-Z]+( [A-Z]\.)+( [A-Z]+)$",
    "FIRST M. VARIOUS LAST": r"^[A-Z]+ [A-Z]\. [A-Z]+( [A-Z]+)+$",
    "FIRST V. VARIOUS LAST": r"^[A-Z]+( [A-Z]\.)+ [A-Z]+( [A-Z]+)+$",
    "FIRST SECOND M. LAST": r"^[A-Z]+ [A-Z]+ [A-Z]\. [A-Z]+$",
    "FIRST SECOND V. LAST": r"^[A-Z]+ [A-Z]+( [A-Z]\.)+ [A-Z]+$",
    # NOTE(review): in uppercase the particle "D..." is indistinguishable from any
    # other name, so every "D?" entry below is already matched by
    # "FIRST MIDDLE LAST" / "FIRST VARIOUS LAST" above and is never returned.
    # They are kept (mirroring the capitalized section) pending a decision on how
    # uppercase particles should be recognized.
    "FIRST D? LAST": r"^[A-Z]+ D[A-Z]+ [A-Z]+$",
    "FIRST D? PENULTIMATE LAST": r"^[A-Z]+ D[A-Z]+ [A-Z]+ [A-Z]+$",
    "FIRST D? VARIOUS LAST": r"^[A-Z]+ D[A-Z]+( [A-Z]+)+ [A-Z]+$",
    "FIRST SECOND D? LAST": r"^[A-Z]+ [A-Z]+ D[A-Z]+ [A-Z]+$",
    "FIRST VARIOUS D? LAST": r"^[A-Z]+( [A-Z]+)+ D[A-Z]+ [A-Z]+$",
    "FIRST SECOND D? VARIOUS LAST": r"^[A-Z]+ [A-Z]+ D[A-Z]+( [A-Z]+)+ [A-Z]+$",
    "LAST, FIRST": r"^[A-Z]+, [A-Z]+$",
    "LAST, FIRST MIDDLE": r"^[A-Z]+, [A-Z]+ [A-Z]+$",
    "LAST, FIRST M.": r"^[A-Z]+, [A-Z]+ [A-Z]\.$",
    "LAST, FIRST V.": r"^[A-Z]+, [A-Z]+( [A-Z]\.)+$",
    "LAST PENULTIMATE, FIRST": r"^[A-Z]+ [A-Z]+, [A-Z]+$",
    "LAST PENULTIMATE, FIRST MIDDLE": r"^[A-Z]+ [A-Z]+, [A-Z]+ [A-Z]+$",
    "LAST PENULTIMATE, FIRST VARIOUS": r"^[A-Z]+ [A-Z]+, [A-Z]+( [A-Z]+)+$",
    "LAST PENULTIMATE, FIRST M.": r"^[A-Z]+ [A-Z]+, [A-Z]+ [A-Z]\.$",
    "LAST PENULTIMATE, FIRST V.": r"^[A-Z]+ [A-Z]+, [A-Z]+( [A-Z]\.)+$",
}

def detect_name_pattern(name):
    """Return the label of the first entry in ``name_patterns`` that matches ``name``.

    The name is first reduced to plain ASCII (NFKD decomposition, then every
    non-ASCII code point is dropped) so that accented letters match the
    ``[A-Z]`` / ``[a-z]`` classes used by the patterns.

    Args:
        name: Author name to classify.

    Returns:
        The matching label, or ``"unknown"`` when no pattern matches or when
        ``name`` is not a string (e.g. a missing value).
    """
    # Missing values (NaN/None) cannot be normalized; report them as unknown
    # instead of raising a TypeError.
    if not isinstance(name, str):
        return "unknown"
    normalized_name = unicodedata.normalize("NFKD", name).encode("ASCII", "ignore").decode("utf-8")
    for pattern_name, pattern in name_patterns.items():
        if re.match(pattern, normalized_name):
            return pattern_name
    return "unknown"

# Label every ambiguous author string with the first name pattern it matches.
# DataFrame.assign returns a new frame, so the column is not written into a
# boolean-filtered slice of another DataFrame (which raises pandas'
# SettingWithCopyWarning).
df_ambiguous_citations = df_ambiguous_citations.assign(
    name_pattern=df_ambiguous_citations["authors"].apply(detect_name_pattern)
)
df_ambiguous_citations

In [None]:
# Author strings that matched none of the name patterns.
df_ambiguous_citations.loc[df_ambiguous_citations["name_pattern"].eq("unknown")]