In [1]:
# Point the kernel at the project root before anything else runs.
# Per the message this call returns (shown as the cell output), the
# "ppgcc-coautorias" directory becomes the current working directory.
# NOTE(review): presumably this is what lets the later `src.*` import and
# relative data paths resolve — confirm against utils.setrootdir.
from utils import setrootdir
setrootdir("ppgcc-coautorias")

'Directory ppgcc-coautorias successfully loaded as current working directory.'

In [2]:
import os

from dotenv import load_dotenv

import seaborn as sns

# Global seaborn theme. No figure is drawn in the cells visible here, so this
# only matters for plots created later in the notebook.
sns.set_style("whitegrid")

from src.transformation import Transformer

In [3]:
# Load variables from the project's .env file into the process environment.
load_dotenv()

# Both settings are optional at this point: a missing variable yields None
# rather than raising, exactly like os.getenv.
METADATA_FILE = os.environ.get("METADATA_FILE")
DATASET_DIRECTORY = os.environ.get("DATASET_DIRECTORY")

# 3. Transformation

In [4]:
# Project helper used below for reading and writing the per-step parquet
# tables. It is configured from the environment settings loaded above and
# tagged with this notebook's pipeline step.
transformer = Transformer(
    step_directory="03-transformation",
    metadata_file=METADATA_FILE,
    data_dir=DATASET_DIRECTORY,
)

In [5]:
# Load the "productions" table produced by the previous pipeline step.
# NOTE(review): where the file lives on disk is decided inside
# src.transformation.Transformer, which is not visible from this notebook.
df_productions = transformer.read_parquet(
    name="productions",
    step="02-preprocessing",
)
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,Wander Gaspar,"International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
1,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,Regina Maria Maciel Braga,"International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
2,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,Fernanda Cláudia Alves Campos,"International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,Jose Maria Nazar David,"International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
4,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,Tatiane Ornelas Martins Alves,"International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
...,...,...,...,...,...,...,...,...,...,...,...
495575,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,Luís Felipe Ignácio Cunha,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
495576,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Thiago Lopes Nascimento,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
495577,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Fábio Protti,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
495578,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Luís Felipe Ignácio Cunha,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,


In [6]:
# Distinct values of the "name" column. Judging by the rows displayed above,
# this is the researcher each record belongs to (it repeats together with
# lattes_id and institution).
unique_names = {*df_productions["name"].unique()}
len(unique_names)

1756

In [7]:
# Distinct co-author names that are NOT already present in unique_names, so
# the two collections are disjoint and can be numbered in separate id ranges.
unique_authors = (
    set(df_productions["authors"].unique())
    .difference(unique_names)
)
len(unique_authors)

121341

In [8]:
# Build the final ordered name list: researchers first (sorted), followed by
# the remaining co-authors (sorted). The position in this list becomes the
# integer id assigned in the next cell, so researchers get the lowest ids.
unique_authors = sorted(unique_authors)

# Derive the researcher part as "everything that is not a pure co-author".
# On the first run unique_names is the researcher set, which is already
# disjoint from unique_authors, so this equals sorted(unique_names). On a
# re-run unique_names is the concatenated list from the previous execution,
# and the difference strips the co-authors back out instead of appending them
# a second time. This keeps the cell idempotent and the ids stable.
unique_names = sorted(set(unique_names).difference(unique_authors)) + unique_authors
len(unique_names)

123097

In [9]:
# Give every name a dense integer id equal to its position in unique_names
# (0 .. len(unique_names) - 1). enumerate replaces the hand-maintained counter
# and keeps the loop variables out of the notebook namespace.
name_map = {name: idx for idx, name in enumerate(unique_names)}

# A duplicated name would silently collapse two positions into a single id and
# leave gaps in the id range, so fail loudly instead of persisting bad ids.
assert len(name_map) == len(unique_names), "unique_names contains duplicate entries"

len(name_map)

123097

In [10]:
# Translate both name columns to their integer ids:
#   nid - id of the value in "name"
#   aid - id of the value in "authors"
# Mapping with the dict directly avoids calling a Python lambda once per row.
nid_column = df_productions["name"].map(name_map)
aid_column = df_productions["authors"].map(name_map)

# Series.map with a dict yields NaN for unknown keys instead of raising, so
# keep the fail-fast behaviour of the previous name_map[x] lookup explicit.
assert nid_column.notna().all() and aid_column.notna().all(), (
    "name_map does not cover every value of 'name' / 'authors'"
)

# Column layout of the table persisted by this step. The "location" column of
# the preprocessing output is not carried over.
output_columns = [
    "production_id", "name", "citation", "nid", "lattes_id", "institution",
    "production", "aid", "authors", "type", "year", "issn",
]

# .assign returns a new frame rather than writing into the loaded one, so the
# cell can be re-run without assigning columns on a column-subset frame.
df_productions = df_productions.assign(nid=nid_column, aid=aid_column)[output_columns]
df_productions

Unnamed: 0,production_id,name,citation,nid,lattes_id,institution,production,aid,authors,type,year,issn
0,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",830,3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,119076,Wander Gaspar,PERIODICO,2015,17442621
1,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",830,3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,1384,Regina Maria Maciel Braga,PERIODICO,2015,17442621
2,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",830,3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,43349,Fernanda Cláudia Alves Campos,PERIODICO,2015,17442621
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",830,3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,830,Jose Maria Nazar David,PERIODICO,2015,17442621
4,3,Jose Maria Nazar David,"[Jose Maria Nazar David, DAVID, JOSÉ MARIA N.,...",830,3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,112697,Tatiane Ornelas Martins Alves,PERIODICO,2015,17442621
...,...,...,...,...,...,...,...,...,...,...,...,...
495575,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",952,6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,1050,Luís Felipe Ignácio Cunha,CONFERENCIA,2023,
495576,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",952,6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,113452,Thiago Lopes Nascimento,CONFERENCIA,2023,
495577,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",952,6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,642,Fábio Protti,CONFERENCIA,2023,
495578,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, ARAUJO, L. S., L....",952,6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,1050,Luís Felipe Ignácio Cunha,CONFERENCIA,2023,


In [11]:
# Persist the id-annotated table as this step's "productions" output.
transformer.write_parquet(
    df_productions,
    name="productions",
    step="03-transformation",
)