In [1]:
# Switch the kernel's working directory to the "ppgcc-coautorias" directory;
# the cell output confirms it becomes the current working directory.
# NOTE(review): presumably this must run before the `src.*` import in the next
# cell so that the package resolves from the project root — confirm.
from utils import setrootdir
setrootdir("ppgcc-coautorias")

'Directory ppgcc-coautorias successfully loaded as current working directory.'

In [2]:
import os

from dotenv import load_dotenv

from src.preprocessing import Preprocesser

In [3]:
# Load key/value pairs from a local .env file (if one is found) into the
# process environment.
load_dotenv()

# Dataset location and metadata file name come from the environment so that no
# machine-specific path is hardcoded in the notebook.
DATASET_DIRECTORY = os.getenv("DATASET_DIRECTORY")
METADATA_FILE = os.getenv("METADATA_FILE")

# Fail fast: os.getenv returns None for an unset variable, and that None would
# otherwise be passed unchecked to Preprocesser in the next cell. Empty strings
# are treated as missing as well.
_missing_env = [
    name
    for name, value in (
        ("DATASET_DIRECTORY", DATASET_DIRECTORY),
        ("METADATA_FILE", METADATA_FILE),
    )
    if not value
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env)
        + ". Define them in the environment or in a .env file."
    )

# 2. Preprocessing

In [4]:
# Build the preprocessing helper from the dataset directory and metadata file
# read from the environment in the configuration cell.
# NOTE(review): `step_directory` presumably names the sub-directory where this
# step's artifacts are stored; the same "02-preprocessing" string is passed
# again to write_parquet at the end of the notebook — confirm in Preprocesser.
preprocesser = Preprocesser(
    data_dir=DATASET_DIRECTORY,
    metadata_file=METADATA_FILE,
    step_directory="02-preprocessing",
)

## 2.1. DataFrames construction

In [5]:
# Build the institution registry from the "01-selection" directory.
# NOTE(review): presumably that directory holds the output of the previous
# (selection) step of the pipeline — confirm.
institution_registry = preprocesser.extract_institution_registry(
    selection_dir_name="01-selection"
)

# Bare last expression: renders the registry's summary (institution and
# researcher counts) as the cell output.
institution_registry

InstitutionRegistry
- Institutions: 67
- Total researchers: 1810
- Researchers per institution:
	- Average: 27.01
	- Median: 23
	- P25: 15.0, P75: 32.0
- Largest: UFPE (89)
- Smallest: UERN (7)
- Top 3 institutions: UFPE (89), USP-SC (77), UFMG (66)

### 2.1.1. Productions

In [6]:
# Build the productions DataFrame from the institution registry. In the
# displayed output each row is one production of one researcher, with the
# co-authors held as a list in the `authors` column.
# NOTE(review): the "Unnamed: 0" column in the output looks like a serialized
# index picked up from an earlier read — confirm whether it is intended.
df_productions = preprocesser.frame_productions_all_institutions(institution_registry)
# Bare last expression: displays the frame (pandas truncates the rendering).
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,"[Jose Maria Nazar David, Marcos R. S. Borges]",International Journal of Computer Applications...,PERIODICO,2004,09528091
1,1,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,"[Rita Suzana Pitangueira Maciel, Jose Maria Na...",JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
2,2,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Management of Scientific Experiments in Comput...,"[Regina Maria Maciel Braga, Fernanda Cláudia A...",iSys - Revista Brasileira de Sistemas de Infor...,PERIODICO,2012,19842902
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,"[Wander Gaspar, Regina Maria Maciel Braga, Fer...","International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
4,4,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,PERSONNA: proposta de ontologia de contexto e ...,"[Paulo Rezende, Crystiam Kelle Pereira, Fernan...",Revista Brasileira de Informática na Educação,PERIODICO,2015,14145685
...,...,...,...,...,...,...,...,...,...,...,...
230926,230926,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Pruning Weightless Neural Networks,"[Zachary Susskind, Igor Dantas Dos Santos Mira...",30th European Symposium on Artificial Neural N...,CONFERENCIA,2022,
230927,230927,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,GSink - A Runtime for Gamma Programs and its C...,"[MELLO, RUI R., PAILLARD, GABRIEL A. L., Leand...",2023 IEEE International Parallel and Distribut...,CONFERENCIA,2023,
230928,230928,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Heuristics for t-admissibility with complex ne...,"[Carlos Thadeu Duarte Santos, Anderson Zudio, ...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
230929,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,"[Felipe Souza, Leandro Santiago de Araujo, Luí...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,


## 2.2. Productions with no authors

In [8]:
# Drop productions that have no authors. In the displayed output the last index
# label ends up 14 behind `production_id`, i.e. the index appears to be
# renumbered after the drop while `production_id` keeps its original values.
# Note: this rebinds `df_productions`, so the unfiltered frame is no longer
# reachable under this name after the cell runs.
df_productions = preprocesser.drop_authorless_productions(df_productions)
# Bare last expression: displays the filtered frame.
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,"[Jose Maria Nazar David, Marcos R. S. Borges]",International Journal of Computer Applications...,PERIODICO,2004,09528091
1,1,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,"[Rita Suzana Pitangueira Maciel, Jose Maria Na...",JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
2,2,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Management of Scientific Experiments in Comput...,"[Regina Maria Maciel Braga, Fernanda Cláudia A...",iSys - Revista Brasileira de Sistemas de Infor...,PERIODICO,2012,19842902
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,"[Wander Gaspar, Regina Maria Maciel Braga, Fer...","International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
4,4,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,PERSONNA: proposta de ontologia de contexto e ...,"[Paulo Rezende, Crystiam Kelle Pereira, Fernan...",Revista Brasileira de Informática na Educação,PERIODICO,2015,14145685
...,...,...,...,...,...,...,...,...,...,...,...
230912,230926,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Pruning Weightless Neural Networks,"[Zachary Susskind, Igor Dantas Dos Santos Mira...",30th European Symposium on Artificial Neural N...,CONFERENCIA,2022,
230913,230927,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,GSink - A Runtime for Gamma Programs and its C...,"[MELLO, RUI R., PAILLARD, GABRIEL A. L., Leand...",2023 IEEE International Parallel and Distribut...,CONFERENCIA,2023,
230914,230928,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Heuristics for t-admissibility with complex ne...,"[Carlos Thadeu Duarte Santos, Anderson Zudio, ...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
230915,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,"[Felipe Souza, Leandro Santiago de Araujo, Luí...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,


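## 2.3. Citation normalization

Author names written in citation form are replaced by the full name when they can be resolved (e.g. "Marcos R. S. Borges" becomes "Marcos Roberto da Silva Borges"); the `authors` list is then expanded into one row per author.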

In [9]:
# Normalize the names inside the `authors` lists. In the displayed output some
# citation-style entries are replaced by full names (e.g. "Marcos R. S. Borges"
# -> "Marcos Roberto da Silva Borges") while others (e.g. "MELLO, RUI R.") are
# left as they were.
# NOTE(review): presumably names are matched against the researchers'
# `citation` alias lists — confirm in Preprocesser.normalize_authors.
df_productions = preprocesser.normalize_authors(df_productions)
# Bare last expression: displays the normalized frame.
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,"[Jose Maria Nazar David, Marcos Roberto da Sil...",International Journal of Computer Applications...,PERIODICO,2004,09528091
1,1,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,"[Rita Suzana Pitangueira Maciel, Jose Maria Na...",JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
2,2,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Management of Scientific Experiments in Comput...,"[Regina Maria Maciel Braga, Fernanda Cláudia A...",iSys - Revista Brasileira de Sistemas de Infor...,PERIODICO,2012,19842902
3,3,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Scientific Provenance Metadata Capture and Man...,"[Wander Gaspar, Regina Maria Maciel Braga, Fer...","International Journal of Metadata, Semantics a...",PERIODICO,2015,17442621
4,4,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,PERSONNA: proposta de ontologia de contexto e ...,"[Paulo Rezende, Crystiam Kelle Pereira, Fernan...",Revista Brasileira de Informática na Educação,PERIODICO,2015,14145685
...,...,...,...,...,...,...,...,...,...,...,...
230912,230926,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Pruning Weightless Neural Networks,"[Zachary Susskind, Igor Dantas Dos Santos Mira...",30th European Symposium on Artificial Neural N...,CONFERENCIA,2022,
230913,230927,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,GSink - A Runtime for Gamma Programs and its C...,"[MELLO, RUI R., Gabriel Antoine Louis Paillard...",2023 IEEE International Parallel and Distribut...,CONFERENCIA,2023,
230914,230928,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Heuristics for t-admissibility with complex ne...,"[Carlos Thadeu Duarte Santos, Anderson Zudio, ...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
230915,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,"[Felipe Souza, Leandro Santiago de Araujo, Luí...",SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,


In [10]:
# Expand the `authors` lists into one row per author. In the displayed output
# `authors` holds a single name per row, `production_id` repeats across the
# rows of the same production, and the index is renumbered.
# NOTE(review): because `df_productions` is rebound here, re-running this cell
# alone would feed the already-exploded frame back into explode_authors —
# re-run from the construction cell instead.
df_productions = preprocesser.explode_authors(df_productions)
# Bare last expression: displays the exploded frame.
df_productions

Unnamed: 0,production_id,name,citation,lattes_id,institution,production,authors,location,type,year,issn
0,0,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,Jose Maria Nazar David,International Journal of Computer Applications...,PERIODICO,2004,09528091
1,0,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Designing collaboration through a web-based gr...,Marcos Roberto da Silva Borges,International Journal of Computer Applications...,PERIODICO,2004,09528091
2,1,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,Rita Suzana Pitangueira Maciel,JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
3,1,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,Jose Maria Nazar David,JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
4,1,Jose Maria Nazar David,"[Jose Maria Nazar David, MARIA DAVID, JOSÉ, DA...",3640497501056163,UFJF,Supporting Awareness in Groupware through an A...,Michel Oei,JOURNAL OF UNIVERSAL COMPUTER SCIENCE,PERIODICO,2009,0948695X
...,...,...,...,...,...,...,...,...,...,...,...
936713,230929,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Gerando base de grafos não isomorfos com seus ...,Luís Felipe Ignácio Cunha,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
936714,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Thiago Lopes Nascimento,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
936715,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Fábio Protti,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,
936716,230930,Leandro Santiago de Araujo,"[Leandro Santiago de Araujo, Leandro Santiago,...",6358983442870515,UFF,Paralelismo e Heurísticas para o problema da m...,Luís Felipe Ignácio Cunha,SBPO 2023: Simpósio Brasileiro de Pesquisa Ope...,CONFERENCIA,2023,


In [12]:
# Persist the exploded productions frame as the "productions" artifact of the
# "02-preprocessing" step.
# NOTE(review): the output path is decided inside Preprocesser.write_parquet;
# `step` repeats the `step_directory` given to the constructor — confirm the
# two are meant to stay in sync.
preprocesser.write_parquet(df_productions, step="02-preprocessing", name="productions")