In [3]:
import subprocess
import downloader
from reader import Reader
from ontology import export_ontology

## How does it work?
This notebook takes a list of papers (DOIs and titles) from `data/lists/{named_list}.txt/` and process them as follows:
1. Download and store the papers in `data/data/papers/{named_list}/`
2. Instantiate a `Reader(named_list)` and loads the pdf files along any previously information stored in cache `data/reader/cache/{named_list}.json`
3. Extract metadata using the DOIs and calling external sources
4. Extract topics using different classifiers. At the moment "acm" and "dbpedia" are available.
5. Dump the results to a cached `json` file. This allows you to load them later instead of having to process everything again.
6. Export the data to a tsv file in a format that klink uses as input to build the ontology.
7. Run the klink algorithm (In R scripts). /!!\ If klink2 is already set up with the correct parameters, the R algorithm can be called from Python using this notebook, otherwise, it's recommended to run `input.R` and `klink-2.R` scripts individually. I find easier to use RStudio in this case.
8. Export the ontology to ttl file, so it can be easily loaded and visualised at https://service.tib.eu/webvowl/

In [20]:
named_list = "fairness"

In [3]:
# downloader.download(named_list=named_list)

In [3]:
reader = Reader(named_list)
reader.load(from_inc=0)


Processing 10_1145-3442188_3445865.pdf: 100%|██████████| 201/201 [00:33<00:00,  6.00it/s]     


In [57]:
reader.extract_metadata()

Processing 10.1145/3442188.3445865: 100%|██████████| 31/31 [05:08<00:00,  9.97s/it]     


In [58]:
classifiers = ["acm", "dbpedia"]
reader.extract_classification(*classifiers)

Processing 10.1145/3442188.3445865: 100%|██████████| 31/31 [03:17<00:00,  6.37s/it]     


In [4]:
reader.metadata_collection("dataframe")

Unnamed: 0,file_name,doi,title,authors,journal,publisher,year,keywords,topics_acm,topics_dbpedia
0,10_1145-3404835_3462948.pdf,10.1145/3404835.3462948,Fairness among New Items in Cold Start Recomme...,"Zhu, Ziwei;Kim, Jingu;Nguyen, Trung;Fenton, Ai...",,ACM,2021,fairness;cold start recommendation,information systems;information retrieval;retr...,rawlsian;recommender systems;generative model;...
1,10_1145-3303772_3303791.pdf,10.1145/3303772.3303791,Evaluating the Fairness of Predictive Student ...,"Gardner, Josh;Brooks, Christopher;Baker, Ryan",,ACM,2019,fairness;machine learning;moocs,applied computing;education;computer-assisted ...,learning analytics;metric;moocs;predictive mod...
2,10_1145-3616865.pdf,10.1145/3616865,Fairness in Machine Learning: A Survey,"Caton, Simon;Haas, Christian",ACM Computing Surveys,Association for Computing Machinery (ACM),2023,fairness;accountability;transparency;machine l...,,
3,10_1145-3340531_3411980.pdf,10.1145/3340531.3411980,Fair Class Balancing: Enhancing Model Fairness...,"Yan, Shen;Kao, Hsien-te;Ferrara, Emilio",,ACM,2020,fairness;bias;class balancing,computing methodologies;machine learning;socia...,machine learning;screening;spur
4,10_1145-3359061_3361084.pdf,10.1145/3359061.3361084,Is mutation score a fair metric?,"Souza, Beatriz",,ACM,2019,mutation testing;mutation score;test suite eff...,software and its engineering;software creation...,metric;mutation;test suite
...,...,...,...,...,...,...,...,...,...,...
196,10_1145-3394486_3403199.pdf,10.1145/3394486.3403199,Evaluating Fairness Using Permutation Tests,"DiCiccio, Cyrus;Vasudevan, Sriram;Basu, Kinjal...",,ACM,2020,permutation tests;fairness;asymptotics,information systems;world wide web;web applica...,permutation;machine learning;gravity;metric;hy...
197,10_1145-3531146_3533146.pdf,10.1145/3531146.3533146,Fair Representation Clustering with Several Pr...,"Dai, Zhen;Makarychev, Yury;Vakilian, Ali",,ACM,2022,fair k-median;clustering;approximation algorit...,,k-median problem;metric space;ck;fj;exponentia...
198,10_1109-ICSE48619_2023_00133.pdf,10.1109/ICSE48619.2023.00133,,,,,,fairness;ensemble;machine learning;models,,
199,10_1145-3600211_3604657.pdf,10.1145/3600211.3604657,Fairness Implications of Encoding Protected Ca...,"Mougan, Carlos;Alvarez, Jose Manuel;Ruggieri, ...",,ACM,2023,fairness;algorithmic accountability;categorica...,computing methodologies;machine learning;learn...,


In [70]:
len([1 for paper in reader.paper_list if paper.topics.get("acm") or paper.topics.get("dbpedia")])

176

In [61]:
reader.dump(overwrite=False)

In [64]:
reader.export_as_klink_input(classification_source="acm")

Unnamed: 0,DE,TI,AU,SO,SC,PY
0,fairness;cold start recommendation,Fairness among New Items in Cold Start Recomme...,"Zhu, Ziwei;Kim, Jingu;Nguyen, Trung;Fenton, Ai...",ACM,information systems;information retrieval;retr...,2021
1,fairness;machine learning;moocs,Evaluating the Fairness of Predictive Student ...,"Gardner, Josh;Brooks, Christopher;Baker, Ryan",ACM,applied computing;education;computer-assisted ...,2019
3,fairness;bias;class balancing,Fair Class Balancing: Enhancing Model Fairness...,"Yan, Shen;Kao, Hsien-te;Ferrara, Emilio",ACM,computing methodologies;machine learning;socia...,2020
4,mutation testing;mutation score;test suite eff...,Is mutation score a fair metric?,"Souza, Beatriz",ACM,software and its engineering;software creation...,2019
5,fairness;spatial-temporal applications;privacy,Analysing Fairness of Privacy-Utility Mobility...,"Zhan, Yuting;Haddadi, Hamed;Mashhadi, Afra",ACM,security and privacy;human and societal aspect...,2023
...,...,...,...,...,...,...
192,fairness;named entity recognition;clinical de-...,In the Name of Fairness: Assessing the Bias in...,"Xiao, Yuxin;Lim, Shulammite;Pollard, Tom Josep...",ACM,computing methodologies;artificial intelligenc...,2023
194,decision trees;information gain;domain adaptat...,Domain Adaptive Decision Trees: Implications f...,"Alvarez, Jose M.;Scott, Kristen M.;Berendt, Be...",ACM,computing methodologies;machine learning;learn...,2023
196,permutation tests;fairness;asymptotics,Evaluating Fairness Using Permutation Tests,"DiCiccio, Cyrus;Vasudevan, Sriram;Basu, Kinjal...",ACM,information systems;world wide web;web applica...,2020
199,fairness;algorithmic accountability;categorica...,Fairness Implications of Encoding Protected Ca...,"Mougan, Carlos;Alvarez, Jose Manuel;Ruggieri, ...",ACM,computing methodologies;machine learning;learn...,2023


In [21]:

%cd "./klink2/"
subprocess.run(["Rscript", "main.R", named_list])
%cd "../"

[Errno 2] No such file or directory: './klink2/'
/Users/jmatias/Documents/develop/aiod-ontology/src/klink2
[1] "fairness"
/Users/jmatias/Documents/develop/aiod-ontology/src


  bkms = self.shell.db.get('bookmarks', {})
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
export_ontology(named_list + "-1")