In [1]:
import json
import re
import pandas as pd
from pathlib import Path

In [4]:
def clean_string(string):
    """Return `string` lower-cased with every run of non-word characters removed.

    "Non-word" follows the regex class ``\\W``: letters, digits and the
    underscore are kept, everything else (spaces, punctuation, ...) is dropped.
    """
    word_chars_only = re.sub(r'\W+', '', string)
    return word_chars_only.lower()
    
def load_json(path):
    """Read the file at `path` and return its parsed JSON content.

    Args:
        path: Anything accepted by ``open`` (``str`` or ``pathlib.Path``).

    Returns:
        The Python object produced by ``json.load`` (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager guarantees the handle is closed, even when parsing
    # fails (the previous `f.close` without parentheses never closed it).
    # JSON text is UTF-8 by specification, so do not depend on the locale default.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def get_tutorial_reading_list(reading_list_ids):
    """Flatten a tutorial's sectioned reference ids into one list of reference records.

    Args:
        reading_list_ids: Iterable of section dicts, each providing
            "referencesIds", "sectionName" and "subsectionName".

    Returns:
        A list with one dict per non-empty reference id, in section order.
        Each dict has the keys "paperId", "key", "acl_id", "title",
        "abstract", "year", "section" and "subsection".

    Reference details are looked up in the module-level `references_metadata`;
    when a reference's abstract or year is empty there, the value is taken
    from the module-level `references_missing_metadata` instead.
    """
    entries = []
    for section in reading_list_ids:
        # filter(None, ...) drops falsy ids (e.g. "" or None).
        for ref_id in filter(None, section["referencesIds"]):
            ref = references_metadata[ref_id]
            paper_id = ref["paperId"]
            # NOTE(review): the key is built from the raw ref["year"], so it ends in
            # "none" when the year is missing even though the "year" field below
            # falls back to references_missing_metadata — confirm this is intended.
            key = clean_string(ref["title"] + str(ref["year"]))
            external_ids = ref["externalIds"]
            acl_id = external_ids["ACL"] if "ACL" in external_ids else ""
            title = ref["title"]
            abstract = ref["abstract"] or references_missing_metadata[ref_id]["abstract"]
            year = ref["year"] or references_missing_metadata[ref_id]["year"]
            entries.append({
                "paperId": paper_id,
                "key": key,
                "acl_id": acl_id,
                "title": title,
                "abstract": abstract,
                "year": year,
                "section": section["sectionName"],
                "subsection": section["subsectionName"],
            })
    return entries

def get_tutorial_reading_list_by_sections(reading_list_ids):
    """Build the reference records of a tutorial, grouped section by section.

    Args:
        reading_list_ids: Iterable of section dicts, each providing
            "referencesIds", "sectionName" and "subsectionName".

    Returns:
        A list with one inner list per input section (same order). Each inner
        list holds one dict per non-empty reference id of that section, with
        the keys "paperId", "key", "title", "abstract", "year", "section" and
        "subsection". A section without any usable id yields an empty list.

    Reference details come from the module-level `references_metadata`; an
    empty abstract or year is replaced by the value stored in the
    module-level `references_missing_metadata`.
    """
    grouped = []
    for section in reading_list_ids:
        current = []
        for ref_id in section["referencesIds"]:
            if not ref_id:
                # Empty placeholder id: nothing to look up.
                continue
            ref = references_metadata[ref_id]
            record = {
                "paperId": ref["paperId"],
                "key": clean_string(ref["title"] + str(ref["year"])),
                "title": ref["title"],
                "abstract": ref["abstract"] or references_missing_metadata[ref_id]["abstract"],
                "year": ref["year"] or references_missing_metadata[ref_id]["year"],
                "section": section["sectionName"],
                "subsection": section["subsectionName"],
            }
            current.append(record)
        grouped.append(current)
    return grouped

In [5]:
# Gather the JSON files of every year folder, visiting the folders in the order
# listed and sorting the paths found inside each one.
pathlist = []
for year in ["before_2020", "2020", "2021", "2022", "2023", "2024"]:
    year_files = sorted(Path(f"./data/{year}").glob('**/*.json'))
    pathlist.extend(year_files)

# Reference metadata, plus the file consulted when a reference's abstract or
# year is empty in the main metadata.
with open('./data/references_metadata.json', 'r') as f:
    references_metadata = json.load(f)
with open('./data/references_missing_metadata.json', 'r') as f:
    references_missing_metadata = json.load(f)

# Files whose name contains ".proceedings.json" describe proceedings; every
# other JSON file describes tutorials.
proceedings_paths = [path for path in pathlist if ".proceedings.json" in path.name]
tutorial_paths = [path for path in pathlist if ".proceedings.json" not in path.name]

# Merge the per-file dicts; on duplicate keys the file loaded last wins.
proceedings = {}
for path in proceedings_paths:
    proceedings = proceedings | load_json(path)

tutorials = {}
for path in tutorial_paths:
    tutorials = tutorials | load_json(path)

# Filter stats

In [6]:
# Build the flattened reading list of every tutorial listed by a proceedings entry.
lists = []
for proceeding in proceedings.values():
    for tutorial_key in proceeding["tutorials"]:
        tutorial_readings = tutorials[tutorial_key]["readingList"]
        lists.append(get_tutorial_reading_list(tutorial_readings))

print("{} tutorials found in the dataset".format(len(lists)))

121 tutorials found in the dataset


In [7]:
# Inclusive bounds on the number of references a reading list must have to be kept.
min_references = 3
max_references = 20

lists = []
for proceeding in proceedings.values():
    for tutorial_key in proceeding["tutorials"]:
        reading_list = get_tutorial_reading_list(tutorials[tutorial_key]["readingList"])
        if not (min_references <= len(reading_list) <= max_references):
            # Too short or too long: excluded from the count.
            continue
        lists.append(reading_list)

print(
    "{} tutorials remain after the filtering: [min:{} - max:{}] references in their reading lists".format(
        len(lists), min_references, max_references
    )
)

85 tutorials remain after the filtering: [min:3 - max:20] references in their reading lists


# Single-level list CSV generation

In [8]:
# Inclusive bounds on reading-list size; tutorials outside this range are not exported.
min_references = 3
max_references = 20

# One record per retained tutorial, keyed by tutorial id.
lists = {}

for proceeding in proceedings.values():
    for tutorial_key in proceeding["tutorials"]:
        tutorial = tutorials[tutorial_key]
        reading_list = get_tutorial_reading_list(tutorial["readingList"])
        # Filter with the constants above (previously hard-coded as 3 and 20, so
        # editing the constants had no effect on the export).
        if min_references <= len(reading_list) <= max_references:
            lists[tutorial_key] = {
                "id": tutorial_key,
                "title": tutorial["title"],
                "abstract": tutorial["abstract"],
                "year": tutorial["year"],
                "url": tutorial["url"],
                # Acronyms of all venues of the enclosing proceedings entry.
                "venues": ", ".join(venue["acronym"] for venue in proceeding["venues"]),
                "reading_list": reading_list
            }

# The dict keys only serve as row labels; index=False keeps them out of the CSV.
pd.DataFrame.from_dict(lists, orient='index').to_csv('reading_lists.csv', index=False)

# Section-level list CSV generation

In [9]:
# One record per (tutorial, section) pair, keyed by tutorial id + section index.
lists = {}

for proceeding in proceedings.values():
    for tutorial_key in proceeding["tutorials"]:
        tutorial = tutorials[tutorial_key]
        reading_list = get_tutorial_reading_list(tutorial["readingList"])
        # Same inclusive size filter as the single-level export, applied to the
        # tutorial's whole reading list. Uses min_references / max_references
        # defined in the cells above instead of repeating the literals 3 and 20.
        if not (min_references <= len(reading_list) <= max_references):
            continue
        reading_list_by_sections = get_tutorial_reading_list_by_sections(tutorial["readingList"])
        for i, section in enumerate(reading_list_by_sections):
            if not section:
                # A section without any usable reference id has no entry to read
                # the section names from (section[0] would raise IndexError) and
                # nothing to export, so it is skipped.
                continue
            lists[tutorial_key+str(i)] = {
                "id": tutorial_key,
                "title": tutorial["title"],
                "abstract": tutorial["abstract"],
                "year": tutorial["year"],
                "url": tutorial["url"],
                "venues": ", ".join(venue["acronym"] for venue in proceeding["venues"]),
                # Every entry of a section carries the same section/subsection names.
                "section": section[0]["section"],
                "subsection": section[0]["subsection"],
                # Export this section's references only; the previous code stored the
                # whole tutorial list on every section row, making all rows of a
                # tutorial identical apart from the section names.
                "reading_list": section
            }

# The dict keys only serve as row labels; index=False keeps them out of the CSV.
pd.DataFrame.from_dict(lists, orient='index').to_csv('reading_lists_sections.csv', index=False)