In [36]:
import json
import re
import requests
import itertools
from collections import defaultdict, Counter

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [37]:
# Read the raw paper records from disk and parse them as JSON.
with open("data/data.json", encoding="utf-8") as f:
    data = json.loads(f.read())

# Show the first record to sanity-check the structure.
data[0]

{'title': 'MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies',
 'doi': '10.48550/arxiv.2404.06395',
 'abstract': "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warm

In [38]:
# Keys looked up on every paper record by the filtering step: a paper is kept
# only when each of these category sources has at least one non-empty sub-list.
required_fields = [
    "orkg categories",
    "papers with code categories",
    "openalex categories",
    "openaire categories"
]

In [39]:
# Keep only the papers for which every required category source holds at least
# one non-empty sub-field.
valid_papers = []

for paper in data:
    present_fields = 0
    for field in required_fields:
        # `.get` + isinstance: a paper that lacks a source entirely (or stores
        # None / a non-dict there) counts as "not present" instead of raising.
        groups = paper.get(field)
        if isinstance(groups, dict) and any(groups.values()):
            present_fields += 1
    if present_fields == len(required_fields):
        valid_papers.append(paper)

# Derived from the list so the counter can never drift from its contents.
count_all_present = len(valid_papers)

print(f"Valid papers: {count_all_present}")


Valid papers: 70


In [40]:
# For each category source (top-level key of a paper record), the sub-fields
# whose values are lower-cased and merged into that source's `<source>_flat` list.
category_fields = {
    "orkg categories": ["domains", "methods", "research problems", "tasks"],
    "papers with code categories": ["tasks", "methods", "main_collection_name", "main_collection_area"],
    "openalex categories": ["primary topics", "topics", "concepts"],
    "openaire categories": ["subjects"]
}

In [41]:
# Lower-case every category label in place and add one de-duplicated
# "<source>_flat" list per category source to each paper.
# NOTE: this mutates the paper dicts, which are shared with `data`.
for entry in valid_papers:
    for field, subfields in category_fields.items():
        flat_list = []
        if field in entry:
            for sub in subfields:
                # A missing or None sub-field is treated as "no labels".
                values = entry[field].get(sub) or []
                # A sub-field stored as a single string would otherwise be
                # iterated character by character; wrap it so it stays one label.
                # NOTE(review): confirm whether any sub-field is actually a scalar.
                if isinstance(values, str):
                    values = [values]
                cleaned = [s.lower() for s in values if isinstance(s, str)]
                entry[field][sub] = cleaned
                flat_list.extend(cleaned)
        # sorted() instead of list(): set iteration order varies between runs.
        entry[f"{field.replace(' ', '_')}_flat"] = sorted(set(flat_list))


In [42]:
# Display the first retained paper, now carrying the lower-cased `*_flat` lists.
valid_papers[0]

{'title': 'MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies',
 'doi': '10.48550/arxiv.2404.06395',
 'abstract': "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warm

In [43]:
def extract_clean_fields(entry):
    """Project a paper record onto the fields kept in the output dataset.

    Args:
        entry: Paper dict; any of the expected keys may be absent.

    Returns:
        A new dict with "title", "doi" and "abstract" (defaulting to "") followed
        by the four "*_flat" category lists, each sorted (defaulting to []).
    """
    text_keys = ("title", "doi", "abstract")
    flat_keys = (
        "orkg_categories_flat",
        "papers_with_code_categories_flat",
        "openalex_categories_flat",
        "openaire_categories_flat",
    )

    record = {key: entry.get(key, "") for key in text_keys}
    for key in flat_keys:
        record[key] = sorted(entry.get(key, []))
    return record

# One slimmed-down record per valid paper.
initial_data = list(map(extract_clean_fields, valid_papers))

In [44]:
# Display the first slimmed-down record (text fields plus sorted `*_flat` lists).
initial_data[0]

{'title': 'MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies',
 'doi': '10.48550/arxiv.2404.06395',
 'abstract': "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warm

In [45]:
def fetch_arxiv_taxonomy(url="https://arxiv.org/category_taxonomy", timeout=30):
    """Download the arXiv taxonomy page and map category codes to their names.

    Args:
        url: Page to scrape; defaults to the arXiv category taxonomy page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Dict mapping each lower-cased code to its lower-cased descriptor, for
        every `<span class="descriptor">NAME</span> (CODE)` occurrence in the
        page. Empty if the page contains no such occurrence.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
    """
    # requests has no default timeout; without one a stalled connection hangs forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty mapping.
    response.raise_for_status()
    # NOTE(review): confirm this pattern still matches the page's current markup;
    # if it does not, the result is silently empty and no codes get decoded.
    matches = re.findall(r'<span class="descriptor">([^<]+)</span>\s+\(([^)]+)\)', response.text)
    return {code.lower(): desc.lower() for desc, code in matches}


In [46]:
# Code -> label lookup scraped from the arXiv taxonomy page (performs a network request).
arxiv_map = fetch_arxiv_taxonomy()

# Hard-coded lookup from lower-cased classification codes to readable labels
# (presumably ACM Computing Classification System codes, going by the name).
# clean_openaire_categories drops codes shaped like "x.N" or "x.N.N" that are
# not listed here.
acm_ccs_map = {
    "i.4": "image processing and computer vision",
    "h.2.8": "database applications",
    "h.2": "database management",
    "k.4": "computers and society",
    "d.2": "software engineering",
    "c.2": "computer-communication networks"
}


In [47]:
def clean_openaire_categories(categories, arxiv_map, acm_map):
    """Normalise raw subject strings into a sorted list of readable labels.

    Each category is stripped, lower-cased and relieved of a leading "fos:"
    prefix. Parenthesised codes such as "(cs.cl)" are looked up (arXiv map
    first, then ACM map) and removed from the text. The remaining text is
    replaced by its label when it is itself a known code, dropped when it looks
    like an unmapped code ("x.N" or "x.N.N"), and kept as-is otherwise.

    Args:
        categories: Iterable of raw category strings; None is treated as empty
            and non-string items are skipped.
        arxiv_map: Dict from lower-cased arXiv codes to labels.
        acm_map: Dict from lower-cased ACM codes to labels.

    Returns:
        Sorted list of unique, non-empty labels.
    """
    decoded_labels = set()
    cleaned = set()

    for cat in categories or []:
        if not isinstance(cat, str):
            continue
        cat = cat.strip().lower()
        if cat.startswith("fos:"):
            cat = cat[4:].strip()

        # Decode every "(code)" embedded in the text, then strip it out.
        for match in re.findall(r"\(([\w\.\-]+)\)", cat):
            if match in arxiv_map:
                decoded_labels.add(arxiv_map[match])
            elif match in acm_map:
                decoded_labels.add(acm_map[match])
        cat = re.sub(r"\([\w\.\-]+\)", "", cat).strip()

        # Nothing left (e.g. the category was only "(cs.cl)" or whitespace):
        # do not emit an empty-string label.
        if not cat:
            continue
        if cat in arxiv_map:
            decoded_labels.add(arxiv_map[cat])
            continue
        if cat in acm_map:
            decoded_labels.add(acm_map[cat])
            continue
        # Bare codes without a known label carry no readable information.
        if not re.fullmatch(r"[a-z]\.\d+(\.\d+)?", cat):
            cleaned.add(cat)

    return sorted(cleaned.union(decoded_labels))


In [48]:
# Replace each record's raw OpenAIRE subjects with their cleaned, decoded labels.
for entry in initial_data:
    raw_subjects = entry.get("openaire_categories_flat", [])
    entry["openaire_categories_flat"] = clean_openaire_categories(raw_subjects, arxiv_map, acm_ccs_map)


In [49]:
# Display the second record to inspect its OpenAIRE labels after cleanup.
initial_data[1]

{'title': 'OLMo: Accelerating the Science of Language Models',
 'doi': '10.18653/v1/2024.acl-long.841',
 'abstract': 'Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models, including their biases and potential risks, we believe it is essential for the research community to have access to powerful, truly open LMs. To this end, we have built OLMo, a competitive, truly Open Language Model, to enable the scientific study of language models. Unlike most prior efforts that have only released model weights and inference code, we release OLMo alongside open training data and training and evaluation code. We hope this release will empower the open resear

In [50]:
# Persist the cleaned records as pretty-printed JSON, keeping non-ASCII characters readable.
serialized = json.dumps(initial_data, indent=4, ensure_ascii=False)
with open("initial_dataset.json", "w", encoding="utf-8") as f:
    f.write(serialized)