In [132]:
import json
import re
import requests
import itertools
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [133]:
# Read the raw paper records from disk, then display the first record.
with open("data.json", encoding="utf-8", mode="r") as f:
    data = json.loads(f.read())
data[0]

{'title': 'MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies',
 'doi': '10.48550/arxiv.2404.06395',
 'abstract': "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warm

In [134]:
# Category sources that a paper record is checked against when selecting
# valid papers.
required_fields = [
    f"{source} categories"
    for source in ("orkg", "papers with code", "openalex", "openaire")
]

In [None]:
# Keep only the papers for which every required category source has at least
# one non-empty subfield.
count_all_present = 0
valid_papers = []
paper_count = 0
for paper in data:
    paper_count += 1
    populated_fields = 0
    for field in required_fields:
        subfields = paper.get(field)
        # A missing or non-dict field counts as "no categories" instead of
        # aborting the whole loop with a KeyError/AttributeError.
        if isinstance(subfields, dict) and any(subfields.values()):
            populated_fields += 1
    # Compare against the number of required fields rather than a literal 4 so
    # the check stays correct when required_fields is edited.
    if populated_fields == len(required_fields):
        count_all_present += 1
        valid_papers.append(paper)
print(f'Valid papers: {count_all_present}')

Valid papers: 70


In [136]:
# Per category source: the subfields whose string lists are lower-cased and
# merged into that source's "<source>_flat" list.
category_fields = {}
category_fields["orkg categories"] = ["domains", "methods", "research problems", "tasks"]
category_fields["papers with code categories"] = [
    "tasks",
    "methods",
    "main_collection_name",
    "main_collection_area",
]
category_fields["openalex categories"] = ["primary topics", "topics", "concepts"]
category_fields["openaire categories"] = ["subjects"]

In [137]:
# Lower-case every category string of each valid paper in place and attach one
# de-duplicated "<field>_flat" list per category source.
for entry in valid_papers:
    entry.pop("crossref categories", None)
    for field, subfields in category_fields.items():
        flat_list = []
        if field in entry and isinstance(entry[field], dict):
            for subfield in subfields:
                if subfield in entry[field] and isinstance(entry[field][subfield], list):
                    # Non-string items are dropped from the subfield.
                    cleaned = [item.lower() for item in entry[field][subfield] if isinstance(item, str)]
                    entry[field][subfield] = cleaned
                    flat_list.extend(cleaned)
        flat_key = field.replace(" ", "_") + "_flat"
        # sorted() rather than list(): the iteration order of a set of strings
        # can change between interpreter runs (string hash randomization),
        # which would make the flat lists differ from one kernel run to the next.
        entry[flat_key] = sorted(set(flat_list))


In [177]:
# Lower-case every category string of every paper in place and attach one
# de-duplicated "<field>_flat" list per category source. The records in
# valid_papers are the same dict objects as in data, so they are covered too.
for entry in data:
    entry.pop("crossref categories", None)
    for field, subfields in category_fields.items():
        flat_list = []
        if field in entry and isinstance(entry[field], dict):
            for subfield in subfields:
                if subfield in entry[field] and isinstance(entry[field][subfield], list):
                    # Non-string items are dropped from the subfield.
                    cleaned = [item.lower() for item in entry[field][subfield] if isinstance(item, str)]
                    entry[field][subfield] = cleaned
                    flat_list.extend(cleaned)
        flat_key = field.replace(" ", "_") + "_flat"
        # sorted() rather than list(): the iteration order of a set of strings
        # can change between interpreter runs (string hash randomization),
        # which would make the flat lists differ from one kernel run to the next.
        entry[flat_key] = sorted(set(flat_list))
data[0]

{'title': 'MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies',
 'doi': '10.48550/arxiv.2404.06395',
 'abstract': "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warm

In [None]:
# Preview the first paper that passed the required-fields filter.
# NOTE(review): the saved output of this cell is `70`, which is not a paper
# record — the output looks stale; re-run the cell to refresh it.
valid_papers[0]

70

In [178]:
def _to_clean_record(entry):
    """Return a slim record: title, doi, abstract and the four flat category lists.

    Every category list is a sorted copy, so the result has a stable order and
    shares no list object with ``entry`` or with other slim records.
    Missing keys fall back to "" (metadata) or [] (categories).
    """
    return {
        "title": entry.get("title", ""),
        "doi": entry.get("doi", ""),
        "abstract": entry.get("abstract", ""),
        # Sorted like the other sources; previously this list was passed
        # through unsorted and aliased with the source record.
        "orkg_categories_flat": sorted(entry.get("orkg_categories_flat", [])),
        "papers_with_code_categories_flat": sorted(entry.get("papers_with_code_categories_flat", [])),
        "openalex_categories_flat": sorted(entry.get("openalex_categories_flat", [])),
        "openaire_categories_flat": sorted(entry.get("openaire_categories_flat", []))
    }


# Slim records for the papers that have all required sources ...
cleaned_data = [_to_clean_record(entry) for entry in valid_papers]
# ... and for every paper in the dataset.
all_data = [_to_clean_record(entry) for entry in data]

In [168]:
# Inspect the first slimmed-down valid-paper record.
cleaned_data[0]

{'title': 'MiniCPM: Unveiling the Potential of Small Language Models with Scalable\n  Training Strategies',
 'doi': '10.48550/arxiv.2404.06395',
 'abstract': "The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameters has been met with concerns regarding resource efficiency and practical expense, particularly given the immense cost of experimentation. This scenario underscores the importance of exploring the potential of Small Language Models (SLMs) as a resource-efficient alternative. In this context, we introduce MiniCPM, specifically the 1.2B and 2.4B non-embedding parameter variants, not only excel in their respective categories but also demonstrate capabilities on par with 7B-13B LLMs. While focusing on SLMs, our approach exhibits scalability in both model and data dimensions for future LLM research. Regarding model scaling, we employ extensive model wind tunnel experiments for stable and optimal scaling. For data scaling, we introduce a Warm

In [169]:
def fetch_arxiv_taxonomy(timeout=30):
    """Download the arXiv category taxonomy page and extract a code -> name map.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        dict mapping the lower-cased text found in parentheses (the category
        code) to the lower-cased descriptor text preceding it. The dict is
        empty when nothing on the page matches the pattern.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    url = "https://arxiv.org/category_taxonomy"
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty map.
    response.raise_for_status()
    html = response.text

    # Extract descriptors and category codes.
    # NOTE(review): the pattern expects `<span class="descriptor">NAME</span> (CODE)`;
    # confirm the taxonomy page still uses this markup, otherwise the result is empty.
    pattern = r'<span class="descriptor">([^<]+)</span>\s+\(([^)]+)\)'
    matches = re.findall(pattern, html)
    return {code.lower(): desc.lower() for desc, code in matches}

In [170]:
# arXiv code -> label lookup, fetched from the arXiv website.
arxiv_map = fetch_arxiv_taxonomy()

# ACM CCS code -> label lookup with lower-case keys.
# You can expand this manually or scrape from dl.acm.org/ccs
acm_ccs_map = {
    "i.4": "image processing and computer vision",
    "h.2.8": "database applications",
    "h.2": "database management",
    "k.4": "computers and society",
    "d.2": "software engineering",
    "c.2": "computer-communication networks",
}

In [171]:
def clean_openaire_categories(categories, arxiv_map, acm_map):
    """Normalise OpenAIRE subject strings into a sorted list of readable labels.

    Each category is stripped and lower-cased, then:
      1. a leading "fos:" prefix is removed;
      2. codes in parentheses, e.g. "(cs.cl)", are looked up in ``arxiv_map``
         and then ``acm_map`` and removed from the text;
      3. a category that is itself a known code is replaced by its label, and
         one that only looks like a code ("i.2.7", "cs.xx") is dropped;
      4. any remaining non-empty text is kept as a label.

    Args:
        categories: Iterable of category strings.
        arxiv_map: dict of lower-case arXiv code -> label.
        acm_map: dict of lower-case ACM CCS code -> label.

    Returns:
        Sorted list of unique labels (kept text plus decoded code labels).
    """
    decoded_labels = set()
    cleaned = set()
    code_in_parens = re.compile(r"\(([\w\.\-]+)\)")

    for cat in categories:
        cat = cat.strip().lower()

        # Step 1: Remove the FOS prefix (only the prefix, not later occurrences).
        if cat.startswith("fos:"):
            cat = cat[len("fos:"):].strip()

        # Step 2: Decode arXiv/ACM codes from parentheses.
        for code in code_in_parens.findall(cat):
            if code in arxiv_map:
                decoded_labels.add(arxiv_map[code])
            elif code in acm_map:
                decoded_labels.add(acm_map[code])

        # Remove the (code) part, keep the rest.
        cat = code_in_parens.sub("", cat).strip()

        # Nothing left (blank input or a bare "(code)"): do not emit "" as a label.
        if not cat:
            continue

        # Step 3: Decode raw codes directly (skip storing them).
        if cat in arxiv_map:
            decoded_labels.add(arxiv_map[cat])
            continue
        if cat in acm_map:
            decoded_labels.add(acm_map[cat])
            continue
        # Unknown strings that still look like an ACM or cs.* code are dropped.
        if re.fullmatch(r"[a-z]\.\d+(\.\d+)?", cat) or re.fullmatch(r"cs\.[a-z]+", cat):
            continue

        # Step 4: Keep the remaining free-text label as is.
        cleaned.add(cat)

    return sorted(cleaned | decoded_labels)

In [172]:
# Replace the OpenAIRE categories of every valid-paper record with their
# normalised form.
for entry in cleaned_data:
    if "openaire_categories_flat" not in entry:
        continue
    raw_openaire = entry["openaire_categories_flat"]
    entry["openaire_categories_flat"] = clean_openaire_categories(raw_openaire, arxiv_map, acm_ccs_map)

In [173]:
# Inspect the second valid-paper record after OpenAIRE category clean-up.
cleaned_data[1]

{'title': 'OLMo: Accelerating the Science of Language Models',
 'doi': '10.18653/v1/2024.acl-long.841',
 'abstract': 'Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings. As their commercial importance has surged, the most powerful models have become closed off, gated behind proprietary interfaces, with important details of their training data, architectures, and development undisclosed. Given the importance of these details in scientifically studying these models, including their biases and potential risks, we believe it is essential for the research community to have access to powerful, truly open LMs. To this end, we have built OLMo, a competitive, truly Open Language Model, to enable the scientific study of language models. Unlike most prior efforts that have only released model weights and inference code, we release OLMo alongside open training data and training and evaluation code. We hope this release will empower the open resear

In [179]:
# Replace the OpenAIRE categories of every record in the full dataset with
# their normalised form.
for entry in all_data:
    if "openaire_categories_flat" not in entry:
        continue
    raw_openaire = entry["openaire_categories_flat"]
    entry["openaire_categories_flat"] = clean_openaire_categories(raw_openaire, arxiv_map, acm_ccs_map)

In [181]:
# Total number of paper records in the full dataset.
len(all_data)

126

In [174]:
# Persist the valid-paper records as indented JSON, keeping non-ASCII
# characters unescaped.
with open("cleaned_data.json", mode="w", encoding="utf-8") as f:
    json.dump(cleaned_data, fp=f, ensure_ascii=False, indent=4)

## Statistics

In [191]:
# Over all papers: how many have at least one category from each source, and
# which percentage of papers has none from that source.
has_pwc_cat = sum(1 for paper in all_data if paper['papers_with_code_categories_flat'])
has_orkg_cat = sum(1 for paper in all_data if paper['orkg_categories_flat'])
has_openalex_cat = sum(1 for paper in all_data if paper['openalex_categories_flat'])
has_openaire_cat = sum(1 for paper in all_data if paper['openaire_categories_flat'])

all_papercount = len(all_data)


def _missing_pct(present_count):
    """Percentage (0-100, two decimals) of papers without the source; NaN if there are no papers."""
    if all_papercount == 0:
        # Avoid ZeroDivisionError on an empty dataset; the share is undefined.
        return float("nan")
    return round((1 - present_count / all_papercount) * 100, 2)


print(f"#papers with OpenAlex cat: {has_openalex_cat}")
print(f"#papers with OpenAIRE cat: {has_openaire_cat}")
print(f"#papers with PwC cat: {has_pwc_cat}")
print(f"#papers with ORKG cat: {has_orkg_cat}")
# These values are percentages, so the label says "%" rather than "#papers".
print(f"% papers with missing OpenAlex cat: {_missing_pct(has_openalex_cat)}")
print(f"% papers with missing OpenAIRE cat: {_missing_pct(has_openaire_cat)}")
print(f"% papers with missing PwC cat: {_missing_pct(has_pwc_cat)}")
print(f"% papers with missing ORKG cat: {_missing_pct(has_orkg_cat)}")

#papers with OpenAlex cat: 120
#papers with OpenAIRE cat: 98
#papers with PwC cat: 92
#papers with ORKG cat: 93
#papers with missing OpenAlex cat: 4.76
#papers with missing OpenAIRE cat: 22.22
#papers with missing PwC cat: 26.98
#papers with missing ORKG cat: 26.19


In [194]:
def _category_counts(records, key):
    """Sizes of ``record[key]`` for the records where that list is non-empty."""
    return [len(record[key]) for record in records if record[key]]


# Per source: number of categories of each valid paper that has any.
pwc_cat_cnt = _category_counts(cleaned_data, 'papers_with_code_categories_flat')
orkg_cat_cnt = _category_counts(cleaned_data, 'orkg_categories_flat')
openalex_cat_cnt = _category_counts(cleaned_data, 'openalex_categories_flat')
openaire_cat_cnt = _category_counts(cleaned_data, 'openaire_categories_flat')

# Averages are taken over the papers that have at least one category.
print(f"avg OpenAlex cats: {round(sum(openalex_cat_cnt)/len(openalex_cat_cnt), 2)}")
print(f"avg OpenAIRE cats: {round(sum(openaire_cat_cnt)/len(openaire_cat_cnt), 2)}")
print(f"avg with PwC cats: {round(sum(pwc_cat_cnt)/len(pwc_cat_cnt), 2)}")
print(f"avg ORKG cat: {round(sum(orkg_cat_cnt)/len(orkg_cat_cnt), 2)}")

avg OpenAlex cats: 12.39
avg OpenAIRE cats: 7.51
avg with PwC cats: 16.73
avg ORKG cat: 2.83


In [114]:
def build_global_term_stats_and_mappings(data, min_support=2, similarity_threshold=0.8):
    """Collect per-term statistics and pairs of lexically similar terms.

    Args:
        data: Sequence of paper dicts; every key ending in "_flat" is treated
            as a list of category terms.
        min_support: Minimum number of occurrences (summed over papers and
            fields) for a term to be kept.
        similarity_threshold: Minimum TF-IDF cosine similarity for two kept
            terms to be reported as a pair.

    Returns:
        (term_stats, mappings) where ``term_stats`` maps each kept term to its
        "support" count and sorted "sources" (field names), and ``mappings``
        is a list of dicts describing each similar pair. Both are empty when
        ``data`` is empty or no term reaches ``min_support``.
    """
    # Step 1: Determine category fields (ending in "_flat"). Keys are collected
    # from every record, in first-seen order, so an empty ``data`` or a first
    # record lacking a field does not break or hide fields.
    category_fields = list(dict.fromkeys(
        key for paper in data for key in paper.keys() if key.endswith("_flat")
    ))
    print(f"Using fields: {category_fields}")

    # Step 2: Count term frequency and store SKG sources
    term_occurrence = Counter()
    term_sources = defaultdict(set)

    for paper in data:
        for field in category_fields:
            for term in paper.get(field, []):
                clean_term = term.strip().lower()
                if not clean_term:
                    # Blank terms carry no information; do not count them.
                    continue
                term_occurrence[clean_term] += 1
                term_sources[clean_term].add(field)

    # Step 3: Filter terms by minimum support
    terms_list = sorted(
        term for term, count in term_occurrence.items() if count >= min_support
    )

    # Step 4: Term stats dictionary
    term_stats = {
        term: {
            "support": term_occurrence[term],
            "sources": sorted(term_sources[term])
        } for term in terms_list
    }

    # With no surviving terms there is nothing to vectorize or compare.
    if not terms_list:
        return term_stats, []

    # Step 5: TF-IDF and cosine similarity
    tfidf = TfidfVectorizer(analyzer='word', lowercase=True).fit_transform(terms_list)
    cosine_sim = cosine_similarity(tfidf)

    # Step 6: Create final mappings (upper triangle only: each pair once)
    mappings = []
    for i, term1 in enumerate(terms_list):
        for j in range(i + 1, len(terms_list)):
            term2 = terms_list[j]
            sim = cosine_sim[i, j]
            if sim >= similarity_threshold:
                mappings.append({
                    "term_1": term1,
                    "term_2": term2,
                    "similarity": round(float(sim), 3),
                    "term_1_sources": sorted(term_sources[term1]),
                    "term_2_sources": sorted(term_sources[term2]),
                    "term_1_support": term_occurrence[term1],
                    "term_2_support": term_occurrence[term2]
                })

    return term_stats, mappings

In [115]:
# Build term statistics and similar-term pairs from the valid-paper records.
term_stats, term_mappings = build_global_term_stats_and_mappings(
    cleaned_data, min_support=2, similarity_threshold=0.8
)

Using fields: ['orkg_categories_flat', 'papers_with_code_categories_flat', 'openalex_categories_flat', 'openaire_categories_flat']


In [116]:
# Number of term pairs whose similarity reached the threshold.
len(term_mappings)

9