In [41]:
import json
import re
import requests
import itertools
from collections import defaultdict, Counter
import pandas as pd
from itertools import combinations
import matplotlib.pyplot as plt

In [42]:
with open("data/all_data.json", "r", encoding="utf-8") as f:
    all_data = json.load(f)

with open("data/cleaned_data.json", "r", encoding="utf-8") as f:
    cleaned_data = json.load(f)

with open("data/gold_standard.json", "r", encoding="utf-8") as f:
    golden_standard = json.load(f)

In [43]:
has_pwc_cat = 0
has_orkg_cat = 0
has_openalex_cat = 0
has_openaire_cat = 0
for paper in all_data:
    if paper['openaire_categories_flat']:
        has_openaire_cat += 1
    if paper['openalex_categories_flat']:
        has_openalex_cat += 1
    if paper['papers_with_code_categories_flat']:
        has_pwc_cat += 1
    if paper['orkg_categories_flat']:
        has_orkg_cat += 1

all_papercount = len(all_data)
print(f"#papers with OpenAlex cat: {has_openalex_cat}")
print(f"#papers with OpenAIRE cat: {has_openaire_cat}")
print(f"#papers with PwC cat: {has_pwc_cat}")
print(f"#papers with ORKG cat: {has_orkg_cat}")
print(f"#papers with missing OpenAlex cat: {round((1-has_openalex_cat/all_papercount)*100, 2)}")
print(f"#papers with missing OpenAIRE cat: {round((1-has_openaire_cat/all_papercount)*100, 2)}")
print(f"#papers with missing PwC cat: {round((1-has_pwc_cat/all_papercount)*100, 2)}")
print(f"#papers with missing ORKG cat: {round((1-has_orkg_cat/all_papercount)*100, 2)}")

#papers with OpenAlex cat: 120
#papers with OpenAIRE cat: 98
#papers with PwC cat: 92
#papers with ORKG cat: 93
#papers with missing OpenAlex cat: 4.76
#papers with missing OpenAIRE cat: 22.22
#papers with missing PwC cat: 26.98
#papers with missing ORKG cat: 26.19


In [44]:
pwc_cat_cnt = []
orkg_cat_cnt = []
openalex_cat_cnt = []
openaire_cat_cnt = []
for paper in cleaned_data:
    if paper['openaire_categories_flat']:
        openaire_cat_cnt.append(len(paper['openaire_categories_flat']))
    if paper['openalex_categories_flat']:
        openalex_cat_cnt.append(len(paper['openalex_categories_flat']))
    if paper['papers_with_code_categories_flat']:
        pwc_cat_cnt.append(len(paper['papers_with_code_categories_flat']))
    if paper['orkg_categories_flat']:
        orkg_cat_cnt.append(len(paper['orkg_categories_flat']))

print(f"avg OpenAlex cats cleaned: {round(sum(openalex_cat_cnt)/len(openalex_cat_cnt), 2)}")
print(f"avg OpenAIRE cats cleaned: {round(sum(openaire_cat_cnt)/len(openaire_cat_cnt), 2)}")
print(f"avg with PwC cats cleaned: {round(sum(pwc_cat_cnt)/len(pwc_cat_cnt), 2)}")
print(f"avg ORKG cat cleaned: {round(sum(orkg_cat_cnt)/len(orkg_cat_cnt), 2)}")

avg OpenAlex cats cleaned: 12.39
avg OpenAIRE cats cleaned: 7.43
avg with PwC cats cleaned: 16.73
avg ORKG cat cleaned: 2.83


In [45]:
pwc_cat_cnt = []
orkg_cat_cnt = []
openalex_cat_cnt = []
openaire_cat_cnt = []
for paper in golden_standard:
    if paper['openaire_categories_flat']:
        openaire_cat_cnt.append(len(paper['openaire_categories_flat']))
    if paper['openalex_categories_flat']:
        openalex_cat_cnt.append(len(paper['openalex_categories_flat']))
    if paper['papers_with_code_categories_flat']:
        pwc_cat_cnt.append(len(paper['papers_with_code_categories_flat']))
    if paper['orkg_categories_flat']:
        orkg_cat_cnt.append(len(paper['orkg_categories_flat']))

print(f"avg OpenAlex cats golden: {round(sum(openalex_cat_cnt)/len(openalex_cat_cnt), 2)}")
print(f"avg OpenAIRE cats golden: {round(sum(openaire_cat_cnt)/len(openaire_cat_cnt), 2)}")
print(f"avg with PwC cats golden: {round(sum(pwc_cat_cnt)/len(pwc_cat_cnt), 2)}")
print(f"avg ORKG cat golden: {round(sum(orkg_cat_cnt)/len(orkg_cat_cnt), 2)}")

avg OpenAlex cats golden: 4.84
avg OpenAIRE cats golden: 3.67
avg with PwC cats golden: 4.66
avg ORKG cat golden: 1.93


In [55]:
categories = set([])
orkg_categories = set([])
pwc_categories = set([])
openalex_categories = set([])
openaire_categories = set([])

for paper in cleaned_data:
    for cat in paper['openaire_categories_flat']:
        categories.add(cat)
        openaire_categories.add(cat)
    for cat in paper['openalex_categories_flat']:
        categories.add(cat)
        openalex_categories.add(cat)
    for cat in paper['papers_with_code_categories_flat']:
        categories.add(cat)
        pwc_categories.add(cat)
    for cat in paper['orkg_categories_flat']:
        categories.add(cat)
        orkg_categories.add(cat)

print(f"#all categories cleaned: {len(list(categories))}")
print(f"#ORKG categories cleaned: {len(list(orkg_categories))}")
print(f"#PwC categories cleaned: {len(list(pwc_categories))}")
print(f"#OpenAlex categories cleaned: {len(list(openalex_categories))}")
print(f"#OpenAIRE categories cleaned: {len(list(openaire_categories))}")

#all categories cleaned: 728
#ORKG categories cleaned: 133
#PwC categories cleaned: 198
#OpenAlex categories cleaned: 277
#OpenAIRE categories cleaned: 157


In [56]:
categories = set([])
orkg_categories = set([])
pwc_categories = set([])
openalex_categories = set([])
openaire_categories = set([])

for paper in golden_standard:
    for cat in paper['openaire_categories_flat']:
        categories.add(cat)
        openaire_categories.add(cat)
    for cat in paper['openalex_categories_flat']:
        categories.add(cat)
        openalex_categories.add(cat)
    for cat in paper['papers_with_code_categories_flat']:
        categories.add(cat)
        pwc_categories.add(cat)
    for cat in paper['orkg_categories_flat']:
        categories.add(cat)
        orkg_categories.add(cat)

print(f"#all categories golden: {len(list(categories))}")
print(f"#ORKG categories golden: {len(list(orkg_categories))}")
print(f"#PwC categories golden: {len(list(pwc_categories))}")
print(f"#OpenAlex categories golden: {len(list(openalex_categories))}")
print(f"#OpenAIRE categories golden: {len(list(openaire_categories))}")

#all categories golden: 300
#ORKG categories golden: 75
#PwC categories golden: 119
#OpenAlex categories golden: 96
#OpenAIRE categories golden: 38


In [48]:
# Mapping of category to SKGs it's found in
category_to_skgs = defaultdict(set)

for paper in cleaned_data:
    skg_cats = {
        "orkg": set(paper["orkg_categories_flat"]),
        "pwc": set(paper["papers_with_code_categories_flat"]),
        "openalex": set(paper["openalex_categories_flat"]),
        "openaire": set(paper["openaire_categories_flat"]),
    }

    for skg, cats in skg_cats.items():
        for cat in cats:
            category_to_skgs[cat].add(skg)

# Count how many categories appear in 1, 2, 3, or 4 SKGs
agreement_counter = Counter()
for cat, skgs in category_to_skgs.items():
    agreement_counter[len(skgs)] += 1

# Total unique categories
total_unique_cats = len(category_to_skgs)

print(f"\nTotal unique categories: {total_unique_cats}")
print("\nAgreement levels:")
for k in range(1, 5):
    print(f"Categories appearing in {k} SKGs: {agreement_counter[k]}")



Total unique categories: 728

Agreement levels:
Categories appearing in 1 SKGs: 695
Categories appearing in 2 SKGs: 29
Categories appearing in 3 SKGs: 4
Categories appearing in 4 SKGs: 0


In [49]:
# Mapping of category to SKGs it's found in
category_to_skgs = defaultdict(set)

for paper in golden_standard:
    skg_cats = {
        "orkg": set(paper["orkg_categories_flat"]),
        "pwc": set(paper["papers_with_code_categories_flat"]),
        "openalex": set(paper["openalex_categories_flat"]),
        "openaire": set(paper["openaire_categories_flat"]),
    }

    for skg, cats in skg_cats.items():
        for cat in cats:
            category_to_skgs[cat].add(skg)

# Count how many categories appear in 1, 2, 3, or 4 SKGs
agreement_counter = Counter()
for cat, skgs in category_to_skgs.items():
    agreement_counter[len(skgs)] += 1

# Total unique categories
total_unique_cats = len(category_to_skgs)

print(f"\nTotal unique categories: {total_unique_cats}")
print("\nAgreement levels:")
for k in range(1, 5):
    print(f"Categories appearing in {k} SKGs: {agreement_counter[k]}")




Total unique categories: 300

Agreement levels:
Categories appearing in 1 SKGs: 277
Categories appearing in 2 SKGs: 18
Categories appearing in 3 SKGs: 5
Categories appearing in 4 SKGs: 0


In [50]:
paper_overlap_counter = Counter()

for paper in cleaned_data:
    skg_cats = {
        "orkg": set(paper["orkg_categories_flat"]),
        "pwc": set(paper["papers_with_code_categories_flat"]),
        "openalex": set(paper["openalex_categories_flat"]),
        "openaire": set(paper["openaire_categories_flat"]),
    }

    # Build reverse map: category → list of SKGs
    category_skg_map = {}
    for skg, cats in skg_cats.items():
        for cat in cats:
            category_skg_map.setdefault(cat, set()).add(skg)

    # Count how many categories appear in how many SKGs
    overlap_levels = Counter()
    for skgs in category_skg_map.values():
        overlap_levels[len(skgs)] += 1

    # Add to overall paper-level count
    for level in [2, 3, 4]:
        if overlap_levels[level] > 0:
            paper_overlap_counter[level] += 1

# Print results
print("Number of papers with at least one overlapping category in:")
print(f"- 2 SKGs: {paper_overlap_counter[2]}")
print(f"- 3 SKGs: {paper_overlap_counter[3]}")
print(f"- 4 SKGs: {paper_overlap_counter[4]}")

Number of papers with at least one overlapping category in:
- 2 SKGs: 43
- 3 SKGs: 1
- 4 SKGs: 0


In [51]:
paper_overlap_counter = Counter()

for paper in golden_standard:
    skg_cats = {
        "orkg": set(paper["orkg_categories_flat"]),
        "pwc": set(paper["papers_with_code_categories_flat"]),
        "openalex": set(paper["openalex_categories_flat"]),
        "openaire": set(paper["openaire_categories_flat"]),
    }

    # Build reverse map: category → list of SKGs
    category_skg_map = {}
    for skg, cats in skg_cats.items():
        for cat in cats:
            category_skg_map.setdefault(cat, set()).add(skg)

    # Count how many categories appear in how many SKGs
    overlap_levels = Counter()
    for skgs in category_skg_map.values():
        overlap_levels[len(skgs)] += 1

    # Add to overall paper-level count
    for level in [2, 3, 4]:
        if overlap_levels[level] > 0:
            paper_overlap_counter[level] += 1

# Print results
print("Number of papers with at least one overlapping category in:")
print(f"- 2 SKGs: {paper_overlap_counter[2]}")
print(f"- 3 SKGs: {paper_overlap_counter[3]}")
print(f"- 4 SKGs: {paper_overlap_counter[4]}")

Number of papers with at least one overlapping category in:
- 2 SKGs: 65
- 3 SKGs: 2
- 4 SKGs: 0


In [52]:
# Dictionary to hold paper titles per overlap level
papers_with_overlap = {
    2: [],
    3: [],
    4: []
}

for paper in cleaned_data:
    skg_cats = {
        "orkg": set(paper["orkg_categories_flat"]),
        "pwc": set(paper["papers_with_code_categories_flat"]),
        "openalex": set(paper["openalex_categories_flat"]),
        "openaire": set(paper["openaire_categories_flat"]),
    }

    # Build reverse map: category → set of SKGs it appears in
    category_skg_map = {}
    for skg, cats in skg_cats.items():
        for cat in cats:
            category_skg_map.setdefault(cat, set()).add(skg)

    # Count categories by their SKG overlap level
    overlap_levels = {k: 0 for k in [2, 3, 4]}
    for skgs in category_skg_map.values():
        if 2 <= len(skgs) <= 4:
            overlap_levels[len(skgs)] += 1

    # Save paper title if there's at least one category for that level
    for level in [2, 3, 4]:
        if overlap_levels[level] > 0:
            papers_with_overlap[level].append(paper["title"])

# Print results
for level in [2, 3, 4]:
    print(f"\nPapers with at least one category in {level} SKGs ({len(papers_with_overlap[level])} papers):")
    for title in papers_with_overlap[level]:
        print(f"- {title}")



Papers with at least one category in 2 SKGs (43 papers):
- MiniCPM: Unveiling the Potential of Small Language Models with Scalable
  Training Strategies
- Enhancing text-based knowledge graph completion with zero-shot large language models: A focus on semantic enhancement
- COCONut: Modernizing COCO Segmentation
- Annotation Errors and NER: A Study with OntoNotes 5.0
- Understanding and Tackling Label Errors in Individual-Level Nature
  Language Understanding
- Human Evaluation of Procedural Knowledge Graph Extraction from Text with Large Language Models
- TinyLlama: An Open-Source Small Language Model
- Self-Contrast: Better Reflection Through Inconsistent Solving Perspectives
- Search-in-the-Chain: Interactively Enhancing Large Language Models with Search for Knowledge-intensive Tasks
- The Power of Noise: Redefining Retrieval for RAG Systems
- Retrieval meets Long Context Large Language Models
- Corrective Retrieval Augmented Generation
- UniMS-RAG: A Unified Multi-source Retrieval

In [53]:
# Dictionary to hold paper titles per overlap level
papers_with_overlap = {
    2: [],
    3: [],
    4: []
}

for paper in golden_standard:
    skg_cats = {
        "orkg": set(paper["orkg_categories_flat"]),
        "pwc": set(paper["papers_with_code_categories_flat"]),
        "openalex": set(paper["openalex_categories_flat"]),
        "openaire": set(paper["openaire_categories_flat"]),
    }

    # Build reverse map: category → set of SKGs it appears in
    category_skg_map = {}
    for skg, cats in skg_cats.items():
        for cat in cats:
            category_skg_map.setdefault(cat, set()).add(skg)

    # Count categories by their SKG overlap level
    overlap_levels = {k: 0 for k in [2, 3, 4]}
    for skgs in category_skg_map.values():
        if 2 <= len(skgs) <= 4:
            overlap_levels[len(skgs)] += 1

    # Save paper title if there's at least one category for that level
    for level in [2, 3, 4]:
        if overlap_levels[level] > 0:
            papers_with_overlap[level].append(paper["title"])

# Print results
for level in [2, 3, 4]:
    print(f"\nPapers with at least one category in {level} SKGs ({len(papers_with_overlap[level])} papers):")
    for title in papers_with_overlap[level]:
        print(f"- {title}")



Papers with at least one category in 2 SKGs (65 papers):
- MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies
- OLMo: Accelerating the Science of Language Models
- Enhancing text-based knowledge graph completion with zero-shot large language models: A focus on semantic enhancement
- COCONut: Modernizing COCO Segmentation
- Annotation Errors and NER: A Study with OntoNotes 5.0
- Understanding and Tackling Label Errors in Individual-Level Nature Language Understanding
- Human Evaluation of Procedural Knowledge Graph Extraction from Text with Large Language Models
- Structure Guided Large Language Model for SQL Generation
- TinyLlama: An Open-Source Small Language Model
- Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone
- Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context
- Self-Contrast: Better Reflection Through Inconsistent Solving Perspectives
- Self-Refine Instruction-Tuning for Al

In [54]:
from sklearn.metrics import precision_score, recall_score, f1_score

skg_keys = {
    "pwc": "papers_with_code_categories_flat",
    "openalex": "openalex_categories_flat",
    "openaire": "openaire_categories_flat",
    "orkg": "orkg_categories_flat"
}

# Compute metrics per SKG
results = {}

for skg, key in skg_keys.items():
    y_true_all = []
    y_pred_all = []

    for paper_clean, paper_gold in zip(cleaned_data, golden_standard):
        gold_cats = set(cat.lower() for cat in paper_gold.get(key, []))
        pred_cats = set(cat.lower() for cat in paper_clean.get(key, []))

        all_cats = sorted(gold_cats | pred_cats)
        y_true = [1 if c in gold_cats else 0 for c in all_cats]
        y_pred = [1 if c in pred_cats else 0 for c in all_cats]

        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    precision = precision_score(y_true_all, y_pred_all, zero_division=0)
    recall = recall_score(y_true_all, y_pred_all, zero_division=0)
    f1 = f1_score(y_true_all, y_pred_all, zero_division=0)

    results[skg] = {
        "precision": round(precision, 2),
        "recall": round(recall, 2),
        "f1_score": round(f1, 2)
    }

# Output results
for skg, metrics in results.items():
    print(f"{skg.upper()}: Precision={metrics['precision']}, Recall={metrics['recall']}, F1-score={metrics['f1_score']}")

PWC: Precision=0.27, Recall=0.99, F1-score=0.42
OPENALEX: Precision=0.39, Recall=1.0, F1-score=0.56
OPENAIRE: Precision=0.35, Recall=0.72, F1-score=0.47
ORKG: Precision=0.66, Recall=0.98, F1-score=0.79


In [60]:
gold_index = {paper["title"].lower(): paper for paper in golden_standard}

# SKGs and inconsistency counters
skg_keys = {
    "pwc": "papers_with_code_categories_flat",
    "orkg": "orkg_categories_flat",
    "openalex": "openalex_categories_flat",
    "openaire": "openaire_categories_flat"
}

inconsistencies = {
    skg: {
        "coverage_inconsistency": 0,
        "incorrect_assignment": 0,
    } for skg in skg_keys
}

# Go through each paper in cleaned data that is also in gold
for paper in cleaned_data:
    title = paper["title"].lower()
    if title not in gold_index:
        continue
    gold_paper = gold_index[title]

    for skg, skg_field in skg_keys.items():
        skg_labels = set(paper[skg_field])
        gold_labels = set(gold_paper[skg_field])

        # Coverage inconsistency
        if len(skg_labels) <= 1 and len(gold_labels) >= 3:
            inconsistencies[skg]["coverage_inconsistency"] += 1

        # Incorrect assignment (labels not in gold)
        if len(skg_labels - gold_labels) > 0:
            inconsistencies[skg]["incorrect_assignment"] += 1

# Output results
print("Inconsistency counts per SKG:")
for skg in skg_keys:
    print(f"{skg.upper()}: Coverage={inconsistencies[skg]['coverage_inconsistency']}, "
          f"Incorrect={inconsistencies[skg]['incorrect_assignment']}")

Inconsistency counts per SKG:
PWC: Coverage=0, Incorrect=54
ORKG: Coverage=0, Incorrect=30
OPENALEX: Coverage=0, Incorrect=60
OPENAIRE: Coverage=0, Incorrect=61


In [65]:
gold_index = {paper["title"].lower(): paper for paper in golden_standard}
skg_keys = {
    "pwc": "papers_with_code_categories_flat",
    "orkg": "orkg_categories_flat",
    "openalex": "openalex_categories_flat",
    "openaire": "openaire_categories_flat"
}

extra_counts = defaultdict(int)
paper_counts = defaultdict(int)
initial_total = defaultdict(int)
gold_total = defaultdict(int)

for paper in cleaned_data:
    title = paper["title"].lower()
    if title not in gold_index:
        continue
    gold_paper = gold_index[title]

    for skg, field in skg_keys.items():
        cleaned_labels = set(paper[field])
        gold_labels = set(gold_paper[field])
        extras = cleaned_labels - gold_labels

        extra_counts[skg] += len(extras)
        paper_counts[skg] += 1
        initial_total[skg] += len(cleaned_labels)
        gold_total[skg] += len(gold_labels)

# Print results
print("SKG - Initial - Gold - Incorrect - AvgIncorrect")
for skg in skg_keys:
    total_init = initial_total[skg]
    total_gold = gold_total[skg]
    total_extra = extra_counts[skg]
    avg_extra = total_extra / paper_counts[skg]
    print(f"{skg.upper()} - {total_init} - {total_gold} - {total_extra} - {avg_extra:.2f}")

SKG - Initial - Gold - Incorrect - AvgIncorrect
PWC - 1018 - 243 - 779 - 12.56
ORKG - 184 - 123 - 64 - 1.03
OPENALEX - 801 - 311 - 490 - 7.90
OPENAIRE - 479 - 228 - 317 - 5.11
