In [None]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from collections import Counter

# Setup
base_dir = "../data/dataset"
conferences = [
    "ICLR_2017", "ICLR_2018", "ICLR_2019", "ICLR_2020",
    "NIPS_2016", "NIPS_2017", "NIPS_2018", "NIPS_2019"
]

# Initialize structures
titles = []
paper_ids = []
paper_decisions = {}

# Load papers and decisions from all conferences
for conf in conferences:
    paper_dir = os.path.join(base_dir, f"{conf}/{conf}_paper")

    for fname in os.listdir(paper_dir):
        if fname.endswith(".json"):
            with open(os.path.join(paper_dir, fname), "r", encoding="utf-8") as f:
                paper = json.load(f)
                pid = paper.get("id")
                title = paper.get("title", "")
                decision = paper.get("decision", "Unknown")

                if pid and title:
                    titles.append(title)
                    paper_ids.append(pid)
                    paper_decisions[pid] = decision

# Normalize decision function
def normalize_decision(raw_decision):
    if not raw_decision:
        return "Other"
    d = raw_decision.strip().lower()
    if d.startswith("accept"):
        return "Accept"
    elif d.startswith("reject"):
        return "Reject"
    else:
        return "Other"

# Encode titles and build FAISS index
model = SentenceTransformer("all-MiniLM-L6-v2")
title_embeddings = model.encode(titles, convert_to_tensor=False)
title_embeddings_np = np.array(title_embeddings, dtype='float32')

index = faiss.IndexFlatL2(title_embeddings_np.shape[1])
index.add(title_embeddings_np)

# Estimate decision using majority vote
def estimate_baseline_decision(query_title, k=10):
    query_vec = model.encode(query_title, convert_to_tensor=False)
    query_vec_np = np.array([query_vec], dtype='float32')
    D, I = index.search(query_vec_np, k + 1)
    top_k_indices = I[0][1:]

    decision_counts = {}
    for idx in top_k_indices:
        pid = paper_ids[idx]
        raw_decision = paper_decisions.get(pid, "Unknown")
        decision = raw_decision
        decision_counts[decision] = decision_counts.get(decision, 0) + 1

    estimated = max(decision_counts.items(), key=lambda x: x[1])[0]
    return estimated, decision_counts

# Evaluate accuracy across the dataset
def evaluate_baseline_all_titles(n=None, k=10):
    correct = 0
    total = 0
    confusion = Counter()

    sample_range = range(len(titles)) if n is None else range(min(n, len(titles)))

    for i in sample_range:
        query_title = titles[i]
        query_pid = paper_ids[i]
        true_raw = paper_decisions.get(query_pid, "Unknown")
        true_decision = true_raw

        pred_decision, _ = estimate_baseline_decision(query_title, k)
        
        if pred_decision == true_decision:
            correct += 1
        confusion[(true_decision, pred_decision)] += 1
        total += 1

    accuracy = correct / total if total > 0 else 0
    print(f"\n✅ Baseline Accuracy (title only, top-{k}): {accuracy:.2f}")
    print("\n📊 Confusion Matrix (True → Predicted):")
    for (true, pred), count in confusion.items():
        print(f"{true:25s} → {pred:25s}: {count}")

    return accuracy, confusion


In [None]:
evaluate_baseline_all_titles(k=1)


✅ Baseline Accuracy (title only, top-1): 0.43

📊 Confusion Matrix (True → Predicted):
Accept (Poster)           → Accept                   : 488
Accept (Poster)           → Accept (Poster)          : 364
Accept (Poster)           → Reject                   : 572
Accept (Poster)           → Accept (Oral)            : 28
Accept (Oral)             → Accept (Poster)          : 20
Accept (Poster)           → Invite to Workshop Track : 29
Accept (Oral)             → Accept                   : 22
Accept (Oral)             → Reject                   : 19
Accept (Poster)           → Accept (Spotlight)       : 14
Invite to Workshop Track  → Accept                   : 46
Invite to Workshop Track  → Reject                   : 55
Invite to Workshop Track  → Accept (Poster)          : 29
Invite to Workshop Track  → Invite to Workshop Track : 5
Reject                    → Reject                   : 1391
Reject                    → Accept                   : 1087
Reject                    → Accept (P

(0.43463535786859336,
 Counter({('Accept', 'Accept'): 2089,
          ('Reject', 'Reject'): 1391,
          ('Reject', 'Accept'): 1087,
          ('Accept', 'Reject'): 959,
          ('Reject', 'Accept (Poster)'): 697,
          ('Accept (Poster)', 'Reject'): 572,
          ('Accept', 'Accept (Poster)'): 492,
          ('Accept (Poster)', 'Accept'): 488,
          ('Accept (Poster)', 'Accept (Poster)'): 364,
          ('Invite to Workshop Track', 'Reject'): 55,
          ('Accept', 'Invite to Workshop Track'): 55,
          ('Reject', 'Invite to Workshop Track'): 53,
          ('Invite to Workshop Track', 'Accept'): 46,
          ('Accept (Spotlight)', 'Reject'): 44,
          ('Reject', 'Accept (Oral)'): 41,
          ('Accept (Spotlight)', 'Accept'): 37,
          ('Reject', 'Accept (Spotlight)'): 35,
          ('Accept (Poster)', 'Invite to Workshop Track'): 29,
          ('Invite to Workshop Track', 'Accept (Poster)'): 29,
          ('Reject', 'Accept (Talk)'): 29,
          ('Acce