# Load ICLR and DeepReview Datasets

This notebook loads two datasets:
1. **ICLR Reviews 2020-2026** - Our custom dataset with 40,376 submissions
2. **DeepReview-13K** - WestlakeNLP's dataset with ~13K review examples

In [15]:
from datasets import DatasetDict, Dataset, load_dataset, load_from_disk
import pandas as pd
from pathlib import Path

## 1. Load ICLR Reviews 2020-2026

In [None]:
# Load from disk - single combined dataset
iclr_path = Path("../data/hf_dataset")

from datasets import Dataset

iclr_dataset = Dataset.load_from_disk(iclr_path)
print(f"ICLR Dataset: {len(iclr_dataset):,} rows")

# Show breakdown by year
year_counts = {}
for row in iclr_dataset:
    y = row['year']
    year_counts[y] = year_counts.get(y, 0) + 1

print("\nBy year:")
for year in sorted(year_counts.keys()):
    print(f"  {year}: {year_counts[year]:,} rows")

In [None]:
# View schema
print("ICLR Dataset features:")
for name, feat in iclr_dataset.features.items():
    print(f"  {name}: {feat}")

In [None]:
# Example row
example = iclr_dataset[0]
print(f"Example row:")
print(f"  ID: {example['submission_id']}")
print(f"  Year: {example['year']}")
print(f"  Title: {example['title'][:100]}...")
print(f"  Link: {example['openreview_link']}")

## 2. Load DeepReview-13K

In [4]:
# Path to cached DeepReview dataset
deepreview_cache = Path("/n/fs/vision-mix/sk7524/caches/.hf/hub/datasets--WestlakeNLP--DeepReview-13K")
deepreview_data = deepreview_cache / "snapshots/3db597e1e789ce04af98c5eae9e9430341face23/data"

# Load CSVs
train_df = pd.read_csv(deepreview_data / "train.csv", usecols=['year', 'id', 'mode', 'rating', 'decision, 'inputs'])
test_2024_df = pd.read_csv(deepreview_data / "test_2024.csv", usecols=['year', 'id', 'mode', 'rating', 'decision'])
test_2025_df = pd.read_csv(deepreview_data / "test_2025.csv", usecols=['year', 'id', 'mode', 'rating', 'decision'])

print("DeepReview-13K splits:")
print(f"  train: {len(train_df):,} rows")
print(f"  test_2024: {len(test_2024_df):,} rows")
print(f"  test_2025: {len(test_2025_df):,} rows")
print(f"  Total: {len(train_df) + len(test_2024_df) + len(test_2025_df):,} rows")

DeepReview-13K splits:
  train: 40,137 rows
  test_2024: 652 rows
  test_2025: 634 rows
  Total: 41,423 rows


In [13]:
train_df

Unnamed: 0,year,id,mode,rating,decision
0,2024,wCUw8t63vH,fast,"[6, 6, 8, 8, 6]",Reject
1,2024,wCUw8t63vH,standard,"[6, 6, 8, 8, 6]",Reject
2,2024,wCUw8t63vH,best,"[6, 6, 8, 8, 6]",Reject
3,2024,wCRTEOIdmf,fast,"[5, 5, 3]",Reject
4,2024,wCRTEOIdmf,standard,"[5, 5, 3]",Reject
...,...,...,...,...,...
40132,2025,dNunnVB4W6,standard,"[5, 6, 8, 6]",Accept
40133,2025,dNunnVB4W6,best,"[5, 6, 8, 6]",Accept
40134,2025,YWTpBisnwd,fast,"[6, 5, 5, 5, 3]",Reject
40135,2025,YWTpBisnwd,standard,"[6, 5, 5, 5, 3]",Reject


In [None]:
# View columns
print("\nDeepReview-13K columns:")
for col in train_df.columns:
    print(f"  {col}")

In [None]:
# Convert to HuggingFace Dataset
deepreview_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test_2024": Dataset.from_pandas(test_2024_df),
    "test_2025": Dataset.from_pandas(test_2025_df),
})

print("\nConverted to HuggingFace DatasetDict:")
print(deepreview_dataset)

In [None]:
# Convert to HuggingFace Dataset
deepreview_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test_2024": Dataset.from_pandas(test_2024_df),
    "test_2025": Dataset.from_pandas(test_2025_df),
})

print("\nConverted to HuggingFace DatasetDict:")
print(deepreview_dataset)

In [None]:
# Example row
example = train_df.iloc[0]
print(f"\nExample from train:")
print(f"  ID: {example['id']}")
print(f"  Year: {example['year']}")
print(f"  Mode: {example['mode']}")
print(f"  Rating: {example['rating']}")
print(f"  Decision: {example['decision']}")
print(f"  Inputs (first 200 chars): {example['inputs'][:200]}...")

## 3. Alternative: Load DeepReview from HuggingFace Hub

If you have internet access, you can also load directly:

In [None]:
# # Load from HuggingFace Hub (requires internet)
# deepreview_hf = load_dataset("WestlakeNLP/DeepReview-13K")
# print(deepreview_hf)

## 4. Summary

In [None]:
print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)
print(f"\nICLR Reviews 2020-2026: {len(iclr_dataset):,} submissions")
print(f"  - Years: 2020-2026 (single combined dataset with 'year' column)")
print(f"  - Contains: paper content paths, reviews, meta-reviews, decisions")
print(f"\nDeepReview-13K: {len(train_df) + len(test_2024_df) + len(test_2025_df):,} review examples")
print(f"  - Train: {len(train_df):,}")
print(f"  - Test 2024: {len(test_2024_df):,}")
print(f"  - Test 2025: {len(test_2025_df):,}")
print(f"  - Contains: input prompts, output reviews, ratings")