In [None]:
import csv, random
from pathlib import Path
import requests
from collections import Counter

from pathlib import Path
# Absolute path of the current working directory; every other path is built from it.
ROOT = Path().resolve()
# Directory that holds the downloaded label files and the CSVs written later.
DATA = ROOT / "data"
DATA.mkdir(exist_ok=True)  # exist_ok: re-running the cell does not raise if the directory is already there
# Remote locations of the training and test label files requested by fetch().
TRAIN_URL = "https://cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label"
TEST_URL  = "https://cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label"

In [None]:
def fetch(url, out_path, overwrite=True):
    """Download ``url`` with HTTP GET and store the response body at ``out_path``.

    Args:
        url: Address to request.
        out_path: ``pathlib.Path`` of the destination file.
        overwrite: When False and ``out_path`` already exists, the download is
            skipped and the existing file is left untouched. The default
            (True) always downloads and replaces the file.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    if not overwrite and out_path.exists():
        return
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    # Make sure the destination directory is there before writing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temporary file and rename it into place, so an
    # interrupted write never leaves a truncated file at out_path.
    tmp_path = out_path.with_name(out_path.name + ".part")
    tmp_path.write_bytes(r.content)
    tmp_path.replace(out_path)

# Local destinations of the two raw label files inside the data directory.
train_path = DATA.joinpath("train_5500.label")
test_path = DATA.joinpath("TREC_10.label")

# Download the training file first, then the test file.
fetch(url=TRAIN_URL, out_path=train_path)
fetch(url=TEST_URL, out_path=test_path)

# Cell output: whether each file is now present on disk.
train_path.exists(), test_path.exists()

(True, True)

In [None]:
def read_lines(path):
    """Read a text file and return its lines without line terminators.

    The file content is decoded as UTF-8; if that raises
    ``UnicodeDecodeError`` it is decoded as Latin-1 instead.

    Only LF, CRLF and CR end a line. ``str.splitlines()`` is deliberately not
    used: it also breaks on characters such as NEL (U+0085) and U+2028, and in
    the Latin-1 fallback every 0x85 byte decodes to NEL, which would cut a
    record in two.

    Args:
        path: ``pathlib.Path`` of the file to read.

    Returns:
        List of lines in file order. A single trailing newline does not
        produce an extra empty final line; an empty file yields ``[]``.
    """
    raw = path.read_bytes()
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError:
        text = raw.decode("latin-1")
    # Normalise CRLF and bare CR to LF, then split on LF only.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    # split() leaves one empty string after a final newline (and for empty
    # input); drop it so a trailing newline is not reported as a blank line.
    if lines[-1] == "":
        lines.pop()
    return lines

def parse_trec_lines(lines):
    """Turn raw label lines into ``(text, coarse_label)`` pairs.

    Each usable line is a label token followed by whitespace and the question
    text. The label token has the form ``COARSE:fine``; only the part before
    the first ``:`` is kept.

    Args:
        lines: Iterable of strings, one record per item.

    Returns:
        List of ``(text, coarse)`` tuples in input order. Blank lines and
        lines consisting of a single token (no question text) are skipped.
    """
    rows = []
    for line in lines:
        # split(None, 1) cuts at the first run of any whitespace, so a tab or
        # repeated spaces after the label neither mis-split the record nor
        # leak into the question text.
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue
        label_full, text = parts
        coarse = label_full.split(":", 1)[0]
        # Leading whitespace is already consumed by split(); trim the tail.
        rows.append((text.rstrip(), coarse))
    return rows

# Parse both downloaded files into lists of (text, coarse_label) tuples.
train_data = parse_trec_lines(read_lines(train_path))
test_data  = parse_trec_lines(read_lines(test_path))

# Seed the module-level RNG so the in-place shuffle below gives the same order on every run.
random.seed(42)
random.shuffle(train_data)
# Hold out the first 20% of the shuffled rows (int() truncates) for validation;
# the remaining rows form the final training set.
n_val = int(len(train_data) * 0.2)
val_data = train_data[:n_val]
final_train = train_data[n_val:]

# Cell output: sizes of the train / validation / test splits.
len(final_train), len(val_data), len(test_data)

(4362, 1090, 500)

In [None]:
def write_csv(path, rows):
    """Write ``rows`` to ``path`` as a UTF-8 CSV file with a ``text,label`` header.

    Args:
        path: ``pathlib.Path`` of the output file; an existing file is replaced.
        rows: Iterable of row sequences written after the header line.
    """
    header = ["text", "label"]
    # newline="" leaves line-ending handling to the csv module.
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerows([header, *rows])

# Persist the three splits as CSV files in the data directory (train, val, test order).
write_csv(path=DATA.joinpath("trec_train.csv"), rows=final_train)
write_csv(path=DATA.joinpath("trec_val.csv"), rows=val_data)
write_csv(path=DATA.joinpath("trec_test.csv"), rows=test_data)

def stats(name, rows):
    """Summarise a split as ``(name, row_count, label_counts)``.

    Args:
        name: Label for the split, returned unchanged.
        rows: Sequence of two-item rows; the second item is the class label.

    Returns:
        Tuple of ``name``, ``len(rows)`` and a dict mapping each label to its
        number of occurrences, with keys in sorted order.
    """
    label_counts = Counter()
    for _text, label in rows:
        label_counts[label] += 1
    ordered = {label: label_counts[label] for label in sorted(label_counts)}
    return name, len(rows), ordered

# Cell output: one (name, size, label-count) summary per split, in train / val / test order.
tuple(
    stats(split_name, split_rows)
    for split_name, split_rows in (
        ("train", final_train),
        ("val", val_data),
        ("test", test_data),
    )
)

(('train',
  4362,
  {'ABBR': 66, 'DESC': 906, 'ENTY': 984, 'HUM': 1000, 'LOC': 667, 'NUM': 739}),
 ('val',
  1090,
  {'ABBR': 20, 'DESC': 256, 'ENTY': 266, 'HUM': 223, 'LOC': 168, 'NUM': 157}),
 ('test',
  500,
  {'ABBR': 9, 'DESC': 138, 'ENTY': 94, 'HUM': 65, 'LOC': 81, 'NUM': 113}))