In [None]:
# Install the download helper (gdown) and pandas, then make sure the local
# "data" directory exists before anything is written into it.
!pip install gdown pandas
import os
# exist_ok=True keeps this cell safe to re-run when "data" already exists.
os.makedirs("data", exist_ok=True)

In [None]:
# Fetch the dataset archive with gdown, using the file ID below, and save it under data/.
# NOTE(review): recent gdown releases deprecate the `--id` flag (a bare file ID is
# accepted instead) — confirm against the installed gdown version.
!gdown --id 1ExuBzkObUNmqgmeaVMF-OqnAuo9yuozK -O data/All_capped_keywords.zip

In [None]:
import zipfile

# Unpack the downloaded archive into data/cspapersum and report what it contained.
with zipfile.ZipFile("data/All_capped_keywords.zip", "r") as archive:
    archive.extractall("data/cspapersum")
    member_names = archive.namelist()
    print("Extracted files:", len(member_names))
    print("Sample files:", member_names[:20])


In [None]:
import glob, pandas as pd

# Locate every CSV below the extraction directory, including nested folders.
csvs = glob.glob("data/cspapersum/**/*.csv", recursive=True)
print("Found CSVs:", csvs[:5])

# Peek at each file's header; only the first three rows are parsed.
for csv_path in csvs:
    try:
        preview = pd.read_csv(csv_path, nrows=3)
        column_names = preview.columns.tolist()
        print(csv_path, "→", column_names)
    except Exception as err:
        # Report the failure and move on to the next file.
        print("Could not load", csv_path, err)


In [None]:
import pandas as pd

# Keep only the two text columns used downstream and persist them as a smaller CSV.
df = pd.read_csv("data/cspapersum/All_capped_keywords.csv")[["title", "abstract"]]
df.to_csv("data/titles_abstracts.csv", index=False)

print("Saved clean dataset.")
print("Number of rows: ", len(df))

# Last expression of the cell: shows the first rows as rich output.
df.head()

In [None]:
import pandas as pd, os
df = pd.read_csv("data/titles_abstracts.csv")
print("Full size (rows):", len(df))

# Write the first 1000 rows as a small sample file (change the slice to resize it).
sample_path = "data/titles_abstracts_sample.csv"
sample = df.iloc[:1000]
sample.to_csv(sample_path, index=False)
print("Wrote:", sample_path)


In [None]:
!pip -q install pandas numpy transformers
import re, pandas as pd, numpy as np
# Reload the title/abstract table and show its size plus a three-row preview.
df = pd.read_csv("data/titles_abstracts.csv")
print(len(df), "rows")
df.head(3)

In [None]:
#Heuristic

# Vocabulary of words and short phrases treated as signals of a playful title.
PLAYFUL_WORDS = {
    #humor/slang
    "haha", "lol", "lmao", "rofl", "silly", "goofy", "whimsical", "jest", "chuckle", "giggle", "smirk", "banter", "slapstick", "sarcasm", "wit", "irony", "satire", "parody", "observational", "cringe", "deadpan", "absurd", "exaggeration", "understatement",
    #vibe
    "ridiculously", "insanely", "super", "hella", "wicked", "terribly", "unbelievably", "totally", "absolutely", "so", "dead", "mad", "lowkey", "legit", "literally", "actually", "for real", "for sure", "dude", "bro", "sick", "fire", "epic", "wild",
    #metaphors / tropes
    "hack","trick","cookbook","recipe","alchemy","wizard","beast","frankenstein",
    "zoo","odyssey","tales","story","magic","myth","saga","adventures",
    "to rule them all", "the good, the bad"
}

# Each vocabulary entry is matched as a whole word/phrase (with an optional plural
# "s"), never as a raw substring. Plain substring matching made "wit" fire on
# "with", "so" on "reasoning", "super" on "supervised", "story" on "history", etc.
# The lookarounds are used instead of \b so multi-word phrases behave the same way.
_PLAYFUL_WORD_PATTERNS = [
    re.compile(r"(?<!\w)" + re.escape(word) + r"s?(?!\w)")
    for word in sorted(PLAYFUL_WORDS)
]

# Structural patterns (matched against the original-case title).
PATTERNS = [
    re.compile(r".+:\s+(?:a|the)\s+(?:tale|story|odyssey|cookbook)", re.I),
    re.compile(r"the good, the bad(?:, and the ugly)?", re.I),
    re.compile(r"to rule (?:them|it) all", re.I),
    re.compile(r"[;:?!]{2,}"),                  # two or more of ; : ? ! in a row
    re.compile(r"[A-Za-z]+\s*&\s*[A-Za-z]+"),   # "X & Y"
    re.compile(r"\([^)]{0,30}\)"),              # brief parenthetical
    re.compile(r":\s*\)", re.I),                # smiley-ish ": )"
]

# Title openings that earn a penalty (compared against the lower-cased title).
FORMAL_STARTS = tuple(["on the","towards","an analysis of","a study of","investigating","notes on"])

def rule_playful_score(title: str) -> float:
    """Score how playful a paper title looks using hand-written rules.

    Scoring:
      * +1.0 for each distinct PLAYFUL_WORDS entry present as a whole word or
        phrase (case-insensitive, optional trailing "s");
      * +0.8 for each regex in PATTERNS that matches;
      * +0.5 if the title contains "!" and +0.3 if it contains "?";
      * -0.8 if the title starts with one of FORMAL_STARTS.

    Args:
        title: The title text; any value is coerced with ``str``.

    Returns:
        The total score rounded to three decimals (can be negative).
    """
    t = str(title).strip()
    tl = t.lower()
    score = 0.0
    for word_re in _PLAYFUL_WORD_PATTERNS:
        if word_re.search(tl):
            score += 1.0
    for p in PATTERNS:
        if p.search(t):
            score += 0.8
    if "!" in t:
        score += 0.5
    if "?" in t:
        score += 0.3
    if tl.startswith(FORMAL_STARTS):
        score -= 0.8
    return round(score, 3)

# Score every title with the rule-based heuristic (titles are coerced to str first).
title_text = df["title"].astype(str)
df["rule_score"] = title_text.map(rule_playful_score)

# Sanity check: the eight highest-scoring titles.
df[["title","rule_score"]].sort_values("rule_score", ascending=False).head(8)

In [None]:
!pip -q install transformers
from transformers import pipeline
import torch, numpy as np

# Config: choose a larger MNLI model and batch size when a GPU is available.
HAS_GPU    = torch.cuda.is_available()
MODEL_NAME = "facebook/bart-large-mnli" if HAS_GPU else "typeform/distilbert-base-uncased-mnli"
DEVICE     = 0 if HAS_GPU else -1
BATCH_SIZE = 48 if HAS_GPU else 12

LABELS  = ["playful", "neutral"]
HYP     = "This paper title is {}."
ZS_THRESH = 0.60
USE_CASCADE = True
LOW, HIGH   = 0.8, 2.5   # "maybe" band for rule_score

zs = pipeline("zero-shot-classification", model=MODEL_NAME, device=DEVICE)

# Select which titles to score: only the "maybe" band when cascading, else all rows.
if USE_CASCADE and "rule_score" in df.columns:
    mask = df["rule_score"].between(LOW, HIGH)
    idx = df.index[mask]
else:
    idx = df.index

titles = df.loc[idx, "title"].fillna("").astype(str).tolist()

# Always start from a fresh column so re-running this cell with a different band
# (or with USE_CASCADE toggled) cannot leave stale probabilities behind.
# Rows that are not scored keep 0.0.
# NOTE(review): rows with rule_score above HIGH also stay at 0.0, so with the 50/50
# ensemble and 0.55 threshold used later they can never be flagged — confirm intended.
df["zs_playful_prob"] = 0.0

if titles:
    outs = zs(
        titles,
        candidate_labels=LABELS,
        hypothesis_template=HYP,
        multi_label=False,
        truncation=True,
        batch_size=BATCH_SIZE  # <-- key for speed
    )
    # Normalise a possible single-result dict to a list of results.
    if isinstance(outs, dict):
        outs = [outs]
    # The pipeline returns each result's labels sorted by score, so the position of
    # "playful" differs from row to row and must be looked up per result.
    probs = np.array(
        [float(o["scores"][o["labels"].index("playful")]) for o in outs],
        dtype=float,
    )
    df.loc[idx, "zs_playful_prob"] = probs

df["playful_zs_flag"] = df["zs_playful_prob"] >= ZS_THRESH
df.sort_values("zs_playful_prob", ascending=False).head(10)[["title","zs_playful_prob"]]


In [None]:
import math, os
# Fall back to a neutral rule score when the heuristic cell has not been run.
if "rule_score" not in df:
    df["rule_score"] = 0.0

def squash(x, center=1.5):
    """Logistic function shifted so that ``x == center`` maps to 0.5.

    Used to bring the unbounded rule score into the open interval (0, 1).
    """
    shifted = center - x
    return 1 / (1 + math.exp(shifted))

# Bring the rule score into (0, 1), then average it equally with the zero-shot probability.
df["rule_norm"] = df["rule_score"].map(squash)
df["ensemble_score"] = 0.5*df["rule_norm"] + 0.5*df["zs_playful_prob"]

ENSEMBLE_THRESH = 0.55   # raise for precision, lower for recall
df["playful_flag"] = df["ensemble_score"] >= ENSEMBLE_THRESH

# Share of titles flagged by the ensemble.
print(f"Flagged (ensemble): {df['playful_flag'].mean():.2%} of {len(df)} titles")

# Persist all flagged titles, plus up to 30 reproducibly sampled non-flagged ones.
os.makedirs("results", exist_ok=True)
playful_rows = df[df["playful_flag"]]
playful_rows.to_csv("results/playful_titles.csv", index=False)

neutral_rows = df[~df["playful_flag"]]
n_neutral_examples = min(30, len(neutral_rows))
neutral_rows.sample(n_neutral_examples, random_state=7).to_csv(
    "results/neutral_title_examples.csv", index=False
)

print("Saved: results/playful_titles.csv and results/neutral_title_examples.csv")


In [None]:
!pip -q install -U "transformers>=4.41,<5" datasets accelerate rouge-score nltk unidecode

import os, re, numpy as np, pandas as pd, torch, nltk
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from rouge_score import rouge_scorer
from unidecode import unidecode

# Download NLTK's "punkt" resource at run time.
# NOTE(review): no cell visible here calls an NLTK tokenizer directly — confirm this is still needed.
nltk.download("punkt")


In [None]:
# Reload the cleaned dataset and keep only rows that have both an abstract and a title.
df = pd.read_csv("data/titles_abstracts.csv")
df = df.dropna(subset=["abstract","title"]).copy()

def clean(s):
    """Return ``s`` as text passed through ``unidecode``, with every run of
    whitespace collapsed to a single space and the ends trimmed."""
    transliterated = unidecode(str(s))
    return re.sub(r"\s+", " ", transliterated).strip()

# Normalise both text columns (abstract first, then title).
for text_column in ("abstract", "title"):
    df[text_column] = df[text_column].apply(clean)

# (Optional) cap the number of examples for a first run; sampling is seeded.
MAX_EXAMPLES = min(len(df), 4000)
df = df.sample(MAX_EXAMPLES, random_state=42).reset_index(drop=True)

# 90/10 train/validation split over the shuffled rows.
split = int(0.9 * len(df))
train_df = df.iloc[:split]
valid_df = df.iloc[split:]

ds = DatasetDict({
    split_name: Dataset.from_pandas(frame[["abstract","title"]])
    for split_name, frame in (("train", train_df), ("validation", valid_df))
})

# Tokenizer and sequence-length settings.
MODEL_NAME = "google/flan-t5-small"  # switch to flan-t5-base if more VRAM is available
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

MAX_IN = 512
MAX_OUT = 32
PROMPT_PREFIX = "Write a concise, specific academic paper title for the abstract.\nAbstract: "

def preprocess(batch):
    """Tokenize one batch of abstract/title pairs for seq2seq training.

    Args:
        batch: Mapping with "abstract" and "title" lists (as passed by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prompted abstracts (truncated to MAX_IN
        tokens) with an added "labels" entry holding the title token ids
        (truncated to MAX_OUT tokens).
    """
    inputs  = [PROMPT_PREFIX + a for a in batch["abstract"]]
    targets = batch["title"]
    model_inputs = tok(inputs, max_length=MAX_IN, truncation=True)
    # `text_target=` is the supported way to tokenize targets; the
    # `as_target_tokenizer()` context manager is deprecated in transformers.
    labels = tok(text_target=targets, max_length=MAX_OUT, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches; the raw text columns are dropped so only the
# fields returned by preprocess remain.
tokenized = ds.map(preprocess, batched=True, remove_columns=["abstract","title"])


In [None]:
# Model plus a collator that pads each batch dynamically.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tok, model=model)

# Training hyper-parameters, gathered in one place.
# evaluation_strategy / eval_steps are left out on purpose to avoid version
# differences; evaluation is done manually after training.
training_config = dict(
    output_dir="checkpoints/flan_t5_sft",
    per_device_train_batch_size=8,       # drop to 4 on out-of-memory errors
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=2,
    warmup_ratio=0.05,
    logging_steps=50,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),      # mixed precision only when a GPU is present
    report_to=[],                        # disable external loggers such as wandb
)
args = Seq2SeqTrainingArguments(**training_config)

# The validation set is attached, but no automatic evaluation runs during training.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tok,
)

trainer.train()


In [None]:
# Helper: generate one title
def gen_title(abstract, max_new_tokens=32, num_beams=4):
    """Generate a title for a single abstract with the fine-tuned model.

    Args:
        abstract: Abstract text; PROMPT_PREFIX is prepended before tokenizing
            and the prompt is truncated to MAX_IN tokens.
        max_new_tokens: Upper bound on the number of generated tokens.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded title with special tokens removed and whitespace stripped.
    """
    # The model is left in training mode after trainer.train(); switch to eval
    # mode so dropout does not perturb generation.
    model.eval()
    inp = tok(PROMPT_PREFIX + abstract, return_tensors="pt", truncation=True, max_length=MAX_IN).to(model.device)
    # Honour the caller's max_new_tokens (previously 32 was hard-coded here).
    out = model.generate(**inp, max_new_tokens=max_new_tokens, num_beams=num_beams)
    return tok.decode(out[0], skip_special_tokens=True).strip()

# Take the first N validation rows (at most 200) and generate a title for each.
N = min(200, len(valid_df))
sample = valid_df.head(N).copy()
sample["gen_title"] = sample["abstract"].map(gen_title)

# ROUGE-L F-measure between reference and generated title (rough overlap signal).
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def _rouge_l_f1(row):
    """ROUGE-L F-measure of the generated title against the reference title."""
    result = scorer.score(row["title"], row["gen_title"])
    return result["rougeL"].fmeasure

sample["rougeL"] = sample.apply(_rouge_l_f1, axis=1)

print("Mean ROUGE-L on sample:", round(sample["rougeL"].mean(), 3))
sample[["title","gen_title","rougeL"]].head(5)


In [None]:
# Persist the evaluated sample and the trained model.
os.makedirs("results", exist_ok=True)
sample.to_csv("results/sft_titles_validation_sample.csv", index=False)
trainer.save_model("checkpoints/flan_t5_sft/best")

# One print call; sep="\n" yields the same three output lines as separate prints.
print(
    "Saved:",
    "- results/sft_titles_validation_sample.csv",
    "- checkpoints/flan_t5_sft/best (model + tokenizer)",
    sep="\n",
)


In [None]:
The next cell strips widget metadata and cell outputs to create a *clean* copy of the notebook
so that GitHub can render it. It **does not affect any results**, so you can skip it when
reading the notebook.

To publish: run the next cell. It writes a cleaned `.ipynb` file, which I then upload to the repo in place of the original.


In [None]:
# Clean a Colab notebook so GitHub can render it (removes metadata.widgets + outputs)
!pip -q install nbformat

from google.colab import drive
drive.mount('/content/drive')

import nbformat, glob, os, time, shutil
from google.colab import files

# 1) Find candidate .ipynb files in Drive (most recent first)
cands = sorted(
    glob.glob('/content/drive/MyDrive/**/*.ipynb', recursive=True),
    key=os.path.getmtime, reverse=True
)

if not cands:
    raise SystemExit("No .ipynb found in Drive. Save your notebook to Drive first: File → Save a copy in Drive.")

print("Top candidates:")
for i, p in enumerate(cands[:10]):
    print(f"[{i}] {time.ctime(os.path.getmtime(p))}  {p}")

# 2) Pick the most recent (change idx if you want a different one)
idx = 0
SRC = cands[idx]
print("\nUsing:", SRC)

nb = nbformat.read(SRC, as_version=4)

# remove notebook-level widgets metadata
nb.metadata.pop("widgets", None)

# remove cell-level widgets metadata and outputs
for cell in nb.cells:
    if isinstance(cell.get("metadata"), dict):
        cell.metadata.pop("widgets", None)
    if cell.get("cell_type") == "code":
        cell["outputs"] = []
        cell["execution_count"] = None

clean_name = "cleaned_" + os.path.basename(SRC)
DST = f"/content/{clean_name}"
nbformat.write(nb, DST)
print("Wrote cleaned notebook:", DST)

# 4) Offer download so you can upload to GitHub
files.download(DST)
