In [None]:
!pip -q install transformers accelerate datasets evaluate scikit-learn gdown

In [None]:
import os, json, math, numpy as np, pandas as pd, torch
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import Dataset
import evaluate, gdown

In [None]:
# Make sure the local download directory exists before any file is fetched.
os.makedirs("data", exist_ok=True)

# Provenance record: the paper, the reference code, and the Hugging Face
# checkpoint name. It is printed here and written to the readme JSON later.
SOURCES = dict(
    paper="Devlin et al., 2019, BERT",
    code="google-research/bert",
    hf_model="bert-base-uncased",
)
print(json.dumps(SOURCES, indent=2))

{
  "paper": "Devlin et al., 2019, BERT",
  "code": "google-research/bert",
  "hf_model": "bert-base-uncased"
}


In [None]:
# Google Drive share links for the train/dev/test CSV splits used when
# use_small is False. They are "view" links, which is why fetch() calls
# gdown with fuzzy=True.
TRAIN_URL = "https://drive.google.com/file/d/1uPj_AB2OFkM_W0Gyt7hQYOZgur3VgEMn/view?usp=sharing"
DEV_URL   = "https://drive.google.com/file/d/17prMQSyDU3E5snoQ_mbyBY93f0r-XPZt/view?usp=sharing"
TEST_URL  = "https://drive.google.com/file/d/1hcBJQVq4637iwfmQSpKl6q7iiRRiBTqK/view?usp=sharing"

# Share links for the alternative splits selected when use_small is True
# (intended for a quick smoke test).
SMALL_TRAIN_URL = "https://drive.google.com/file/d/1KMxYXb94WKajr6W2ff5uA-rA9f07aS5u/view?usp=sharing"
SMALL_DEV_URL   = "https://drive.google.com/file/d/1-qUxMo-LBRQEBEcZkWNhieYcGZj1E7Ow/view?usp=sharing"
SMALL_TEST_URL  = "https://drive.google.com/file/d/1D2JT_m2w-QNm0UMGPdrAlFPqWDKIe84q/view?usp=sharing"

def fetch(url, out):
    """Download ``url`` to the local path ``out`` unless ``out`` already exists.

    Args:
        url: Google Drive share link; resolved by gdown with ``fuzzy=True``.
        out: Destination file path. An existing file is treated as a cache hit
            and nothing is downloaded.

    Raises:
        RuntimeError: if the download returns without producing ``out``, so the
            failure is reported here instead of as a later read error.
    """
    if os.path.exists(out):
        return
    downloaded = gdown.download(url, out, fuzzy=True, quiet=False)
    if downloaded is None or not os.path.exists(out):
        raise RuntimeError(f"Download failed for {url!r}; expected a file at {out!r}")

use_small = False  # set to True for a quick smoke test

# The small and full splits are stored under different filenames. fetch() skips
# any path that already exists, so sharing one filename would silently reuse
# whichever variant was downloaded first after use_small is toggled.
if use_small:
    split_urls = {"train": SMALL_TRAIN_URL, "dev": SMALL_DEV_URL, "test": SMALL_TEST_URL}
    file_prefix = "small_"
else:
    split_urls = {"train": TRAIN_URL, "dev": DEV_URL, "test": TEST_URL}
    file_prefix = ""  # full splits keep their original paths (data/train.csv, ...)

split_paths = {split: f"data/{file_prefix}{split}.csv" for split in split_urls}
for split, split_url in split_urls.items():
    fetch(split_url, split_paths[split])

# Only these three columns are used downstream.
CSV_COLUMNS = ["review_id", "text", "stars"]
train_df = pd.read_csv(split_paths["train"], usecols=CSV_COLUMNS)
dev_df   = pd.read_csv(split_paths["dev"],   usecols=CSV_COLUMNS)
test_df  = pd.read_csv(split_paths["test"],  usecols=CSV_COLUMNS)

In [None]:
# Cap each split at a fixed number of rows. Sampling is seeded (random_state=42)
# so the same subset is drawn on every run. Set a cap to None to disable it.
MAX_TRAIN_ROWS = 100000  # the BERT paper cited in the literature review used ~96.1K samples
if MAX_TRAIN_ROWS is not None and len(train_df) > MAX_TRAIN_ROWS:
    train_df = train_df.sample(n=MAX_TRAIN_ROWS, random_state=42)

# 5-fold CV on ~96K samples corresponds to roughly 20K held-out rows, so the
# dev and test splits are capped at about that size.

# Each guard must check the frame being sampled (these previously checked
# len(train_df)): DataFrame.sample(n=...) raises ValueError when n exceeds the
# frame's own length.
MAX_DEV_ROWS = 20000
if MAX_DEV_ROWS is not None and len(dev_df) > MAX_DEV_ROWS:
    dev_df = dev_df.sample(n=MAX_DEV_ROWS, random_state=42)
MAX_TEST_ROWS = 25000
if MAX_TEST_ROWS is not None and len(test_df) > MAX_TEST_ROWS:
    test_df = test_df.sample(n=MAX_TEST_ROWS, random_state=42)

In [None]:
# Class ids are the star rating minus one, so that they start at zero.
for split_df in (train_df, dev_df, test_df):
    split_df["label"] = split_df["stars"] - 1

In [None]:
# Tokenizer for the same checkpoint name ("bert-base-uncased") that train_once
# loads the model from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def tokenize_fn(batch):
    """Tokenize the ``text`` entries of ``batch`` with the module-level tokenizer.

    Sequences longer than 256 tokens are truncated. No padding is requested
    here. The tokenizer's output is returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=256)
    return encoded

In [None]:

def _build_split(frame):
    """Turn one split's DataFrame into a tokenized ``Dataset``.

    Keeps ``review_id`` and ``label``, and replaces the raw ``text`` column with
    the tokenizer outputs produced by ``tokenize_fn``.
    """
    kept = frame[["review_id", "text", "label"]]
    raw_ds = Dataset.from_pandas(kept, preserve_index=False)
    return raw_ds.map(tokenize_fn, batched=True, remove_columns=["text"])


train_ds = _build_split(train_df)
dev_ds = _build_split(dev_df)
test_ds = _build_split(test_df)


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# Collator passed to every Trainer below. tokenize_fn does not pad, so padding
# is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def qwk(y_true, y_pred, k=5):
    """Quadratic weighted kappa between ``y_true`` and ``y_pred``.

    Args:
        y_true: true class ids, drawn from ``0 .. k-1``.
        y_pred: predicted class ids, drawn from ``0 .. k-1``.
        k: number of classes.

    Returns:
        ``1 - (weighted observed disagreement / weighted expected disagreement)``,
        or ``0.0`` when the expected disagreement is not positive.
    """
    classes = list(range(k))
    observed = confusion_matrix(y_true, y_pred, labels=classes).astype(np.float64)
    total = observed.sum()

    # Penalty grows with the squared distance between classes, scaled so the
    # largest possible gap (k-1) has weight 1.
    weights = np.zeros((k, k))
    for row, col in np.ndindex(k, k):
        weights[row, col] = ((row - col) ** 2) / ((k - 1) ** 2)

    # Expected counts if true and predicted labels were independent.
    true_counts = observed.sum(axis=1)
    pred_counts = observed.sum(axis=0)
    expected = np.outer(true_counts, pred_counts) / total

    observed_penalty = (weights * observed).sum()
    expected_penalty = (weights * expected).sum()
    if expected_penalty > 0:
        return 1.0 - observed_penalty / expected_penalty
    return 0.0

def compute_metrics(eval_pred):
    """Compute dev metrics from a ``(logits, labels)`` pair.

    Returns a dict with ``accuracy``, ``macro_f1``, ``mae`` and ``qwk``.
    Predictions are the argmax over the last axis of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    # Both sides are shifted by +1 before the absolute error is taken.
    abs_error = np.abs((predicted + 1) - (gold + 1))
    return {
        "accuracy": accuracy_score(gold, predicted),
        "macro_f1": f1_score(gold, predicted, average="macro"),
        "mae": np.mean(abs_error),
        "qwk": qwk(gold, predicted, k=5),
    }

In [None]:
# Class id 0-4 maps to the string name "1"-"5"; label2id is the inverse mapping.
id2label = dict(enumerate(map(str, range(1, 6))))
label2id = dict(zip(id2label.values(), id2label.keys()))

In [None]:
def train_once(bs, lr, epochs, outdir):
    """Fine-tune bert-base-uncased once and export the best model to ``outdir``.

    The model is evaluated on ``dev_ds`` and checkpointed after every epoch.
    At the end of training the checkpoint with the best dev ``macro_f1`` is
    loaded back into the trainer.

    Args:
        bs: per-device batch size, used for both training and evaluation.
        lr: learning rate.
        epochs: number of training epochs.
        outdir: run directory. Receives the per-epoch checkpoints, the exported
            best model, and ``dev_metrics.json``.

    Returns:
        ``(trainer, metrics)`` where ``metrics`` is the dict returned by
        ``trainer.evaluate()`` on the dev set for the loaded best model.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=5,
        id2label=id2label,
        label2id=label2id
    )
    args = TrainingArguments(
        output_dir=outdir,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs,
        learning_rate=lr,
        num_train_epochs=epochs,
        eval_strategy="epoch",  # named `evaluation_strategy` in older transformers releases
        save_strategy="epoch",
        metric_for_best_model="macro_f1",
        load_best_model_at_end=True,
        # Mixed precision needs a CUDA device; fall back to fp32 instead of
        # failing when no GPU is available.
        fp16=torch.cuda.is_available(),
        logging_steps=100,
        report_to="none",
        seed=42
    )
    # NOTE(review): newer transformers releases appear to deprecate `tokenizer=`
    # in favour of `processing_class=`. Confirm against the installed version
    # before switching.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=dev_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    trainer.train()
    # Checkpoints are written to outdir/checkpoint-*; export the loaded best
    # model to outdir itself so that from_pretrained(outdir) works later.
    trainer.save_model(outdir)
    metrics = trainer.evaluate()
    with open(os.path.join(outdir, "dev_metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)
    return trainer, metrics

In [None]:
os.makedirs("models", exist_ok=True)

# Grid search over batch size and learning rate with a fixed epoch count.
# A run whose dev_metrics.json already exists is reused rather than retrained,
# so an interrupted search can be resumed by re-running this cell. Delete a
# run directory to force that configuration to be retrained.
grid = []
for bs in [16, 32]:
    for lr in [2e-5, 3e-5, 5e-5]:
        epochs = 3
        outdir = f"models/bert_bs{bs}_lr{lr}_ep{epochs}"
        metrics_path = os.path.join(outdir, "dev_metrics.json")
        if os.path.exists(metrics_path):
            with open(metrics_path) as f:
                metrics = json.load(f)
        else:
            trainer, metrics = train_once(bs, lr, epochs, outdir)
        grid.append((metrics["eval_macro_f1"], outdir))

# Select the run with the highest dev macro-F1; on a tie the earliest run wins.
best_dir = max(grid, key=lambda x: x[0])[1]
print({"best_checkpoint": best_dir})

# Andrew, currently impl at 4:26 gets 5.23 it/s

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Mae,Qwk
1,0.6331,0.619183,0.73805,0.64615,0.2939,0.915436
2,0.5,0.633531,0.74085,0.662017,0.287,0.917239
3,0.3853,0.700423,0.73755,0.656824,0.2904,0.916817


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluation-only Trainer built around the model loaded from the selected run.
best_model = AutoModelForSequenceClassification.from_pretrained(best_dir)
best_eval_args = TrainingArguments(
    output_dir=best_dir,
    per_device_eval_batch_size=32,
    report_to="none",
)
best_trainer = Trainer(
    model=best_model,
    args=best_eval_args,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:

def write_predictions(trainer, ds, ids, out_csv):
    """Run ``trainer`` on ``ds`` and save one prediction per id to ``out_csv``.

    Args:
        trainer: object whose ``predict(ds)`` result has a ``predictions`` array.
        ds: dataset passed to ``trainer.predict``.
        ids: review ids, in the same order as the rows of ``ds``.
        out_csv: output path. The file has columns ``review_id`` and ``pred``,
            and no index column.
    """
    scores = trainer.predict(ds).predictions
    # argmax yields a zero-based class id; adding 1 gives the value written as `pred`.
    predicted = 1 + np.argmax(scores, axis=-1)
    table = pd.DataFrame({"review_id": ids, "pred": predicted})
    table.to_csv(out_csv, index=False)

In [None]:
# Output directory for the prediction CSVs, confusion matrix and readme JSON.
os.makedirs("predictions", exist_ok=True)

In [None]:
# Raw model outputs on the dev set from the selected run; argmax is applied in
# the next cell.
dev_logits = best_trainer.predict(dev_ds).predictions

In [None]:
# Predicted zero-based class id per dev example (no +1 shift applied here).
dev_pred = np.argmax(dev_logits, axis=-1)

In [None]:
# 5x5 dev confusion matrix over class ids 0-4; the dev labels are passed as the
# first (true) argument and the predictions as the second.
cm = confusion_matrix(dev_ds["label"], dev_pred, labels=list(range(5)))

In [None]:
# Save the dev confusion matrix, labelling rows and columns 1-5 instead of the
# zero-based class ids.
star_axis = [1, 2, 3, 4, 5]
cm_table = pd.DataFrame(cm, index=star_axis, columns=star_axis)
cm_table.to_csv("predictions/bert_dev_confusion_matrix.csv", index=True)

# One prediction CSV per evaluated split, keyed by review_id.
for split_ds, split_df, csv_path in (
    (dev_ds, dev_df, "predictions/bert_dev.csv"),
    (test_ds, test_df, "predictions/bert_test.csv"),
):
    write_predictions(best_trainer, split_ds, split_df["review_id"], csv_path)

# Small JSON record describing how the predictions were produced.
with open("predictions/bert_readme.json", "w") as f:
    run_summary = {
        "sources": SOURCES,
        "best_checkpoint": best_dir,
        "max_length": 256,
        "selection_metric": "macro_f1",
        "label_mapping": "stars -> label = stars-1",
    }
    json.dump(run_summary, f, indent=2)


A faithful BERT fine-tune on the Yelp splits, using the hyperparameter ranges recommended in the BERT paper.
Saved models under models/bert_bs{bs}_lr{lr}_ep3 (one directory per grid point); the path of the best run is printed in the log.
predictions/bert_dev.csv and predictions/bert_test.csv with columns [review_id, pred], where pred is the predicted star rating (1–5).
predictions/bert_dev_confusion_matrix.csv, plus a dev_metrics.json file in each run directory (including the best one).

Notes
Model: BERT base uncased from Devlin et al., 2019, initialized from the official weights via Hugging Face.
Hyperparameters searched: batch size {16, 32}, learning rate {2e-5, 3e-5, 5e-5}, epochs 3, max length 256, dropout 0.1 (built in).
Selection: best Macro-F1 on the dev split.
Metrics reported: Accuracy, Macro-F1, MAE, Quadratic Weighted Kappa, and the 5x5 confusion matrix.
