Run ```conda env create -f environment.yml```
then ```conda activate cs-181-final```

In [1]:
import pandas as pd
import numpy as np
import re

# Read in data
train_df = pd.read_csv('data/train.csv')
val_df = pd.read_csv('data/val.csv')

# Helper function for cleaning text
def clean_html(text):
    if pd.isna(text):
        return text
    # Remove HTML tags
    clean = re.sub(r'<.*?>', '', str(text))
    # Remove extra whitespaces
    clean = re.sub(r'\s+', ' ', clean).strip()
    # Replace HTML entities
    clean = re.sub(r'&amp;', '&', clean)
    clean = re.sub(r'&lt;', '<', clean)
    clean = re.sub(r'&gt;', '>', clean)
    clean = re.sub(r'&quot;|&#34;', '"', clean)
    clean = re.sub(r'&apos;|&#39;', "'", clean)
    return clean

train_df['cleaned_text'] = train_df['snip'].apply(clean_html)
val_df['cleaned_text'] = val_df['snip'].apply(clean_html)
print(train_df)

                                                    snip   channel  \
0      first of all, it feels like covid again but in...  FOXNEWSW   
1      to be a software drivenrganization where softw...     CSPAN   
2      you discuss the <b>power</b> <b>of</b> <em>ai<...    CSPAN2   
3      <em>ai</em> <b>bots</b> <b>like</b> chatgpt an...   BBCNEWS   
4      . >> i could sleep <b>ten</b> <b>hours</b> <em...  FOXNEWSW   
...                                                  ...       ...   
19868  cardiovascular science, but they're also pione...  FOXNEWSW   
19869  <b>i</b> <b>of</b> <em>ai</em> <b>in</b> <b>di...   BBCNEWS   
19870  weighing down on the major averages, both tech...      KTVU   
19871  i also <b>think</b> <b>crypto</b> <em>ai</em> ...    CSPAN2   
19872  as we have worked to monitor the adoption iden...    CSPAN2   

                                            cleaned_text  
0      first of all, it feels like covid again but in...  
1      to be a software drivenrganization

In [5]:
import evaluate
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return accuracy_metric.compute(predictions=preds, references=labels)

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset, DatasetDict
import evaluate

# 1) Prepare your texts & labels
texts  = train_df["cleaned_text"].astype(str).tolist()
labels = train_df["channel"].tolist()

le         = LabelEncoder()
int_labels = le.fit_transform(labels)
id2label   = {i: l for i, l in enumerate(le.classes_)}
label2id   = {l: i for i, l in id2label.items()}

# 2) Split & wrap as 🤗 Datasets
tr_txt, val_txt, tr_lbl, val_lbl = train_test_split(
    texts, int_labels,
    test_size=0.1,
    random_state=42,
    stratify=int_labels,
)

dataset = DatasetDict({
    "train":      Dataset.from_dict({"text": tr_txt,  "label": tr_lbl}),
    "validation": Dataset.from_dict({"text": val_txt, "label": val_lbl}),
})

# 3) Tokenize
MODEL_NAME = "distilbert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True)

dataset = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

# 4) Load model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(le.classes_),
    id2label=id2label,
    label2id=label2id,
)

# 5) Setup TrainingArguments (modern API only)

training_args = TrainingArguments(
    output_dir="news-channel-clf",

    # ← use these names
    eval_strategy="epoch",             # run eval once per epoch
    save_strategy="epoch",             # checkpoint once per epoch
    logging_strategy="steps",          # you’ll still control frequency via logging_steps
    load_best_model_at_end=True,       # keep the best checkpoint
    metric_for_best_model="accuracy",  # which metric to monitor

    # all your other hyperparams…
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,

    # how often to log
    logging_steps=100,

    # turn off external logging
    report_to="none",
)

# 6) Metrics with the evaluate library
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.from_numpy(logits), dim=-1)
    return accuracy.compute(predictions=preds, references=labels)

# 7) Trainer & train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()


Map: 100%|██████████| 17885/17885 [00:02<00:00, 6903.17 examples/s]
Map: 100%|██████████| 1988/1988 [00:00<00:00, 7753.69 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 13.02 GB, other allocations: 5.10 GB, max allowed: 18.13 GB). Tried to allocate 89.42 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).