Import e Accelerator

In [1]:
from datasets import load_from_disk
xsum = load_from_disk("/Users/hiamrob/Downloads")

In [18]:
xsum["validation"][1]

{'document': 'Voges was forced to retire hurt on 86 after suffering the injury while batting during the County Championship draw with Somerset on 4 June.\nMiddlesex hope to have the Australian back for their T20 Blast game against Hampshire at Lord\'s on 3 August.\nThe 37-year-old has scored 230 runs in four first-class games this season at an average of 57.50.\n"Losing Adam is naturally a blow as he contributes significantly to everything we do," director of cricket Angus Fraser said.\n"His absence, however, does give opportunities to other players who are desperate to play in the first XI.\n"In the past we have coped well without an overseas player and I expect us to do so now."\nDefending county champions Middlesex are sixth in the Division One table, having drawn all four of their matches this season.\nVoges retired from international cricket in February with a Test batting average of 61.87 from 31 innings, second only to Australian great Sir Donald Bradman\'s career average of 99.

### Baseline Lead-3 + TF-IDF

Tokenizer: nltk

In [23]:
from datasets import load_from_disk
from datasets import Dataset
from evaluate import load
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
nltk.download('punkt')

# ---------------------- #
# LEAD-3 Implementation
# ---------------------- #
def lead_3(example):
    sents = sent_tokenize(example["document"])
    lead_summary = " ".join(sents[:3])
    return {"lead3_pred": lead_summary}

xsum["test"] = xsum["test"].map(lead_3)

# ----------------------------- #
# TF-IDF + Cosine Similarity
# ----------------------------- #
# Step 1: fit vectorizer on all train sentences
all_sentences = []
for doc in xsum["train"]["document"]:
    all_sentences.extend(sent_tokenize(doc))

tfidf = TfidfVectorizer().fit(all_sentences)

# Step 2: apply to test set (frase vs centroide)
def tfidf_centroid_extract(example):
    sents = sent_tokenize(example["document"])
    if len(sents) <= 3:
        return {"tfidf_centroid_pred": example["document"]}

    vectors = tfidf.transform(sents)
    centroid = vectors.mean(axis=0)
    sims = cosine_similarity(vectors, np.asarray(centroid))

    top_idx = sims.ravel().argsort()[-3:][::-1]
    top_sents = [sents[i] for i in top_idx]
    return {"tfidf_centroid_pred": " ".join(top_sents)}

xsum["test"] = xsum["test"].map(tfidf_centroid_extract)

# ------------------------ #
# ROUGE Evaluation
# ------------------------ #
rouge = load("rouge")

# Lead-3 evaluation
lead_results = rouge.compute(
    predictions=xsum["test"]["lead3_pred"],
    references=xsum["test"]["summary"]
)
print("📊 ROUGE Lead-3:", lead_results)

# TF-IDF + centroid evaluation
tfidf_centroid_results = rouge.compute(
    predictions=xsum["test"]["tfidf_centroid_pred"],
    references=xsum["test"]["summary"]
)
print("📊 ROUGE TF-IDF (centroid):", tfidf_centroid_results)

[nltk_data] Downloading package punkt to /Users/hiamrob/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

📊 ROUGE Lead-3: {'rouge1': 0.18386340527946998, 'rouge2': 0.025144759397878358, 'rougeL': 0.11909126678049202, 'rougeLsum': 0.1195221051011808}
📊 ROUGE TF-IDF (centroid): {'rouge1': 0.16948091042982166, 'rouge2': 0.029578252227835878, 'rougeL': 0.11656165841196645, 'rougeLsum': 0.11875775206866036}


### LSTM

Tokenizer: t5-small --> sub word, gestione oov, usa SentencePiece

In [4]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import T5TokenizerFast
from datasets import load_from_disk
from tqdm import tqdm

# ───────────────────────────────
# ⚙️ Config
# ───────────────────────────────
device = torch.device("cpu")
MAX_INPUT_LEN = 64
MAX_TARGET_LEN = 32
BATCH_SIZE = 1
EPOCHS = 1
EMBEDDING_DIM = 64
HIDDEN_DIM = 128

# ───────────────────────────────
# 📦 Dataset
# ───────────────────────────────

tokenizer = T5TokenizerFast.from_pretrained("t5-small")
train_split = xsum["train"].select(range(5000))         # puoi aumentare dopo test
val_split = xsum["validation"].select(range(1000))

class XSumDataset(Dataset):
    def __init__(self, hf_dataset, tokenizer, max_input_len, max_target_len):
        self.data = hf_dataset
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        doc = self.data[idx]["document"]
        summ = self.data[idx]["summary"]
        inputs = self.tokenizer(doc, padding='max_length', truncation=True,
                                max_length=self.max_input_len, return_tensors="pt")
        targets = self.tokenizer(summ, padding='max_length', truncation=True,
                                 max_length=self.max_target_len, return_tensors="pt")
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "target_ids": targets["input_ids"].squeeze(0)
        }

train_dataset = XSumDataset(train_split, tokenizer, MAX_INPUT_LEN, MAX_TARGET_LEN)
val_dataset   = XSumDataset(val_split, tokenizer, MAX_INPUT_LEN, MAX_TARGET_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=1, shuffle=False)

# ───────────────────────────────
# 🧠 LSTM Encoder-Decoder + Attention
# ───────────────────────────────
class EncoderLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class DecoderLSTMWithAttention(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_step, hidden, cell, encoder_outputs):
        embedded = self.embedding(input_step)  # (batch, 1, emb)
        attn_scores = torch.bmm(hidden[-1].unsqueeze(1), encoder_outputs.transpose(1, 2))
        attn_weights = torch.softmax(attn_scores, dim=-1)
        context = torch.bmm(attn_weights, encoder_outputs)
        lstm_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        output = self.out(output.squeeze(1))
        return output, hidden, cell

vocab_size = tokenizer.vocab_size
encoder = EncoderLSTM(vocab_size, EMBEDDING_DIM, HIDDEN_DIM).to(device)
decoder = DecoderLSTMWithAttention(vocab_size, EMBEDDING_DIM, HIDDEN_DIM).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
encoder_optimizer = optim.Adam(encoder.parameters(), lr=1e-3)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=1e-3)

# ───────────────────────────────
# 🔁 Training loop
# ───────────────────────────────
for epoch in range(EPOCHS):
    encoder.train()
    decoder.train()
    epoch_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids = batch["input_ids"].to(device)
        target_ids = batch["target_ids"].to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, hidden, cell = encoder(input_ids)

        loss = 0
        decoder_input = target_ids[:, 0].unsqueeze(1)  # <sos>
        for t in range(1, target_ids.size(1)):
            decoder_output, hidden, cell = decoder(decoder_input, hidden, cell, encoder_outputs)
            loss += criterion(decoder_output, target_ids[:, t])
            decoder_input = target_ids[:, t].unsqueeze(1)  # Teacher forcing

        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {epoch_loss:.2f}")

# ───────────────────────────────
# 🔎 Greedy Decoding
# ───────────────────────────────
def greedy_decode(encoder, decoder, input_ids, max_len=32):
    encoder_outputs, hidden, cell = encoder(input_ids)
    decoder_input = torch.tensor([[tokenizer.pad_token_id]]).to(device)  # <sos>
    output_ids = []

    for _ in range(max_len):
        decoder_output, hidden, cell = decoder(decoder_input, hidden, cell, encoder_outputs)
        next_token = decoder_output.argmax(1)
        output_ids.append(next_token.item())
        if next_token.item() == tokenizer.eos_token_id:
            break
        decoder_input = next_token.unsqueeze(1)

    return tokenizer.decode(output_ids, skip_special_tokens=True)

# ───────────────────────────────
# 🧪 Evaluation su validation
# ───────────────────────────────
encoder.eval()
decoder.eval()

for i, batch in enumerate(val_loader):
    input_ids = batch["input_ids"].to(device)
    target_ids = batch["target_ids"].to(device)
    generated = greedy_decode(encoder, decoder, input_ids)
    reference = tokenizer.decode(target_ids[0], skip_special_tokens=True)
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

    print(f"\n--- Example {i+1} ---")
    print(f"Input     : {input_text}")
    print(f"Generated : {generated}")
    print(f"Reference : {reference}")

Epoch 1: 100%|██████████| 5000/5000 [11:57<00:00,  6.97it/s]


Epoch 1 Loss: nan

--- Example 1 ---
Input     : The ex-Reading defender denied fraudulent trading charges relating to the Sodje Sports Foundation - a charity to raise money for Nigerian sport. Mr Sodje, 37, is jointly charged with elder brothers Efe, 44, Bright, 50 and Stephen, 42. Appearing
Generated : a new man has been a new-year-old-year-old-year-old-year-old-year-old-year-
Reference : Former Premier League footballer Sam Sodje has appeared in court alongside three brothers accused of charity fraud.

--- Example 2 ---
Input     : Voges was forced to retire hurt on 86 after suffering the injury while batting during the County Championship draw with Somerset on 4 June. Middlesex hope to have the Australian back for their T20 Blast game against Hampshire at Lord's on 3 August. The 37-year-old has scored 
Generated : a new man has been a new-year-old-year-old-year-old-year-old-year-old-year-
Reference : Middlesex batsman Adam Voges will be out until August after suffering a torn calf 