In [None]:
# Install required packages (uncomment if running in a fresh environment)
# %pip install -q transformers datasets evaluate accelerate sentencepiece sacrebleu rouge-score

In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# NOTE(review): `load_metric` is not used anywhere in this notebook and is no
# longer shipped by recent `datasets` releases — confirm against the installed
# version if this import fails.
from datasets import Dataset, load_metric
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# One seed shared by every RNG this notebook touches directly, so the
# train/validation split is repeatable across kernel restarts.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

# Root folder for checkpoints and saved models; a no-op if it already exists.
os.makedirs('processed', exist_ok=True)

In [None]:
# Load CSV and prepare text pairs
df = pd.read_csv('dataset/MTS-Dialog-TrainingSet.csv')
print('Rows:', len(df))


def _text_column(frame, preferred, fallback):
    """Return one column of `frame` as a list of strings.

    Uses `preferred` when that column exists, otherwise `fallback`.
    Missing values become empty strings so they can be filtered out later.
    """
    column = preferred if preferred in frame.columns else fallback
    return frame[column].fillna('').astype(str).tolist()


# Use cleaned columns if available, otherwise fall back to raw dialogue/section_text
inputs = _text_column(df, 'dialog_clean', 'dialogue')
targets = _text_column(df, 'section_text_clean', 'section_text')

# Quick sanity: drop pairs where either side is empty or whitespace-only
pairs = [(i, t) for i, t in zip(inputs, targets) if i.strip() and t.strip()]
print(f'Usable pairs: {len(pairs)}')
if not pairs:
    # zip(*[]) would otherwise fail below with an opaque unpacking ValueError.
    raise ValueError('No usable (dialogue, summary) pairs found; check the CSV columns and contents.')
inputs, targets = zip(*pairs)

# Train/val split (seeded so the same rows land in each split on every run)
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    list(inputs), list(targets), test_size=0.12, random_state=SEED
)
print('Train size:', len(train_inputs), 'Val size:', len(val_inputs))

raw_train = Dataset.from_dict({'dialogue': train_inputs, 'summary': train_targets})
raw_val = Dataset.from_dict({'dialogue': val_inputs, 'summary': val_targets})

In [None]:
# Token budgets applied when tokenizing dialogues (inputs) and summaries (targets);
# longer sequences are truncated to these lengths.
max_input_length = 512
max_target_length = 128

# Prefix used by T5 for summarization tasks
prefix = 'summarize: '

# Model & tokenizer selection
model_name = 't5-small'  # change to 't5-base' or other model if you have resources
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of dialogue/summary pairs for seq2seq training.

    Sequences are truncated but deliberately not padded here. Padding is left
    to DataCollatorForSeq2Seq, which pads each batch to its longest member and
    fills label padding with -100 so those positions are ignored by the loss.
    Padding labels to a fixed length with the pad token id (as this function
    used to) makes the model train on padding tokens.

    Args:
        examples: Batch mapping with 'dialogue' and 'summary' entries, each a
            list of strings (what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the prefixed dialogues, with an added
        'labels' entry holding the token ids of the summaries.
    """
    sources = [prefix + dialogue for dialogue in examples['dialogue']]
    model_inputs = tokenizer(sources, max_length=max_input_length, truncation=True)

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['summary'], max_length=max_target_length, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize both splits; the raw text columns are dropped so only the
# tokenizer outputs and labels remain.
_raw_text_columns = ['dialogue', 'summary']
train_dataset = raw_train.map(preprocess_function, batched=True, remove_columns=_raw_text_columns)
val_dataset = raw_val.map(preprocess_function, batched=True, remove_columns=_raw_text_columns)

for _split in (train_dataset, val_dataset):
    print(_split)

In [None]:
# ROUGE scorer consumed by compute_metrics during evaluation
import evaluate
rouge = evaluate.load('rouge')

# Collator that assembles tokenized examples into batches for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_pred):
    """Compute ROUGE-1/2/L F1 between generated and reference summaries.

    Args:
        eval_pred: (predictions, labels) pair of token-id arrays supplied by
            the trainer. `predictions` may be a tuple, in which case its first
            element is used.

    Returns:
        Dict mapping 'rouge1', 'rouge2' and 'rougeL' to F1 scores rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an ignore-index filler, not a vocabulary id, and cannot be
    # decoded. It appears in labels and can also appear in predictions when
    # generated batches of different lengths are padded together.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rouge1/rouge2/rougeL score the raw text directly; only rougeLsum would
    # need newline-separated sentences, and it is not requested here.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=['rouge1','rouge2','rougeL'])

    def _f1(score):
        # `evaluate` returns plain floats; the legacy metric API returned
        # aggregate objects exposing `.mid.fmeasure`. Accept both.
        return score.mid.fmeasure if hasattr(score, 'mid') else score

    return {k: round(float(_f1(v)), 4) for k, v in result.items()}

In [None]:
# Training arguments - tune to your hardware
training_args = Seq2SeqTrainingArguments(
    output_dir='processed/t5_summarization',
    # NOTE(review): newer transformers releases rename this to `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    # Without an explicit limit, evaluation-time generation falls back to the
    # model's default max length, which is far shorter than the reference
    # summaries and deflates validation ROUGE. Match the label length instead.
    generation_max_length=max_target_length,
    logging_strategy='steps',
    logging_steps=200,
    # Checkpointing follows the same schedule as evaluation so the best
    # checkpoint can be restored at the end of training.
    save_strategy='epoch',
    num_train_epochs=3,
    fp16=False,  # set True if you have a GPU with mixed precision
    seed=SEED,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='rouge1',  # key returned by compute_metrics
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training (uncomment to run)
# trainer.train()
# trainer.save_model('processed/t5_summarization_best')
# tokenizer.save_pretrained('processed/t5_summarization_best')

print('Trainer is ready. To launch training uncomment trainer.train()')

## Inference example
After training and saving the model, you can run the cell below to load the trained model and generate summaries for new dialogues.

In [None]:
# Load trained model and tokenizer (if saved)
# model = T5ForConditionalGeneration.from_pretrained('processed/t5_summarization_best')
# tokenizer = AutoTokenizer.from_pretrained('processed/t5_summarization_best')

def generate_summary(text, max_length=120, num_beams=4):
    """Generate a summary for a single dialogue with the current model.

    Uses the module-level `model`, `tokenizer`, `prefix` and
    `max_input_length`; requires `torch` to be imported at the top of the
    notebook.

    Args:
        text: Dialogue to summarize. The task prefix is prepended and the
            result is truncated to `max_input_length` tokens.
        max_length: Maximum length passed to `model.generate`.
        num_beams: Beam width passed to `model.generate`.

    Returns:
        The decoded summary string with special tokens removed.
    """
    encoded = tokenizer(
        prefix + text,
        return_tensors='pt',
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)  # keep inputs on the same device as the model

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(**encoded, max_length=max_length, num_beams=num_beams, early_stopping=True)

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Example (replace with a real dialogue)
# NOTE: 'dialog_clean' is only present if the CSV ships the cleaned column;
# the loading cell falls back to 'dialogue' otherwise, so use that column here
# in the same case.
# sample_dialog = df['dialog_clean'].iloc[0]
# print('INPUT:', sample_dialog)
# print('GENERATED SUMMARY:', generate_summary(sample_dialog))
print('Inference cell ready. Load the saved model and run generate_summary(...)')

## Tips & Next steps
- If dialogues are longer than the 512-token input limit used here, they are truncated; consider a model that supports longer inputs (LongT5, or LED — the Longformer Encoder-Decoder built on BART) or a hierarchical approach (chunk + summarize + merge).
- Use mixed precision (`fp16=True`) if your GPU supports it to speed up training.
- Monitor validation ROUGE and save best checkpoint; early stopping helps avoid overfitting.
- Optionally pre-train (continued pretraining) the LM on your corpus before fine-tuning for summarization.