# Phase 3 - Model Training and Evaluation

Goals for this phase:

1. Split the dataset into training, validation, and test sets.
2. Train the model on the training set and monitor its performance on the validation set. 
3. Evaluate the model on the test set to get a final estimate of its performance.

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split#, cross_val_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the news-summarization corpus into a DataFrame (allow roughly 30 seconds for this)
df = pd.read_csv(filepath_or_buffer='data/news_summarization.csv')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [3]:
# Define the function for evaluating the models
def evaluate_model(model_name, tokenizer, model, test_df):
    """Summarise every test article with ``model`` and score the summaries.

    Args:
        model_name: Label stored under the 'Model' key of the returned dict.
        tokenizer: Object exposing ``encode`` and ``decode`` that matches ``model``.
        model: Object exposing ``generate``.
        test_df: DataFrame with a 'Content' column (articles) and a 'Summary'
            column (reference summaries), aligned row by row.

    Returns:
        dict with the keys 'Model', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BLEU' and
        'METEOR'. The ROUGE values are F-measures averaged over the test
        examples, BLEU is computed over the whole corpus, and METEOR is the
        mean of the per-example scores.

    Raises:
        ValueError: If ``test_df`` contains no rows.
    """
    reference_texts = []
    predicted_texts = []

    # Walk articles and reference summaries in lockstep. Looking the summary up
    # by filtering the frame on the article text would be quadratic and would
    # return the wrong row whenever two articles share the same text.
    for text, reference in zip(test_df['Content'], test_df['Summary']):
        # Generate summary
        input_ids = tokenizer.encode(text, return_tensors='pt', truncation=True, padding=True)
        summary_ids = model.generate(input_ids, num_beams=4, max_length=50, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        reference_texts.append(reference)
        predicted_texts.append(summary)

    if not predicted_texts:
        raise ValueError('test_df is empty: there is nothing to evaluate')

    example_count = len(predicted_texts)

    # Compute ROUGE scores. RougeScorer.score works on one (target, prediction)
    # pair of raw strings, so score each example and average the F-measures.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_totals = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    for reference, prediction in zip(reference_texts, predicted_texts):
        pair_scores = scorer.score(reference, prediction)
        for rouge_name in rouge_totals:
            rouge_totals[rouge_name] += pair_scores[rouge_name].fmeasure

    # BLEU and METEOR operate on whitespace-tokenised text. Each hypothesis has
    # exactly one reference, hence the single-element inner list.
    tokenized_references = [[reference.split()] for reference in reference_texts]
    tokenized_predictions = [prediction.split() for prediction in predicted_texts]

    # Compute BLEU score
    bleu_score = corpus_bleu(tokenized_references, tokenized_predictions)

    # Compute METEOR score
    meteor_scores = [
        meteor_score(refs, pred)
        for refs, pred in zip(tokenized_references, tokenized_predictions)
    ]

    # Create a dictionary of scores for this model
    scores_dict = {
        'Model': model_name,
        'ROUGE-1': rouge_totals['rouge1'] / example_count,
        'ROUGE-2': rouge_totals['rouge2'] / example_count,
        'ROUGE-L': rouge_totals['rougeL'] / example_count,
        'BLEU': bleu_score,
        'METEOR': sum(meteor_scores) / example_count
    }

    return scores_dict

In [6]:
# Define the models to evaluate.
# Checkpoints are given by their full Hugging Face Hub id. The BART and Pegasus
# checkpoints live under the 'facebook/' and 'google/' namespaces; their bare
# names do not resolve with from_pretrained.
models = [
    't5-base',
    't5-3b',
    'facebook/bart-large-cnn',
    'facebook/bart-large-xsum',
    'google/pegasus-large',
    'microsoft/prophetnet-large-uncased',
    'google/mt5-large',
]
model_scores = []

# Evaluate each model
for model_name in tqdm(models):
    print(f'Evaluating {model_name}...')

    # T5 tokenizers get an explicit model_max_length so that truncation=True
    # cuts inputs at 512 tokens. The substring test also matches
    # 'google/mt5-large', which therefore receives the same limit.
    if 't5' in model_name:
        tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Collect one score dictionary per model; they are turned into a table later.
    scores_dict = evaluate_model(model_name, tokenizer, model, test_df)
    model_scores.append(scores_dict)

Evaluating t5-base...


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


KeyboardInterrupt: 



In [None]:
# Build a score table with one row per evaluated model, indexed by model name
results_df = pd.DataFrame(model_scores).set_index('Model')

# Show the table
print(results_df)

In [None]:
# # Define the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained('MODEL_NAME')
# model = AutoModelForSeq2SeqLM.from_pretrained('MODEL_NAME')

In [None]:
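# # Define the function for cross-validation training
# # NOTE: kept disabled - scikit-learn's cross_val_score expects an estimator that implements
# # fit(), which a Hugging Face seq2seq model does not, so this sketch cannot run as written.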
# def cross_validate_model(model, tokenizer, train_df):
#     # Prepare the data for training
#     train_encodings = tokenizer(train_df['Content'].tolist(), truncation=True, padding=True)
#     train_labels = tokenizer(train_df['Summary'].tolist(), truncation=True, padding=True)
#     train_encodings.pop("attention_mask")
#     train_encodings["labels"] = train_labels["input_ids"]
    
#     # Train the model using cross-validation
#     cv_scores = cross_val_score(model, train_encodings, cv=5)
    
#     # Print the cross-validation scores
#     print("Cross-validation scores:", cv_scores)
#     print("Average score:", sum(cv_scores) / len(cv_scores))

In [None]:
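# # Call the function for cross-validation training with each model
# # NOTE: of the checkpoints listed below only t5-base is an encoder-decoder model; the BERT,
# # GPT-J and GPT-Neo checkpoints cannot be loaded through AutoModelForSeq2SeqLM.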
# models = ['bert-base-uncased', 't5-base', 'EleutherAI/gpt-j-6B', 'EleutherAI/gpt-neo-1.3B']
# for model_name in models:
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
#     print(f"Model: {model_name}")
#     cross_validate_model(model, tokenizer, train_df)