In [3]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
import torch
from glob import glob
import os
import json
from transformers import T5Tokenizer

In [4]:
# Check if GPU is available
if not torch.cuda.is_available():
    print("GPU not available. Make sure to configure a compatible GPU environment.")
else:
    print("GPU is available!")


GPU is available!


In [5]:
# Set up data directory and file paths
data_dir = "/kaggle/input/vt-ssum-mainn/VT-SSum-main/test"
all_files = glob(os.path.join(data_dir, '*.json'))

# Load the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to("cuda")

In [6]:
# Function to preprocess each file
def preprocess_file(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    segments = [" ".join(segment) for segment in data.get("segmentation", [])]
    input_text = " ".join(segments)
    summaries = []
    summarization_data = data.get("summarization", {})
    for clip_key, clip_data in summarization_data.items():
        if clip_data.get("is_summarization_sample"):
            summary = " ".join([sent["sent"] for sent in clip_data["summarization_data"] if sent["label"] == 1])
            if summary:
                summaries.append({"input_text": input_text, "summary_text": summary})
    return summaries


In [7]:
# Preprocess all files and create the dataset
all_data = []
for file in all_files:
    all_data.extend(preprocess_file(file))

In [8]:
# Convert to HuggingFace Dataset
dataset = Dataset.from_dict({
    "input_text": [item["input_text"] for item in all_data],
    "summary_text": [item["summary_text"] for item in all_data]
})

In [9]:
# Split into training and evaluation datasets
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [10]:
# Function to tokenize the data
def tokenize_data(example):
    input_encodings = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=512)
    target_encodings = tokenizer(example["summary_text"], truncation=True, padding="max_length", max_length=128)

    # Set labels as target input_ids and replace padding tokens with -100 for ignored tokens
    labels = target_encodings["input_ids"]
    labels = [-100 if token == tokenizer.pad_token_id else token for token in labels]

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": labels
    }

In [11]:
# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_data, batched=True)
eval_dataset = eval_dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/10344 [00:00<?, ? examples/s]

Map:   0%|          | 0/2587 [00:00<?, ? examples/s]

In [12]:
# Verify the column names before removing any columns
print("Train Dataset Columns:", train_dataset.column_names)
print("Eval Dataset Columns:", eval_dataset.column_names)

Train Dataset Columns: ['input_text', 'summary_text', 'input_ids', 'attention_mask', 'labels']
Eval Dataset Columns: ['input_text', 'summary_text', 'input_ids', 'attention_mask', 'labels']


In [13]:
# Remove unnecessary columns while keeping required ones
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in ["input_ids", "attention_mask", "labels"]])
eval_dataset = eval_dataset.remove_columns([col for col in eval_dataset.column_names if col not in ["input_ids", "attention_mask", "labels"]])


In [14]:
# Set the dataset format to PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [15]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training if supported by GPU
    logging_dir="./logs",
    save_steps=500,
    eval_steps=500,
    gradient_accumulation_steps=4,
    load_best_model_at_end=True,
    report_to="none"  # Prevents unnecessary logging to external platforms
)


In [17]:
# Add an early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[early_stopping]
)


In [18]:
# Train the model
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss
500,1.4979,1.155013
1000,1.1859,1.145362
1500,1.1682,1.142518
2000,1.1554,1.140511


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2415, training_loss=1.2342984343661039, metrics={'train_runtime': 15021.5569, 'train_samples_per_second': 10.329, 'train_steps_per_second': 0.161, 'total_flos': 9.405232828121088e+16, 'train_loss': 1.2342984343661039, 'epoch': 14.930448222565687})

In [19]:

# Evaluate the model
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation Metrics: {'eval_loss': 1.1405110359191895, 'eval_runtime': 97.8893, 'eval_samples_per_second': 26.428, 'eval_steps_per_second': 1.655, 'epoch': 14.930448222565687}


In [None]:
import os
import json
from glob import glob
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer

# Define paths and load model/tokenizer
test_data_dir = "/kaggle/input/vt-ssum-mainn/VT-SSum-main/test"
model_path = "/kaggle/working/results/checkpoint-2415"  # Replace with actual model path if needed

# Load the model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained(model_path).to("cuda")

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Function to evaluate each JSON file
def evaluate_json_file(filepath):
    with open(filepath, 'r') as f:
        data = json.load(f)

    # Extract input and target summaries
    segments = [" ".join(segment) for segment in data.get("segmentation", [])]
    input_text = " ".join(segments)

    summarization_data = data.get("summarization", {})
    target_summaries = [
        " ".join([sent["sent"] for sent in clip_data["summarization_data"] if sent["label"] == 1])
        for clip_key, clip_data in summarization_data.items() if clip_data.get("is_summarization_sample")
    ]

    # Skip files with no valid target summaries
    if not target_summaries:
        return []

    # Generate model summaries and compute ROUGE scores
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length").to("cuda")
    summary_ids = model.generate(inputs["input_ids"], max_length=128, num_beams=4, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE for each target summary
    rouge_scores = []
    for target_summary in target_summaries:
        scores = scorer.score(target_summary, generated_summary)
        rouge_scores.append(scores)

    return rouge_scores

# Evaluate all files and aggregate results
all_rouge_scores = []
test_files = glob(os.path.join(test_data_dir, '*.json'))

for test_file in test_files:
    file_scores = evaluate_json_file(test_file)
    all_rouge_scores.extend(file_scores)

# Compute average ROUGE scores
average_scores = {}
for rouge_type in ['rouge1', 'rouge2', 'rougeL']:
    average_scores[rouge_type] = {
        "precision": sum(score[rouge_type].precision for score in all_rouge_scores) / len(all_rouge_scores),
        "recall": sum(score[rouge_type].recall for score in all_rouge_scores) / len(all_rouge_scores),
        "fmeasure": sum(score[rouge_type].fmeasure for score in all_rouge_scores) / len(all_rouge_scores),
    }

print("Average ROUGE Scores:", average_scores)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]