# **Classification**

In [1]:
!pip install simpletransformers
!pip install ipywidgets --upgrade

import multiprocessing
multiprocessing.set_start_method('spawn', force=True)

import pandas as pd
import json
import sklearn
from simpletransformers.classification import ClassificationModel
from glob import glob
from tqdm import tqdm
import os

# Set environment variables
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['XLA_FLAGS'] = '--xla_cpu_multi_thread_eigen=false'

# Define a mapping for the labels to integers
label_mapping = {'phrase': 0, 'passage': 1, 'multi': 2}

def load_dataset(file_name):
    df = []
    with open('/kaggle/input/webis-clickbait-22/' + file_name) as f:
        for i in f:
            i = json.loads(i)
            tweet = i['postText']
            article_title = i['targetTitle']
            article = ' '.join(i['targetParagraphs'])
            label = i['tags']
            
            assert len(tweet) == 1
            tweet = tweet[0]
            
            assert len(label) == 1
            label = label[0]
            
            if label not in ['phrase', 'passage', 'multi']:
                print(label)
                
            assert label in ['phrase', 'passage', 'multi']
            
            # Add entries to the DataFrame with encoded labels
            df.append({
                'text': tweet + ' - ' + article_title + article,
                'labels': label_mapping[label]  # Encode the label as an integer
            })

    return pd.DataFrame(df)

# Load datasets
test_dataset = load_dataset('test.jsonl')
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')


print("Unique labels in training data:", train_dataset['labels'].unique())

# Define the single configuration
config = {
    "overwrite_output_dir": True,
    "num_train_epochs": 6,
    "fp16": False,
    "train_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "evaluate_during_training": True,
    "max_seq_length": 256,
    "learning_rate": 4e-05,
    "early_stopping_consider_epochs": True,
    "early_stopping_delta": 0.01,
    "early_stopping_metric": "acc",
    "early_stopping_metric_minimize": False,
    "early_stopping_patience": 3,
    "output_dir": "outputs/",
    "save_steps": -1,  # Disable intermediate checkpoints
    "save_best_model": True,  # Only save the best model
    "save_optimizer_and_scheduler": False,   # Save only the model weights
    "use_cuda": True,  # Enable CUDA
    "n_gpu": 2  # Use 2 GPUs for training
}

# Train the model
def train_model(config):
    model = ClassificationModel("roberta", "roberta-large", num_labels=len(label_mapping), args=config)
    model.train_model(train_dataset, eval_df=validation_dataset, acc=sklearn.metrics.accuracy_score)

# Run the training
print('Running the specified configuration')
train_model(config)

# Evaluate the model on validation and test sets
df_results = []
directory = "outputs/"

for checkpoint in tqdm(glob(directory + "checkpoint-*")):
    print(checkpoint)
    model = ClassificationModel("roberta", checkpoint, num_labels=len(label_mapping))
    
    valid_acc = model.eval_model(validation_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']
    test_acc = model.eval_model(test_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']
    
    df_results.append({"checkpoint": checkpoint, "valid_acc": valid_acc, "test_acc": test_acc, "config": config})

# Save results to a DataFrame
df_results = pd.DataFrame(df_results)

# Display top results based on validation accuracy
print(df_results[df_results['checkpoint'].str.contains('epoch')].sort_values('valid_acc', ascending=False))


Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.40.1-py2.py3-none-any.whl 

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Running Epoch 1 of 6:   0%|          | 0/400 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/1 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Running Epoch 2 of 6:   0%|          | 0/400 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/1 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Running Epoch 3 of 6:   0%|          | 0/400 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/1 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Running Epoch 4 of 6:   0%|          | 0/400 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/1 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Running Epoch 5 of 6:   0%|          | 0/400 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/1 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Running Epoch 6 of 6:   0%|          | 0/400 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/1 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  0%|          | 0/6 [00:00<?, ?it/s]

outputs/checkpoint-200-epoch-2


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
 17%|█▋        | 1/6 [01:03<05:19, 64.00s/it]

outputs/checkpoint-400-epoch-4


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
 33%|███▎      | 2/6 [02:06<04:13, 63.36s/it]

outputs/checkpoint-300-epoch-3


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
 50%|█████     | 3/6 [03:09<03:08, 62.98s/it]

outputs/checkpoint-100-epoch-1


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
 67%|██████▋   | 4/6 [04:19<02:11, 65.82s/it]

outputs/checkpoint-500-epoch-5


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
 83%|████████▎ | 5/6 [05:21<01:04, 64.55s/it]

outputs/checkpoint-600-epoch-6


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
100%|██████████| 6/6 [06:24<00:00, 64.13s/it]

                       checkpoint  valid_acc  test_acc  \
5  outputs/checkpoint-600-epoch-6    0.75250     0.746   
0  outputs/checkpoint-200-epoch-2    0.74875     0.736   
1  outputs/checkpoint-400-epoch-4    0.74750     0.741   
4  outputs/checkpoint-500-epoch-5    0.74750     0.735   
2  outputs/checkpoint-300-epoch-3    0.71000     0.736   
3  outputs/checkpoint-100-epoch-1    0.70375     0.706   

                                              config  
5  {'overwrite_output_dir': True, 'num_train_epoc...  
0  {'overwrite_output_dir': True, 'num_train_epoc...  
1  {'overwrite_output_dir': True, 'num_train_epoc...  
4  {'overwrite_output_dir': True, 'num_train_epoc...  
2  {'overwrite_output_dir': True, 'num_train_epoc...  
3  {'overwrite_output_dir': True, 'num_train_epoc...  





In [2]:
# Find the checkpoint with the highest test accuracy
checkpoint = df_results.loc[df_results['test_acc'].idxmax(), 'checkpoint']
print(f"Best checkpoint based on test accuracy: {checkpoint}")

print("Evaluating checkpoint:", checkpoint)
model = ClassificationModel("roberta", checkpoint, num_labels=len(label_mapping))

# Validation and test accuracy
valid_acc = model.eval_model(validation_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']
test_acc = model.eval_model(test_dataset, acc=sklearn.metrics.accuracy_score)[0]['acc']

# Function to collect predictions
def collect_predictions(model, dataset, label_mapping):
    # Reverse the label_mapping to decode predictions
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}

    # Perform predictions
    predictions, _ = model.predict(dataset['text'].tolist())

    # Map numerical predictions back to label names
    predicted_labels = [reverse_label_mapping[pred] for pred in predictions]
    original_labels = [reverse_label_mapping[label] for label in dataset['labels']]

    return predicted_labels, original_labels

# Collect predictions for the test set
predicted_labels, original_labels = collect_predictions(model, test_dataset, label_mapping)

# Print predicted and original labels
print("Predicted and Original Labels:")
for pred, orig in zip(predicted_labels, original_labels):
    print(f"Predicted: {pred}, Original: {orig}")

# Save results to a DataFrame if needed
df_predictions = pd.DataFrame({
    "Original Label": original_labels,
    "Predicted Label": predicted_labels
})

df_predictions.to_csv("outputs/checkpoint_predictions.csv", index=False)

# Output validation and test accuracy
print(f"Validation Accuracy: {valid_acc}")
print(f"Test Accuracy: {test_acc}")


Best checkpoint based on test accuracy: outputs/checkpoint-600-epoch-6
Evaluating checkpoint: outputs/checkpoint-600-epoch-6


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Predicted and Original Labels:
Predicted: phrase, Original: phrase
Predicted: phrase, Original: phrase
Predicted: passage, Original: passage
Predicted: phrase, Original: multi
Predicted: phrase, Original: phrase
Predicted: multi, Original: passage
Predicted: passage, Original: passage
Predicted: passage, Original: phrase
Predicted: passage, Original: passage
Predicted: multi, Original: multi
Predicted: passage, Original: phrase
Predicted: passage, Original: passage
Predicted: phrase, Original: phrase
Predicted: passage, Original: phrase
Predicted: phrase, Original: phrase
Predicted: passage, Original: passage
Predicted: multi, Original: multi
Predicted: passage, Original: passage
Predicted: passage, Original: passage
Predicted: phrase, Original: phrase
Predicted: phrase, Original: phrase
Predicted: phrase, Original: phrase
Predicted: passage, Original: passage
Predicted: phrase, Original: phrase
Predicted: phrase, Original: phrase
Predicted: passage, Original: phrase
Predicted: multi, 

# **T5-Base** 

In [3]:
!pip install evaluate
!pip install simpletransformers
!pip install ipywidgets --upgrade

import multiprocessing
multiprocessing.set_start_method('spawn', force=True)

from tqdm.auto import tqdm
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

import os
os.environ["WANDB_MODE"] = "disabled"

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [4]:
# Load datasets from specified paths
dataset = load_dataset(
    'json', 
    data_files={
        'train': '/kaggle/input/webis-clickbait-22/train.jsonl',
        'validation': '/kaggle/input/webis-clickbait-22/validation.jsonl',
        'test': '/kaggle/input/webis-clickbait-22/test.jsonl'
    }
)

train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
from simpletransformers.classification import ClassificationModel
import pandas as pd
import torch

import pandas as pd

# # Path to the predictions CSV file
# predictions_file = '/kaggle/input/predictions/checkpoint-3_predictions.csv'

# # Read predictions from the CSV file
# predictions_df = pd.read_csv(predictions_file)

# # Convert the predictions to the correct JSON format
# predictions = predictions_df["Predicted Label"].tolist()

# Update the dataset using map()
test_dataset = test_dataset.map(
    lambda example, idx: {"tags": predicted_labels[idx]},
    with_indices=True
)

print("Updated tags format:")
print(test_dataset[:1])


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Updated tags format:
{'uuid': ['2f3e30d7-972c-4812-b727-6b582de52137'], 'postId': ['420585068076101633'], 'postText': [['This simple household item saves lives']], 'postPlatform': ['Twitter'], 'targetParagraphs': [["Erin Zaikis often used to think about how she could help change the world, but it wasn't until she got very sick with dengue fever that she decided to take action.", "In high school, Zaikis' mother was diagnosed with breast cancer, and the teen slipped into depression -- trapped in what she described as a cycle of feeling sorry for herself. In college, she took a course in global poverty, which helped put her own struggles in perspective. And at 19, the Boston native traveled to Mumbai, where she spent a summer living in an orphanage that housed around 100 girls, some of whom had been left in trash cans or abandoned in train stations.", '"I\'m from a middle-, upper-class town and had never been exposed to extreme poverty, or starvation, or issues like child trafficking," Za

In [6]:
model_checkpoint = "t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



In [7]:
def tokenize_function(examples):
    # Create input text by formatting each example
    inputs = [
        f"question: {pt} spoiler type: {tag} context: {tt} - {tp}"
        for tag, pt, tt, tp in zip(examples['tags'], examples['postText'], examples['targetTitle'], examples['targetParagraphs'])
    ]

    # Ensure 'spoiler' is a string; handle multi-part spoilers by joining if they are lists
    labels = [
        spoiler if isinstance(spoiler, str) else " ".join(spoiler)
        for spoiler in examples['spoiler']
    ]

    # Tokenize inputs and labels with consistent padding and truncation
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    label_encodings = tokenizer(labels, max_length=128, truncation=True, padding="max_length")

    # Add labels to model inputs
    model_inputs["labels"] = label_encodings["input_ids"]

    return model_inputs

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finedtuned-qa", 
    evaluation_strategy = "epoch", 
    learning_rate = 2e-3, 
    per_device_train_batch_size = batch_size, 
    per_device_eval_batch_size = batch_size, 
    weight_decay = 0.01, 
    save_strategy="no", 
    num_train_epochs = 3, 
    predict_with_generate = True, 
    push_to_hub = False
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



In [9]:
trainer = Seq2SeqTrainer(
    model, 
    args, 
    train_dataset = train_dataset, 
    eval_dataset = val_dataset, 
    data_collator = data_collator, 
    tokenizer = tokenizer, 
)

trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.233888
2,No log,0.223551
3,0.226900,0.241845


TrainOutput(global_step=600, training_loss=0.20461056709289552, metrics={'train_runtime': 1041.4006, 'train_samples_per_second': 9.218, 'train_steps_per_second': 0.576, 'total_flos': 5845995749376000.0, 'train_loss': 0.20461056709289552, 'epoch': 3.0})

In [10]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch
import json

# Function for generating predictions and calculating test loss
def test_model(model, dataloader):
    model.eval()
    predictions, references = [], []
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Generating Predictions and Calculating Test Loss", leave=False)
    with torch.no_grad():
        for batch in progress_bar:
            # Move batch to GPU if available
            inputs = {key: value.to('cuda' if torch.cuda.is_available() else 'cpu') for key, value in batch.items()}
            
            # Calculate outputs and loss
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            
            # Generate predictions
            output_sequences = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            preds = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
            refs = tokenizer.batch_decode(inputs['labels'], skip_special_tokens=True)
            
            # Store predictions and references
            predictions.extend(preds)
            references.extend(refs)
    
    # Calculate average test loss
    avg_test_loss = total_loss / len(dataloader)
    print(f"Test Loss: {avg_test_loss}")
    return predictions, references, avg_test_loss

# Prepare the test dataloader
test_loader = DataLoader(test_dataset, batch_size=8)

# Generate predictions and references on the test set, and calculate test loss
predictions, references, test_loss = test_model(model, test_loader)

for i in range(len(predictions)):
    print(f"Prediction: {predictions[i]}")
    print(f"Reference: {references[i]}")
    print() 

output_data = {
    "predictions": predictions,
    "references": references
}
with open('/kaggle/working/predictions_references.json', 'w') as f:
    json.dump(output_data, f)
print("Saved predictions, references to /kaggle/working/predictions_references.json")

Generating Predictions and Calculating Test Loss:   0%|          | 0/125 [00:00<?, ?it/s]



Test Loss: 0.20288528645038606
Prediction: pink and blue soap
Reference: soap

Prediction: Gwyneth Paltrow
Reference: Gwyneth Paltrow

Prediction: they're actually just filming a video for her next album
Reference: filming the next Nicholas Sparks film

Prediction: JaVale McGee
Reference: JaVale McGee in Compton

Prediction: Cora
Reference: Alex Owens-Sarno

Prediction: long after it ends, our first love maintains some power over us. A haunting,
Reference: Your first experience of something is going to be well remembered, more than later experiences

Prediction: SHOCKIN...
Reference: Studies have found that where we perceive bacteria thriving in bathrooms can be completely backwards, and what we use to keep us clean could actually be spreading the problem around!

Prediction: chickenwaffles madechicken
Reference: chickenwaffles

Prediction: "I think the truth will prevail, whatever the process may be, however painful it may be
Reference: Speaking towards the end of the Tata Chemicals s

In [11]:
!pip install bert_score nltk
!pip install evaluate

import evaluate

import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from bert_score import score

# Calculate BERTScore
def calculate_bertscore(predictions, references, lang="en"):
    P, R, F1 = score(predictions, references, lang=lang)
    avg_bertscore = {
        "precision": P.mean().item(),
        "recall": R.mean().item(),
        "f1": F1.mean().item()
    }
    return avg_bertscore

# Function to calculate BLEU-4
def calculate_bleu(predictions, references):
    smoothie = SmoothingFunction().method4  # To handle cases where n-grams may not be present
    bleu_score = corpus_bleu(references, predictions, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
    return bleu_score

# Function to calculate METEOR
def calculate_meteor(predictions, references):
    # Load METEOR from Hugging Face's evaluate library
    meteor = evaluate.load("meteor")
    # Evaluate with meteor by passing predictions and references
    meteor_result = meteor.compute(predictions=predictions, references=[[ref] for ref in references])
    avg_meteor_score = meteor_result["meteor"]
    return avg_meteor_score


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [12]:
# Calculate BERTScore
bertscore_results = calculate_bertscore(predictions, references)

# Calculate BLEU-4
bleu_score = calculate_bleu(predictions, references)

# Calculate METEOR
# meteor_score = calculate_meteor(predictions, references)

# Display all results
print(f"TestLoss: {test_loss}")
print(f"BERTScore: {bertscore_results}")
print(f"BLEU-4 Score: {bleu_score}")
# print(f"METEOR Score: {meteor_score}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TestLoss: 0.20288528645038606
BERTScore: {'precision': 0.8893628716468811, 'recall': 0.8777963519096375, 'f1': 0.8830023407936096}
BLEU-4 Score: 0.3070390288132313


# **Post-Hoc Ensemble**

In [13]:
from transformers import set_seed

# Post-Hoc Ensembling (Section 5.2)
ensemble_seeds = [13, 45, 99, 137, 2023]
all_predictions = []

for seed in ensemble_seeds:
    set_seed(seed)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to('cuda' if torch.cuda.is_available() else 'cpu')
    
    args = Seq2SeqTrainingArguments(
        f"{model_name}-finedtuned-qa", 
        evaluation_strategy = "epoch", 
        learning_rate = 2e-3, 
        per_device_train_batch_size = batch_size, 
        per_device_eval_batch_size = batch_size, 
        weight_decay = 0.01, 
        save_strategy="no", 
        num_train_epochs = 3, 
        predict_with_generate = True, 
        push_to_hub = False, 
        seed = seed
    )
    
    # Define Trainer with updated seed and reinitialize model
    trainer = Seq2SeqTrainer(
        model, 
        args, 
        train_dataset = train_dataset, 
        eval_dataset = val_dataset, 
        data_collator = data_collator, 
        tokenizer = tokenizer, 
    )
    
    # Train and generate predictions for each seed
    trainer.train()
    predictions, _, test_loss = test_model(model, test_loader)
    print(f"Test Loss: {test_loss}")
    all_predictions.append(predictions)

print("Ensembling complete")

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.244652
2,No log,0.228539
3,0.228700,0.243971


Generating Predictions and Calculating Test Loss:   0%|          | 0/125 [00:00<?, ?it/s]



Test Loss: 0.20442403507232665
Test Loss: 0.20442403507232665


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.240001
2,No log,0.228305
3,0.237900,0.245511


Generating Predictions and Calculating Test Loss:   0%|          | 0/125 [00:00<?, ?it/s]



Test Loss: 0.2073460247218609
Test Loss: 0.2073460247218609


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.236434
2,No log,0.226798
3,0.227800,0.242178


Generating Predictions and Calculating Test Loss:   0%|          | 0/125 [00:00<?, ?it/s]



Test Loss: 0.20018573880195617
Test Loss: 0.20018573880195617


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.244277
2,No log,0.231704
3,0.233600,0.240898


Generating Predictions and Calculating Test Loss:   0%|          | 0/125 [00:00<?, ?it/s]



Test Loss: 0.19755844765901565
Test Loss: 0.19755844765901565


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.240886
2,No log,0.225671
3,0.231600,0.239641


Generating Predictions and Calculating Test Loss:   0%|          | 0/125 [00:00<?, ?it/s]



Test Loss: 0.20104616290330887
Test Loss: 0.20104616290330887
Ensembling complete


In [14]:
from nltk import edit_distance
import numpy as np

# Post-hoc model ensembling using edit distance
ensemble_outputs = []

for i in range(len(test_dataset)):
    # Gather the i-th prediction from each model in the ensemble
    candidate_outputs = [pred[i] for pred in all_predictions]
    
    # Compute cumulative edit distances for each candidate
    total_edit_distances = []
    for j, output_j in enumerate(candidate_outputs):
        edit_distance_sum = sum(edit_distance(output_j, output_k) for k, output_k in enumerate(candidate_outputs) if j != k)
        total_edit_distances.append(edit_distance_sum)
    
    # Choose the candidate with the smallest total edit distance
    best_output = candidate_outputs[np.argmin(total_edit_distances)]
    ensemble_outputs.append(best_output)

# Save ensemble outputs to a JSON file
output_data = {"ensemble_predictions": ensemble_outputs}
with open('/kaggle/working/ensemble_predictions.json', 'w') as f:
    json.dump(output_data, f)
print("Saved ensemble predictions to ensemble_predictions.json")

Saved ensemble predictions to ensemble_predictions.json


In [15]:
# Function to generate only references
def get_references(dataloader):
    references = []
    progress_bar = tqdm(dataloader, desc="Extracting References", leave=False)
    with torch.no_grad():
        for batch in progress_bar:
            # Move batch to GPU if available
            inputs = {key: value.to('cuda' if torch.cuda.is_available() else 'cpu') for key, value in batch.items()}
            
            # Decode labels to get references
            refs = tokenizer.batch_decode(inputs['labels'], skip_special_tokens=True)
            
            # Store references
            references.extend(refs)
    
    return references

# Usage
references = get_references(test_loader)

predictions = ensemble_outputs

# Calculate BERTScore
bertscore_results = calculate_bertscore(predictions, references)

# Calculate BLEU-4
bleu_score = calculate_bleu(predictions, references)

# Calculate METEOR
# meteor_score = calculate_meteor(predictions, references)

# Display all results
print(f"TestLoss: {test_loss}")
print(f"BERTScore: {bertscore_results}")
print(f"BLEU-4 Score: {bleu_score}")
# print(f"METEOR Score: {meteor_score}")

Extracting References:   0%|          | 0/125 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TestLoss: 0.20104616290330887
BERTScore: {'precision': 0.890281617641449, 'recall': 0.8794229626655579, 'f1': 0.8842452168464661}
BLEU-4 Score: 0.3099923793616025
