In [1]:
!pip install transformers datasets matplotlib seaborn plotly nltk



In [2]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, T5Tokenizer, T5ForConditionalGeneration,
                          GPT2Tokenizer, GPT2LMHeadModel, DataCollatorWithPadding)
import torch

In [3]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize components
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
def clean_text(text):
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    return text

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

def preprocess_example(example):
    example['cleaned_question'] = preprocess_text(clean_text(example['question']))
    example['cleaned_answer'] = preprocess_text(clean_text(example['answer']))
    example['labels'] = 0  # Dummy labels; adjust based on your task
    return example

In [5]:
# Load the dataset
dataset = load_dataset("toughdata/quora-question-answer-dataset")

# Inspect dataset splits
print("Available dataset splits:", dataset.keys())

# Print dataset structure for each split
for split in dataset.keys():
    print(f"\nStructure of '{split}' split:")
    print(dataset[split].features)
    print("\nSample entry from", split, "dataset:")
    print(dataset[split][0])

Downloading readme:   0%|          | 0.00/485 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56402 [00:00<?, ? examples/s]

Available dataset splits: dict_keys(['train'])

Structure of 'train' split:
{'question': Value(dtype='string', id=None), 'answer': Value(dtype='string', id=None)}

Sample entry from train dataset:
{'question': 'Why whenever I get in the shower my girlfriend want to join?', 'answer': 'Isn’t it awful? You would swear that there wasn’t enough hot water to go around!\n'}


In [6]:
def preprocess_dataset(dataset):
    # Access individual datasets
    if 'train' in dataset:
        train_dataset = dataset['train']
    else:
        train_dataset = dataset['validation']  # Adjust as needed based on available splits

    # Check if there is a separate test split
    if 'test' in dataset:
        test_dataset = dataset['test']
    else:
        # If no separate test split, use a portion of the train dataset for testing
        test_size = 0.1
        train_test_split = train_dataset.train_test_split(test_size=test_size)
        train_dataset = train_test_split['train']
        test_dataset = train_test_split['test']

    # Print the number of rows before preprocessing
    print(f"Number of rows in 'train' split before preprocessing: {len(train_dataset)}")
    print(f"Number of rows in 'test' split before preprocessing: {len(test_dataset)}")

    # Remove duplicates by converting to a pandas DataFrame and then back
    train_df = train_dataset.to_pandas()
    train_df = train_df.drop_duplicates(subset=['question', 'answer'])
    train_dataset = train_dataset.from_pandas(train_df)

    # Handle missing values
    train_dataset = train_dataset.filter(lambda x: x['question'] and x['answer'])
    test_dataset = test_dataset.filter(lambda x: x['question'] and x['answer'])

    # Apply preprocessing function
    train_dataset = train_dataset.map(preprocess_example)
    test_dataset = test_dataset.map(preprocess_example)

      # Print the number of rows after preprocessing
    print(f"Number of rows in 'train' split after preprocessing: {len(train_dataset)}")
    print(f"Number of rows in 'test' split after preprocessing: {len(test_dataset)}")

    return train_dataset, test_dataset

# Apply preprocessing to the dataset
preprocessed_train_dataset, preprocessed_test_dataset = preprocess_dataset(dataset)

Number of rows in 'train' split before preprocessing: 50761
Number of rows in 'test' split before preprocessing: 5641


Filter:   0%|          | 0/49727 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5641 [00:00<?, ? examples/s]

Map:   0%|          | 0/49727 [00:00<?, ? examples/s]

Map:   0%|          | 0/5641 [00:00<?, ? examples/s]

Number of rows in 'train' split after preprocessing: 49727
Number of rows in 'test' split after preprocessing: 5641


In [7]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples['cleaned_question'],
        examples['cleaned_answer'],
        padding='max_length',
        truncation=True,
        max_length=256,  # Reduce max_length to decrease memory usage
        return_tensors='pt'
    )

# Tokenize the datasets
tokenized_train_dataset = preprocessed_train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = preprocessed_test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/49727 [00:00<?, ? examples/s]

Map:   0%|          | 0/5641 [00:00<?, ? examples/s]

In [14]:
from transformers import DataCollatorWithPadding, EarlyStoppingCallback

# Use DataCollatorWithPadding to handle padding dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training arguments
training_args_bert = TrainingArguments(
    output_dir='./results_bert',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,  # Number of epochs
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=2,
    logging_steps=100,
    save_steps=None,
    load_best_model_at_end=True,
    resume_from_checkpoint=False,
    report_to=[],
)



In [16]:
# Initialize BERT model
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Trainer setup
trainer_bert = Trainer(
    model=bert_model,
    args=training_args_bert,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
)

# Train and evaluate the model
trainer_bert.train()
eval_results = trainer_bert.evaluate()

# Print evaluation results
print("Evaluation results:", eval_results)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0,5e-06
2,0.0,2e-06


Epoch,Training Loss,Validation Loss
1,0.0,5e-06
2,0.0,2e-06
3,0.0,1e-06
4,0.0,1e-06


Evaluation results: {'eval_loss': 9.546253068037913e-07, 'eval_runtime': 21.511, 'eval_samples_per_second': 262.237, 'eval_steps_per_second': 16.41, 'epoch': 4.0}


T5

In [17]:
#Initialize T5 tokenizer and model
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Tokenization function for T5
def t5_tokenize_function(examples):
    inputs = [f"question: {q} context: {a}" for q, a in zip(examples['cleaned_question'], examples['cleaned_answer'])]
    targets = examples['cleaned_answer']

    model_inputs = t5_tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = t5_tokenizer(targets, max_length=512, padding="max_length", truncation=True).input_ids

    # Replace padding token id's in labels with -100 for loss calculation
    labels = [[-100 if token == t5_tokenizer.pad_token_id else token for token in label] for label in labels]

    model_inputs['labels'] = labels
    return model_inputs

# Tokenize the datasets for T5
tokenized_t5_train_dataset = preprocessed_train_dataset.map(t5_tokenize_function, batched=True)
tokenized_t5_test_dataset = preprocessed_test_dataset.map(t5_tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/49727 [00:00<?, ? examples/s]

Map:   0%|          | 0/5641 [00:00<?, ? examples/s]

In [21]:
# Before tokenization
print(f"Original training dataset size: {len(preprocessed_train_dataset)}")
print(f"Original test dataset size: {len(preprocessed_test_dataset)}")


print(f"Tokenized training dataset size: {len(tokenized_t5_train_dataset)}")
print(f"Tokenized test dataset size: {len(tokenized_t5_test_dataset)}")

Original training dataset size: 49727
Original test dataset size: 5641
Tokenized training dataset size: 49727
Tokenized test dataset size: 5641


In [28]:
from transformers import DataCollatorForSeq2Seq, get_linear_schedule_with_warmup, EarlyStoppingCallback
from torch.optim import AdamW

# Training arguments for T5
training_args_t5 = TrainingArguments(
    output_dir='./results_t5',
    eval_strategy="epoch",
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduce batch size to fit into RAM
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=4,  # Accumulate gradients over multiple steps
    logging_steps=100,  # Reduce logging frequency
    save_total_limit=2,  # Limit the total number of checkpoints
    load_best_model_at_end=True,  # Load the best model at the end
    resume_from_checkpoint=True,  # Resume from the last checkpoint
)

# Initialize T5 model
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Optimizer and scheduler setup
optimizer = AdamW(t5_model.parameters(), lr=2e-5)
num_training_steps = len(tokenized_t5_train_dataset) // (training_args_t5.per_device_train_batch_size * training_args_t5.gradient_accumulation_steps) * training_args_t5.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Initialize Trainer
trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=tokenized_t5_train_dataset,
    eval_dataset=tokenized_t5_test_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train and evaluate the model
trainer_t5.train()
eval_results_t5 = trainer_t5.evaluate()

# Print evaluation results
print("Evaluation results for T5:", eval_results_t5)

Epoch,Training Loss,Validation Loss
1,0.101,0.03587
2,0.0701,0.029657
3,0.0688,0.027546
4,0.0591,0.026875


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Evaluation results for T5: {'eval_loss': 0.026874614879488945, 'eval_runtime': 96.483, 'eval_samples_per_second': 58.466, 'eval_steps_per_second': 7.317, 'epoch': 4.0}


GPT2

In [31]:
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

# Tokenization function for GPT-2
def gpt_tokenize_function(examples):
    # Prepare inputs and labels
    inputs = [f"Question: {q} Answer: {a}" for q, a in zip(examples['cleaned_question'], examples['cleaned_answer'])]

    # Tokenize inputs
    model_inputs = gpt_tokenizer(inputs, max_length=512, padding="max_length", truncation=True)

    # Set labels to be the same as input_ids
    model_inputs['labels'] = model_inputs['input_ids'].copy()

    return model_inputs

# Tokenize the datasets for GPT-2
tokenized_gpt_train_dataset = preprocessed_train_dataset.map(gpt_tokenize_function, batched=True)
tokenized_gpt_test_dataset = preprocessed_test_dataset.map(gpt_tokenize_function, batched=True)


Map:   0%|          | 0/49727 [00:00<?, ? examples/s]

Map:   0%|          | 0/5641 [00:00<?, ? examples/s]

In [63]:
from transformers import DataCollatorForLanguageModeling

# Initialize GPT-2 model
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Use DataCollatorForLanguageModeling to handle padding dynamically for GPT-2
data_collator = DataCollatorForLanguageModeling(tokenizer=gpt_tokenizer, mlm=False)

# Training arguments for GPT-2
training_args_gpt = TrainingArguments(
    output_dir='./results_gpt',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=8,
    logging_steps=100,
    save_steps=None,
    load_best_model_at_end=True,
    resume_from_checkpoint=True,  # Resume from the last checkpoint if available
)

# Optimizer and scheduler setup
optimizer = AdamW(gpt_model.parameters(), lr=2e-5)
num_training_steps = len(tokenized_gpt_train_dataset) // (training_args_gpt.per_device_train_batch_size * training_args_gpt.gradient_accumulation_steps) * training_args_gpt.num_train_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_training_steps // 10,
    num_training_steps=num_training_steps,
)


# Trainer setup for GPT-2
trainer_gpt = Trainer(
    model=gpt_model,
    args=training_args_gpt,
    train_dataset=tokenized_gpt_train_dataset,
    eval_dataset=tokenized_gpt_test_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train and evaluate the model
trainer_gpt.train()
eval_results = trainer_gpt.evaluate()

# Print evaluation results
print("Evaluation results:", eval_results)

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,5.2594,5.052281
2,5.0164,4.835311
3,4.9646,4.789251


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Evaluation results: {'eval_loss': 4.789251327514648, 'eval_runtime': 114.8944, 'eval_samples_per_second': 49.097, 'eval_steps_per_second': 12.281, 'epoch': 3.0}


In [43]:
pip install rouge_score datasets

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e5381d819caa6ec1586c48c53a97cdbaa2fc51bcc18b67f02610f813b4571366
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [53]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [60]:
from datasets import load_metric

# Initialize metrics
rouge_metric = load_metric("rouge")
bleu_metric = load_metric("bleu")
f1_metric = load_metric("f1")

# Function to decode model outputs and labels
def decode_predictions(predictions, labels, tokenizer):
    # For BERT and T5, we need to use the `argmax` approach to get token IDs
    pred_texts = tokenizer.batch_decode(predictions.argmax(axis=-1), skip_special_tokens=True)
    ref_texts = tokenizer.batch_decode(labels, skip_special_tokens=True)

    return pred_texts, ref_texts

# Function to evaluate a model using ROUGE, BLEU, and F1
def evaluate_model(predictions, references):
    # Convert predictions and references to list of strings
    predictions = list(map(str.strip, predictions))
    references = list(map(str.strip, references))

    # Compute ROUGE
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)

    # Compute BLEU
    bleu_results = bleu_metric.compute(predictions=[predictions], references=[references])

    # Compute F1-score
    f1_results = f1_metric.compute(predictions=predictions, references=references)

    return {
        'ROUGE': rouge_results,
        'BLEU': bleu_results,
        'F1': f1_results
    }

# Function to get predictions and references from Trainer
def get_predictions_and_references(trainer, dataset, tokenizer):
    predictions = trainer.predict(dataset).predictions
    labels = trainer.predict(dataset).label_ids

    # Ensure predictions and labels are integers
    predictions = np.argmax(predictions, axis=-1)  # Convert logits to token IDs

    # Decode predictions and labels
    pred_texts, ref_texts = decode_predictions(predictions, labels, tokenizer)

    return pred_texts, ref_texts

# Example usage for BERT
predictions_bert, references_bert = get_predictions_and_references(trainer_bert, tokenized_test_dataset, tokenizer)
results_bert = evaluate_model(predictions_bert, references_bert)
print("BERT Evaluation Results:", results_bert)

# Example usage for T5
predictions_t5, references_t5 = get_predictions_and_references(trainer_t5, tokenized_t5_test_dataset, t5_tokenizer)
results_t5 = evaluate_model(predictions_t5, references_t5)
print("T5 Evaluation Results:", results_t5)

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

The repository for bleu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bleu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

KeyboardInterrupt: Interrupted by user