In [1]:
!pip install transformers datasets torch
!pip install sentencepiece
!pip install tf-keras
!pip install accelerate>=0.26.0

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Install torch, torchvision and torchaudio from the PyTorch wheel index for
# CUDA 11.8 (the `cu118` path in the index URL).
# NOTE(review): `torch` was already installed by the previous cell, so pip may
# consider the requirement satisfied and keep that build — confirm with
# `torch.cuda.is_available()` after a kernel restart.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu118


In [3]:
import json

# Load the SQuAD-format JSON file. The encoding is passed explicitly: without
# it, open() falls back to the platform's locale encoding, which can fail or
# mis-decode the non-ASCII characters in the passages.
with open('squad_data.json', 'r', encoding='utf-8') as f:
    squad_data = json.load(f)

# Flatten articles -> paragraphs -> questions into one training example per
# question:
#   input  = "generate question: <context>"
#   output = "<question> [answer]: <answer>"
qa_pairs = []
for article in squad_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']

            # 'answers' is a list; use its first entry, or an empty string
            # when the list is empty.
            answer = qa['answers'][0]['text'] if qa['answers'] else ""

            input_text = f"generate question: {context}"
            output_text = f"{question} [answer]: {answer}"

            qa_pairs.append({
                'input': input_text,
                'output': output_text
            })

# 'qa_pairs' now holds the examples used to train the model.


In [4]:
from datasets import Dataset
import pandas as pd

# Tabulate the examples (columns 'input' and 'output'), then wrap the table
# in a `datasets.Dataset` for the tokenization and training steps.
df = pd.DataFrame(data=qa_pairs)
train_dataset = Dataset.from_pandas(df)

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Truncation limits, in tokens, for the encoder input and the decoder target.
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512


def tokenize_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: Batch mapping with an 'input' list (prompt plus context)
            and an 'output' list (question plus answer) of strings.

    Returns:
        The tokenized inputs, with the target token ids stored under
        'labels'.

    Sequences are truncated but deliberately NOT padded here. Padding the
    targets to a fixed length puts pad-token ids into 'labels', so the loss
    is also computed over padding. Leaving the sequences ragged lets
    DataCollatorForSeq2Seq (used by the training cell) pad each batch and
    fill the label padding with -100, which the loss ignores.
    """
    # Encoder side: "generate question: <context>"
    inputs = tokenizer(examples['input'], truncation=True, max_length=MAX_INPUT_LENGTH)
    # Decoder side: "<question> [answer]: <answer>"
    targets = tokenizer(examples['output'], truncation=True, max_length=MAX_TARGET_LENGTH)

    inputs['labels'] = targets['input_ids']
    return inputs


# Tokenize the whole dataset in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [6]:
# Split train_dataset into training (80%) and validation (20%) sets.
# The seed is fixed so the same partition is produced on every run; without
# it the validation loss is measured on a different subset each time.
train_test_split = train_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


In [None]:
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Settings for the fine-tuning run: output goes to ./results and evaluation
# is configured per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Seq2seq collator: builds each batch and handles sequences of different lengths.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Wire the model, the arguments, both dataset splits and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()





Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


In [None]:
# Write the fine-tuned model and its tokenizer to the same directory so they
# can be loaded together later.
save_dir = "./trained_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the fine-tuned tokenizer and model from disk.
tokenizer = T5Tokenizer.from_pretrained('./trained_model')
model = T5ForConditionalGeneration.from_pretrained('./trained_model')

# Move the model to the chosen device and switch it to evaluation mode.
model.to(device)
model.eval()

# Context for inference
context = "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."

# Tokenize the prompt with the same prefix and the same 512-token truncation
# that was applied to the training inputs, so longer contexts are handled the
# way the model saw them during fine-tuning.
inputs = tokenizer(
    "generate question: " + context,
    return_tensors="pt",
    truncation=True,
    max_length=512,
)
inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to device

# Parameters shared by both decoding strategies
num_questions = 5  # Number of questions to generate
max_length = 50    # Maximum length of each generated question

# Seed for the sampling run so its output is reproducible; change or remove
# it to draw a different set of questions.
SAMPLING_SEED = 42


def print_questions(sequences):
    """Decode each generated token sequence and print it as a numbered question."""
    for i, sequence in enumerate(sequences, start=1):
        question = tokenizer.decode(sequence, skip_special_tokens=True)
        print(f"Question {i}: {question}")


# Approach 1: Sampling
torch.manual_seed(SAMPLING_SEED)
print("Questions generated with sampling:")
output_samples = model.generate(
    **inputs,                     # input_ids together with the attention mask
    do_sample=True,               # Enable sampling
    max_length=max_length,
    top_k=50,                     # Top-k sampling
    top_p=0.95,                   # Nucleus sampling
    temperature=0.7,              # Temperature for randomness control
    num_return_sequences=num_questions  # Generate multiple questions
)
print_questions(output_samples)

# Approach 2: Beam search, returning the `num_questions` best beams.
# This is plain beam search: no diversity options are set, so the returned
# beams may be close variants of each other.
print("\nQuestions generated with beam search:")
output_beams = model.generate(
    **inputs,                     # input_ids together with the attention mask
    max_length=max_length,
    num_beams=10,                 # Number of beams for searching
    num_return_sequences=num_questions,  # Number of questions to return
    no_repeat_ngram_size=2        # Avoid repetition
)
print_questions(output_beams)
