In [36]:
pip install torch torchvision torchaudio


Note: you may need to restart the kernel to use updated packages.


In [37]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

# Load dataset from a JSON file
dataset = load_dataset("json", data_files="improved_mcq_dataset.json")

# Preprocessing function to format the input and target
def preprocess_data(example):
    input_text = (
        f"context: {example['context']} "
        f"question: {example['question']} "
        f"options: {', '.join(example['options'])}"
    )
    target_text = example['answer']
    return {"input_text": input_text, "target_text": target_text}
    
# Apply the preprocessing function to the dataset
dataset = dataset.map(preprocess_data, remove_columns=["context", "question", "options", "answer"])

# Initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize the dataset
def tokenize_data(example):
    model_inputs = tokenizer(
        example["input_text"], max_length=512, padding="max_length", truncation=True
    )
    labels = tokenizer(
        example["target_text"], max_length=128, padding="max_length", truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the data
dataset = dataset.map(tokenize_data, batched=True)

# Split the dataset into training and evaluation sets using train_test_split
dataset_split = dataset["train"].train_test_split(test_size=0.2)

# Access the training and evaluation datasets
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

# Print dataset sizes
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")


Map: 100%|██████████| 841/841 [00:00<00:00, 9647.19 examples/s]
Map: 100%|██████████| 841/841 [00:00<00:00, 1815.85 examples/s]

Train dataset size: 672
Eval dataset size: 169





In [38]:
!pip install accelerate>=0.26.0

In [39]:
pip install transformers[torch]


Note: you may need to restart the kernel to use updated packages.


In [40]:
pip install accelerate>=0.26.0


Note: you may need to restart the kernel to use updated packages.


In [41]:
from datasets import load_dataset

# Load dataset from the JSON file (improved_mcq_dataset.json)
dataset = load_dataset("json", data_files="improved_mcq_dataset.json")

# Print the dataset to verify it's loaded correctly
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'options', 'answer'],
        num_rows: 841
    })
})


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the pretrained model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Load dataset from JSON file
dataset = load_dataset("json", data_files="improved_mcq_dataset.json")

# If no train/test split exists, create one
dataset = dataset["train"].train_test_split(test_size=0.2)  # 80% for training, 20% for validation

# Preprocess the dataset
def preprocess_function(examples):
    # Ensure 'context' and 'question' exist in the dataset
    inputs = [f"context: {context} question: {question}" for context, question in zip(examples['context'], examples['question'])]
    
    # Output is the correct answer (as one of the options)
    targets = examples['answer']
    
    # Tokenize inputs and targets using T5 tokenizer
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)
    
    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Apply preprocessing to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for saving model checkpoints
    evaluation_strategy="epoch",  # Evaluate after every epoch
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=8,  # Batch size per device during evaluation
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Strength of weight decay
    logging_dir="./logs",  # Directory to store logs
    logging_steps=10,  # Log every 10 steps
    save_steps=10_000,  # Save model every 10k steps
    save_total_limit=2,  # Only keep the 2 most recent checkpoints
)

# Initialize Trainer
trainer = Trainer(
    model=model,  # The model to be trained
    args=training_args,  # Training arguments
    train_dataset=tokenized_datasets["train"],  # Training dataset
    eval_dataset=tokenized_datasets["test"],  # Evaluation dataset (validation)
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


Map: 100%|██████████| 672/672 [00:00<00:00, 2429.43 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 1931.48 examples/s]


In [None]:
# Load dataset from JSON file


In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

dataset = load_dataset("json", data_files="improved_mcq_dataset.json")

def preprocess_function(examples):
    inputs = [
        f"Context: {context} Question: {question} Options: {', '.join(options)}"
        for context, question, options in zip(examples['context'], examples['question'], examples['options'])
    ]
    
    targets = examples['answer']
    
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

dataset = dataset.map(preprocess_function, batched=True)

data = dataset["train"]  
data_dict = {
    "context": data["context"],
    "question": data["question"],
    "options": data["options"],
    "answer": data["answer"]
}

data_list = [{'context': c, 'question': q, 'options': o, 'answer': a} 
             for c, q, o, a in zip(data_dict["context"], data_dict["question"], data_dict["options"], data_dict["answer"])]

train_data, test_data = train_test_split(data_list, test_size=0.2, random_state=42)

train_dataset = Dataset.from_dict({
    'context': [item['context'] for item in train_data],
    'question': [item['question'] for item in train_data],
    'options': [item['options'] for item in train_data],
    'answer': [item['answer'] for item in train_data]
})

test_dataset = Dataset.from_dict({
    'context': [item['context'] for item in test_data],
    'question': [item['question'] for item in test_data],
    'options': [item['options'] for item in test_data],
    'answer': [item['answer'] for item in test_data]
})

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")



train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",  # Output directory for saving model checkpoints
    evaluation_strategy="epoch",  # Evaluate after every epoch
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=8,  # Batch size per device during evaluation
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Strength of weight decay
    logging_dir="./logs",  # Directory to store logs
    logging_steps=10,  # Log every 10 steps
    save_steps=10_000,  # Save model every 10k steps
    save_total_limit=2,  # Only keep the 2 most recent checkpoints
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")




  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Generating train split: 613 examples [00:00, 24517.80 examples/s]
Map: 100%|██████████| 613/613 [00:00<00:00, 1919.14 examples/s]


Train dataset size: 490
Test dataset size: 123


Map: 100%|██████████| 490/490 [00:00<00:00, 1817.61 examples/s]
Map: 100%|██████████| 123/123 [00:00<00:00, 1845.97 examples/s]
  0%|          | 0/186 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  5%|▌         | 10/186 [01:18<24:49,  8.46s/it]

{'loss': 15.5458, 'grad_norm': 90.13682556152344, 'learning_rate': 1.89247311827957e-05, 'epoch': 0.16}


 11%|█         | 20/186 [02:58<27:20,  9.88s/it]

{'loss': 9.37, 'grad_norm': 80.87954711914062, 'learning_rate': 1.78494623655914e-05, 'epoch': 0.32}


 16%|█▌        | 30/186 [04:35<25:18,  9.73s/it]

{'loss': 5.6048, 'grad_norm': 72.12053680419922, 'learning_rate': 1.6774193548387098e-05, 'epoch': 0.48}


 22%|██▏       | 40/186 [06:12<23:51,  9.81s/it]

{'loss': 3.2055, 'grad_norm': 45.716583251953125, 'learning_rate': 1.5698924731182796e-05, 'epoch': 0.65}


 27%|██▋       | 50/186 [07:50<21:31,  9.50s/it]

{'loss': 2.0929, 'grad_norm': 31.496084213256836, 'learning_rate': 1.4623655913978497e-05, 'epoch': 0.81}


 32%|███▏      | 60/186 [09:10<16:36,  7.91s/it]

{'loss': 1.4973, 'grad_norm': 12.022537231445312, 'learning_rate': 1.3548387096774194e-05, 'epoch': 0.97}


                                                
 33%|███▎      | 62/186 [10:02<12:56,  6.27s/it]

{'eval_loss': 0.17746637761592865, 'eval_runtime': 41.5421, 'eval_samples_per_second': 2.961, 'eval_steps_per_second': 0.385, 'epoch': 1.0}


 38%|███▊      | 70/186 [11:13<18:27,  9.55s/it]

{'loss': 1.1678, 'grad_norm': 5.6366448402404785, 'learning_rate': 1.2473118279569894e-05, 'epoch': 1.13}


 43%|████▎     | 80/186 [12:35<14:41,  8.32s/it]

{'loss': 0.997, 'grad_norm': 3.985795021057129, 'learning_rate': 1.1397849462365593e-05, 'epoch': 1.29}


 48%|████▊     | 90/186 [14:03<13:25,  8.39s/it]

{'loss': 0.8925, 'grad_norm': 4.458528995513916, 'learning_rate': 1.0322580645161291e-05, 'epoch': 1.45}


 54%|█████▍    | 100/186 [15:24<11:38,  8.12s/it]

{'loss': 0.6992, 'grad_norm': 4.402638912200928, 'learning_rate': 9.24731182795699e-06, 'epoch': 1.61}


 59%|█████▉    | 110/186 [16:49<11:24,  9.01s/it]

{'loss': 0.5862, 'grad_norm': 4.469958305358887, 'learning_rate': 8.172043010752689e-06, 'epoch': 1.77}


 65%|██████▍   | 120/186 [18:22<10:12,  9.28s/it]

{'loss': 0.4839, 'grad_norm': 34.22441482543945, 'learning_rate': 7.096774193548388e-06, 'epoch': 1.94}


                                                 
 67%|██████▋   | 124/186 [19:33<07:26,  7.20s/it]

{'eval_loss': 0.18215292692184448, 'eval_runtime': 41.2974, 'eval_samples_per_second': 2.978, 'eval_steps_per_second': 0.387, 'epoch': 2.0}


 70%|██████▉   | 130/186 [20:29<10:19, 11.06s/it]

{'loss': 0.4767, 'grad_norm': 3.7115869522094727, 'learning_rate': 6.021505376344087e-06, 'epoch': 2.1}


 75%|███████▌  | 140/186 [22:12<07:54, 10.32s/it]

{'loss': 0.4045, 'grad_norm': 2.528980016708374, 'learning_rate': 4.946236559139785e-06, 'epoch': 2.26}


 81%|████████  | 150/186 [23:48<05:47,  9.65s/it]

{'loss': 0.3835, 'grad_norm': 1.982576847076416, 'learning_rate': 3.870967741935484e-06, 'epoch': 2.42}


 86%|████████▌ | 160/186 [25:12<03:35,  8.27s/it]

{'loss': 0.3607, 'grad_norm': 2.90108323097229, 'learning_rate': 2.7956989247311827e-06, 'epoch': 2.58}


 91%|█████████▏| 170/186 [26:56<02:40, 10.03s/it]

{'loss': 0.3507, 'grad_norm': 1.7168809175491333, 'learning_rate': 1.720430107526882e-06, 'epoch': 2.74}


 97%|█████████▋| 180/186 [28:26<00:56,  9.42s/it]

{'loss': 0.3461, 'grad_norm': 2.3854501247406006, 'learning_rate': 6.451612903225807e-07, 'epoch': 2.9}


                                                 
100%|██████████| 186/186 [29:52<00:00,  9.63s/it]


{'eval_loss': 0.18632686138153076, 'eval_runtime': 37.4234, 'eval_samples_per_second': 3.287, 'eval_steps_per_second': 0.428, 'epoch': 3.0}
{'train_runtime': 1792.0578, 'train_samples_per_second': 0.82, 'train_steps_per_second': 0.104, 'train_loss': 2.4022324495418097, 'epoch': 3.0}


('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\spiece.model',
 './fine_tuned_model\\added_tokens.json')

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_model")
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_model")

def generate_mcq(context):
    input_text = f"Context: {context} Generate a question and four answer options. Ensure one option is correct."

    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    
    output = model.generate(input_ids=inputs["input_ids"], max_length=128, num_beams=4, early_stopping=True)
    
    generated_answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_answer


# Example test
context = "Cloud computing allows users to access and store data over the internet. It provides on-demand availability of computing resources such as servers, storage, and databases."
generated_mcq = generate_mcq(context)

print(f"Generated MCQ: {generated_mcq}")


Generated MCQ: True
