In [1]:
!pip install transformers datasets torch sentencepiece

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [2]:
import pandas as pd
import random
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments

In [None]:
# Load the XLSX file
df = pd.read_excel("Final.xlsx")

# Save as CSV
df.to_csv("file.csv", index=False)

In [8]:
# Load dataset (replace 'your_dataset.csv' with the actual filename)
df = pd.read_csv('file.csv')
df = df.sample(n=10000, random_state=42).reset_index(drop=True)

# Display first few rows
print(df.head())

                                             Problem  \
0  a student is ranked 12 th from right and 6 th ...   
1  in a class , 7 students like to play basketbal...   
2  vijay sells a cupboard at 14 % below cost pric...   
3  sum of the squares of 3 no . ' s is 267 and th...   
4  a man can row downstream at the rate of 26 kmp...   

                                           Rationale  \
0  "from right 12 , from left 6 total = 12 + 6 - ...   
1  "draw a venn diagram yourself ! b + c - bc = n...   
2  "explanation : cost price = 2086 / ( 0.14 + 0....   
3  "( a + b + c ) 2 = a 2 + b 2 + c 2 + 2 ( ab + ...   
4  "rate of still water = 1 / 2 ( down stream + u...   

                                             options correct  \
0         a ) 17 , b ) 19 , c ) 20 , d ) 21 , e ) 22       b   
1          a ) 12 , b ) 15 , c ) 16 , d ) 9 , e ) 22       d   
2  a ) 7458 , b ) 7456 , c ) 7450 , d ) 7454 , e ...       c   
3         a ) 20 , b ) 24 , c ) 26 , d ) 23 , e ) 30       d   
4   a 

In [9]:
def format_question(row):
    question = row["Problem"].strip()

    # Parse options into (A)-(D)
    option_parts = row["options"].split(",")
    options_dict = {}
    for part in option_parts:
        if ")" in part:
            key, val = part.strip().split(")", 1)
            options_dict[key.strip().lower()] = val.strip()

    options = f"(A) {options_dict.get('a', '')} (B) {options_dict.get('b', '')} (C) {options_dict.get('c', '')} (D) {options_dict.get('d', '')}"

    correct_option = row["correct"].lower().strip()
    answer = f"{correct_option.upper()}) {options_dict.get(correct_option, '')}"

    difficulty = row["difficulty"].capitalize() if pd.notna(row["difficulty"]) else "Medium"
    topic = row["category"].capitalize() if pd.notna(row["category"]) else "General"

    input_text = f"Generate a {difficulty}-level MCQ on {topic} with 4 choices. Format: Question? (A) Option1 (B) Option2 (C) Option3 (D) Option4 Correct Answer: Answer."
    target_text = f"{question}? {options} Correct Answer: {answer}"

    return input_text, target_text


# Apply function to all rows
data = df.apply(format_question, axis=1)

# Create final lists
input_texts, target_texts = zip(*data)


In [10]:
# Load tokenizer & model
model_name = "google/flan-t5-small"  # Or "flan-t5-base" / "flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [11]:
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    list(input_texts), list(target_texts), test_size=0.2, random_state=42
)
# Tokenize inputs and targets
train_encodings = tokenizer(train_inputs, padding=True, truncation=True, max_length=128, return_tensors="pt", return_attention_mask=True)
target_encodings = tokenizer(train_targets, padding=True, truncation=True, max_length=128, return_tensors="pt", return_attention_mask=True)

val_encodings = tokenizer(val_inputs, padding=True, truncation=True, max_length=128, return_tensors="pt", return_attention_mask=True)
val_targets_encodings = tokenizer(val_targets, padding=True, truncation=True, max_length=128, return_tensors="pt", return_attention_mask=True)

# Prepare PyTorch dataset
class MCQDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels["input_ids"]

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx].clone().detach()
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

# Create dataset
train_dataset = MCQDataset(train_encodings, target_encodings)
val_dataset = MCQDataset(val_encodings, val_targets_encodings)


In [13]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5-mcq-model",
    eval_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    per_device_eval_batch_size=4,
    save_steps=500,
    save_total_limit=5,
    num_train_epochs=10,  # Adjust for more training
    logging_steps=10,
    load_best_model_at_end=True,
    report_to="none"  # Avoids logging errors in Colab
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # ✅ Added evaluation dataset
)

trainer.train()


Step,Training Loss,Validation Loss
100,1.0794,1.02098
200,1.1209,1.01611
300,1.1175,1.014721
400,1.1642,1.013919
500,1.1152,1.011586
600,1.1081,1.009301
700,1.055,1.0078
800,1.1248,1.006528
900,0.9907,1.004393
1000,0.9916,1.004264


Step,Training Loss,Validation Loss
100,1.0794,1.02098
200,1.1209,1.01611
300,1.1175,1.014721
400,1.1642,1.013919
500,1.1152,1.011586
600,1.1081,1.009301
700,1.055,1.0078
800,1.1248,1.006528
900,0.9907,1.004393
1000,0.9916,1.004264


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=20000, training_loss=0.9865697365760803, metrics={'train_runtime': 5572.1349, 'train_samples_per_second': 14.357, 'train_steps_per_second': 3.589, 'total_flos': 1365133393920000.0, 'train_loss': 0.9865697365760803, 'epoch': 10.0})

In [14]:
# Save model & tokenizer
model.save_pretrained("./t5-mcq-model")
tokenizer.save_pretrained("./t5-mcq-model")

('./t5-mcq-model/tokenizer_config.json',
 './t5-mcq-model/special_tokens_map.json',
 './t5-mcq-model/spiece.model',
 './t5-mcq-model/added_tokens.json')

In [15]:
model_path = "./t5-mcq-model"

# Load model and tokenizer
try:
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    print("✅ Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")

✅ Model and tokenizer loaded successfully!


In [21]:
# Function to generate a new question
def generate_question(topic, difficulty="Medium", attempt=1, max_attempts=3):
    input_text = f"Generate a {difficulty}-level MCQ on {topic} with 4 choices. Format: Question (A) Option1 (B) Option2 (C) Option3 (D) Option4 Correct Answer: Answer."

    # Tokenize input correctly
    input_ids = tokenizer(input_text, return_tensors="pt", add_special_tokens=True)["input_ids"]

    # Debugging tokenization output
    print(f"🔹 Decoded Input: {tokenizer.decode(input_ids[0])}")

    # Generate output
    output = model.generate(
        input_ids,
        max_length=128,
        num_beams=5,       # Reduce repetition but still optimize quality
        repetition_penalty=1.5,  # Penalize repetitive options
        temperature=0.7,    # Adds randomness (increase if still repetitive)
        top_k=30,           # Consider top 50 tokens instead of always picking highest probability
        top_p=0.85,         # Nucleus sampling for diversity
        early_stopping=True
    )

    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    return decoded_output  # Directly return the model output without checking

# Test with a sample topic
print(generate_question("gain", difficulty="medium"))

🔹 Decoded Input: Generate a medium-level MCQ on gain with 4 choices. Format: Question (A) Option1 (B) Option2 (C) Option3 (D) Option4 Correct Answer: Answer.</s>
a sum of money at simple interest amounts to rs . 5000 in 3 years and to rs . 5000 in 4 years . the sum is :? (A) rs . 8000 (B) rs . 8000 (C) rs . 8000 (D) rs . 8000 Correct Answer: C) rs . 8000
