In [2]:
# ⚙️ STEP 1: Setup
!pip install transformers datasets --quiet

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, T5Tokenizer, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split
from google.colab import drive

# 🔒 Mount Google Drive to save model later
drive.mount('/content/drive')
SAVE_DIR = "/content/drive/MyDrive/flan-t5-mcq-model"

# 📥 STEP 2: Load your data
file_path = "/content/drive/MyDrive/MinorProject2/train.csv"  # change this!
df = pd.read_csv(file_path, sep=",")  # Use sep="," if normal CSV
print(f"✅ Loaded {len(df)} rows")
print(df.head())
df = df.groupby("difficulty", group_keys=False).apply(lambda x: x.sample(min(len(x), 500))).reset_index(drop=True)

# 🧪 Debug mode: only use 500 rows to test
'''DEBUG = True
if DEBUG:
    df = df.sample(n=500, random_state=42).reset_index(drop=True)
    print("🧪 Running in DEBUG mode with 500 rows")
'''
print(df.columns.tolist())


# ✏️ STEP 3: Format Data
import pandas as pd

def format_question(row):
    # Ensure the necessary keys are present
    required_keys = ["Problem", "options", "correct", "difficulty", "category"]
    for key in required_keys:
        if key not in row:
            raise ValueError(f"Missing key: {key}")

    question = row["Problem"].strip()

    # Parse options into (A)-(D)
    option_parts = row["options"].split(",")
    options_dict = {}
    for part in option_parts:
        if ")" in part:
            key, val = part.strip().split(")", 1)
            options_dict[key.strip().lower()] = val.strip()

    # Dynamically create options string
    options = " ".join([f"({chr(65 + i)}) {options_dict.get(chr(97 + i), '')}" for i in range(len(options_dict))])

    correct_option = row["correct"].lower().strip()
    answer = f"{correct_option.upper()}) {options_dict.get(correct_option, '')}"

    difficulty = row["difficulty"].capitalize() if pd.notna(row["difficulty"]) else "Medium"
    topic = row["category"].capitalize() if pd.notna(row["category"]) else "General"

    input_text = f"Generate a {difficulty}-level MCQ on {topic} with 4 choices. Format: Question? (A) Option1 (B) Option2 (C) Option3 (D) Option4 Correct Answer: Answer."
    target_text = f"{question}? {options} Correct Answer: {answer}"

    return input_text, target_text

data = df.apply(format_question, axis=1)
input_texts, target_texts = zip(*data)

# ✂️ STEP 4: Split and Tokenize
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    list(input_texts), list(target_texts), test_size=0.2, random_state=42
)

model_name = "google/flan-t5-base"
'''tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
'''
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


train_encodings = tokenizer(train_inputs, padding=True, truncation=True, max_length=256, return_tensors="pt")
target_encodings = tokenizer(train_targets, padding=True, truncation=True, max_length=256, return_tensors="pt")

val_encodings = tokenizer(val_inputs, padding=True, truncation=True, max_length=256, return_tensors="pt")
val_target_encodings = tokenizer(val_targets, padding=True, truncation=True, max_length=256, return_tensors="pt")


# 📦 Dataset Class
class MCQDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels["input_ids"]

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = MCQDataset(train_encodings, target_encodings)
val_dataset = MCQDataset(val_encodings, val_target_encodings)

# 🛠️ STEP 5: Training



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

  df = df.groupby("difficulty", group_keys=False).apply(lambda x: x.sample(min(len(x), 500))).reset_index(drop=True)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [3]:
training_args = TrainingArguments(
    output_dir="/content/flan-t5-mcq-temp",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    logging_dir='./logs',
    save_total_limit=5,
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# 💾 STEP 6: Save model to Drive
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"✅ Model saved to {SAVE_DIR}")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.920371
2,1.909000,0.871696
3,1.909000,0.849418
4,0.709400,0.835828
5,0.666400,0.827017
6,0.666400,0.821213
7,0.630200,0.816054
8,0.630200,0.813596
9,0.616200,0.811598
10,0.603700,0.811798


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


✅ Model saved to /content/drive/MyDrive/flan-t5-mcq-model


In [6]:
def generate_mcq(topic="gain", difficulty="hard"):
    #prompt = f"Generate a {difficulty}-level MCQ on {topic} with 4 choices. Format: Question? (A) Option1 (B) Option2 (C) Option3 (D) Option4 Correct Answer: Answer."
    prompt = (
        "Example:\n"
        "Generate a Hard-level MCQ on Physics with 4 choices. Format: Question? (A)... (B)... (C)... (D)... Correct Answer: ...\n"
        "Question: What is the SI unit of force? (A) Newton (B) Joule (C) Pascal (D) Watt Correct Answer: A) Newton\n"
        "\nNow:\n"
        f"Generate a {difficulty}-level MCQ on {topic} with 4 choices. Format: ..."
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=True,
        temperature=0.5,
        top_p=0.9,
        repetition_penalty=1.5,
        max_length=256,
        early_stopping=True
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# 🧪 Try a test generation:
print(generate_mcq("general", "easy"))


a train is running at an average speed of 280 km / hr . the speed of the train is :? (A) 3 kmph (B) 2 kmph (C) 4 kmph (D) 5 kmph (E) 6 kmph Correct Answer: D) 5 kmph
