In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Checkpoint used as the starting point for fine-tuning.
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fixed-length padding needs a pad token; fall back to EOS when the
# loaded tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_data(example, max_length=512, tok=None):
    """Build causal-LM training features from ``context``/``output`` pairs.

    Each context is joined with its output (plus an EOS token) into ONE token
    sequence, because a causal LM is trained to continue its own input.
    ``labels`` mirrors ``input_ids`` but uses -100 on the prompt and on the
    padding, so only the answer tokens contribute to the loss.

    Args:
        example: Mapping with ``"context"`` and ``"output"`` entries. Each entry
            is either a single string or a list of strings (the shape used by
            ``Dataset.map(..., batched=True)``).
        max_length: Fixed length every sequence is truncated/padded to.
        tok: Tokenizer to use; defaults to the module-level ``tokenizer``.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        Values are lists of rows for batched input and flat lists for a
        single example.
    """
    tok = tokenizer if tok is None else tok

    contexts, outputs = example["context"], example["output"]
    is_single = isinstance(contexts, str)
    if is_single:
        contexts, outputs = [contexts], [outputs]

    ignore_index = -100  # label value excluded from the language-modeling loss
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for context, output in zip(contexts, outputs):
        answer_ids = list(tok(output, add_special_tokens=False)["input_ids"])
        if tok.eos_token_id is not None:
            # Teach the model where the answer ends.
            answer_ids.append(tok.eos_token_id)
        answer_ids = answer_ids[:max_length]

        # Truncate the prompt first so supervised answer tokens always survive.
        prompt_budget = max_length - len(answer_ids)
        prompt_ids = list(tok(context)["input_ids"])[:prompt_budget]

        input_ids = prompt_ids + answer_ids
        labels = [ignore_index] * len(prompt_ids) + answer_ids
        n_pad = max_length - len(input_ids)

        features["input_ids"].append(input_ids + [pad_id] * n_pad)
        features["attention_mask"].append([1] * len(input_ids) + [0] * n_pad)
        features["labels"].append(labels + [ignore_index] * n_pad)

    if is_single:
        return {key: rows[0] for key, rows in features.items()}
    return features

# Load the raw context/output pairs and turn them into model features.
dataset = load_dataset("json", data_files="mcq_dataset.json")

processed_dataset = dataset.map(preprocess_data, batched=True)

# Hold out 10% for evaluation; the fixed seed keeps the split identical
# across kernel restarts so eval losses are comparable between runs.
train_test_split = processed_dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    # Mixed precision needs a GPU; enable it only when CUDA is present so the
    # cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

model = AutoModelForCausalLM.from_pretrained(model_name)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

# Persist model and tokenizer side by side so they can be reloaded together.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


Generating train split: 20 examples [00:00, 2499.29 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 467.01 examples/s]
                                              
 33%|███▎      | 9/27 [01:27<03:02, 10.13s/it]

{'eval_loss': 1.110277533531189, 'eval_runtime': 3.1347, 'eval_samples_per_second': 0.638, 'eval_steps_per_second': 0.319, 'epoch': 1.0}


                                               
 67%|██████▋   | 18/27 [02:51<01:19,  8.80s/it]

{'eval_loss': 0.9875649213790894, 'eval_runtime': 2.5548, 'eval_samples_per_second': 0.783, 'eval_steps_per_second': 0.391, 'epoch': 2.0}


100%|██████████| 27/27 [04:09<00:00,  8.19s/it]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


# Reload the artifacts that the training cell wrote to disk.
model_path = "./fine_tuned_model"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Fall back to EOS for padding when the saved tokenizer carries no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def generate_mcq(context: str, max_new_tokens: int = 100):
    """Generate a continuation of ``context`` with the fine-tuned model.

    Args:
        context: Source passage used as the prompt. Prompts longer than 512
            tokens are truncated.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The decoded sequence (the prompt followed by the generated text) with
        special tokens removed.

    Raises:
        ValueError: If ``context`` is empty or only whitespace.
    """
    if not context or not context.strip():
        raise ValueError("context must be a non-empty string")

    # No padding here: a single prompt does not need it, and right-padding a
    # decoder-only model makes it continue from pad tokens, not the prompt.
    inputs = tokenizer(
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)  # keep tensors on the same device as the model

    with torch.no_grad():
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            # Explicit pad id avoids generate() guessing one at call time.
            pad_token_id=tokenizer.pad_token_id,
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)


# Sample passage used to try out the fine-tuned model.
context = "Cloud computing provides businesses with virtualized computing resources like storage and networking on a pay-as-you-go basis, allowing for scalable and flexible resource management."

# Run generation once and show the decoded text.
generated_mcq = generate_mcq(context)
print(generated_mcq)
