In [1]:
from typing import List
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    PreTrainedTokenizerBase,
)
import torch
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
print(torch.cuda.is_available())

False


In [2]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
def load_model_and_tokenizer(model_name="gpt2"):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )
        model.resize_token_embeddings(len(tokenizer))
        model.to(device)
        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Error loading model and tokenizer: {e}")

In [4]:
def build_prompt(question: str, options: str, correct_answer: str, explanation: str) -> str:
    try:
        return f"""
        Original Question: {question}
        Options: {options}
        Correct Answer: {correct_answer}
        Explanation: {explanation}

        Generate 4 distinct questions based on the original question. For each question:
        - Provide 4 options (one correct, three incorrect).
        - Clearly label the correct answer and provide an explanation.
        Output format:
        1. Question: ...
        Options: a) ... b) ... c) ... d) ...
        Correct Answer: ...
        Explanation: ...
        """
    except Exception as e:
        raise ValueError(f"Error building prompt: {e}")

In [5]:
def generate_distinct_questions(model, tokenizer, prompt: str) -> List[str]:
    try:
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs.get("attention_mask"),
                max_length=512,
                num_return_sequences=4,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
            )
        return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    except Exception as e:
        raise RuntimeError(f"Error generating distinct questions: {e}")


In [6]:
def parse_generated_output(generated_text: str) -> List[dict]:
    try:
        questions = []
        blocks = generated_text.split("1. Question:")[1:]
        for block in blocks:
            lines = block.strip().split("\n")
            question = lines[0].replace("Question:", "").strip()
            options_line = next((line for line in lines if line.startswith("Options:")), None)
            correct_answer_line = next((line for line in lines if line.startswith("Correct Answer:")), None)
            explanation_line = next((line for line in lines if line.startswith("Explanation:")), None)

            if options_line and correct_answer_line and explanation_line:
                options = options_line.replace("Options:", "").strip()
                correct_answer = correct_answer_line.replace("Correct Answer:", "").strip()
                explanation = explanation_line.replace("Explanation:", "").strip()

                questions.append({
                    "Generated Question": question,
                    "Options": options,
                    "Correct Answer": correct_answer,
                    "Explanation": explanation,
                })
        return questions
    except Exception as e:
        raise RuntimeError(f"Error parsing generated output: {e}")

In [7]:
def expand_questions(input_data: pd.DataFrame, model, tokenizer) -> pd.DataFrame:
    try:
        required_columns = ["Degree", "Role", "Section", "Proficiency Level", "Question", "Options", "Correct Answer", "Explanation"]
        for column in required_columns:
            if column not in input_data.columns:
                raise ValueError(f"Missing required column: {column}")

        expanded_questions = []
        for _, row in input_data.iterrows():
            try:
                prompt = build_prompt(
                    row["Question"], row["Options"], row["Correct Answer"], row["Explanation"]
                )
                generated_texts = generate_distinct_questions(model, tokenizer, prompt)
                for generated_text in generated_texts:
                    parsed_questions = parse_generated_output(generated_text)
                    for parsed_question in parsed_questions:
                        expanded_questions.append({
                            "Degree": row["Degree"],
                            "Role": row["Role"],
                            "Section": row["Section"],
                            "Proficiency Level": row["Proficiency Level"],
                            **parsed_question,
                        })
            except Exception as e:
                print(f"Error processing row {row.to_dict()}: {e}")

        return pd.DataFrame(expanded_questions)
    except Exception as e:
        raise RuntimeError(f"Error expanding questions: {e}")


In [8]:
def preprocess_dataset(dataset: Dataset, tokenizer, max_length: int = 512):
    try:
        def tokenize_function(examples):
            inputs = tokenizer(
                examples["Question"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )
            targets = tokenizer(
                examples["Correct Answer"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
            )
            inputs["labels"] = targets["input_ids"]
            return inputs

        tokenized_dataset = dataset.map(tokenize_function, batched=True)
        return tokenized_dataset.train_test_split(test_size=0.2)
    except Exception as e:
        raise RuntimeError(f"Error preprocessing dataset: {e}")

In [9]:
def fine_tune_model(input_data: pd.DataFrame, tokenizer, model, output_dir: str):
      try:
          dataset = Dataset.from_pandas(input_data)
          tokenized_dataset = preprocess_dataset(dataset, tokenizer)
          data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
  
          training_args = TrainingArguments(
              output_dir=output_dir,
              num_train_epochs=3,
              per_device_train_batch_size=8,
              per_device_eval_batch_size=8,
              warmup_steps=500,
              weight_decay=0.01,
              logging_dir="./logs",
              logging_steps=10,
              evaluation_strategy="epoch",
              fp16=torch.cuda.is_available(),
          )
  
          trainer = Trainer(
              model=model,
              train_dataset=tokenized_dataset["train"],
              eval_dataset=tokenized_dataset["test"],
              tokenizer=tokenizer,
              data_collator=data_collator,
              args=training_args,
          )
  
          trainer.train()
          trainer.save_model(output_dir)
          print(f"Model fine-tuned and saved to {output_dir}")
      except Exception as e:
          raise RuntimeError(f"Error fine-tuning model: {e}")

In [10]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

file_path = "../placement-questions-excel.csv"
input_data = pd.read_csv(file_path, encoding="ISO-8859-1")
input_data.head()

Unnamed: 0,Degree,Role,Section,Proficiency Level,Question,Options,Correct Answer,Explanation
0,B.Tech in Computer Science,Data Analyst,Computational Skills,Beginner,What is the primary purpose of the pandas grou...,"['To sort data', 'To split data into groups', ...",To split data into groups,The groupby() function splits the data into gr...
1,B.Tech in Computer Science,Data Analyst,Core Programming,Medium,Which time complexity represents binary search?,"['O(n)', 'O(log n)', 'O(n log n)', 'O(1)']",O(log n),Binary search repeatedly divides the search sp...
2,B.Tech in Computer Science,Data Analyst,Data Analysis,Advanced,In a dataset with outliers which visualization...,"['Simple line plot', 'Box plot with whiskers',...",Box plot with whiskers,"Box plots show median, quartiles, and outliers..."
3,B.Sc. in Mathematics,Risk Analyst,Core Mathematical Subjects,Beginner,What is the variance of a constant?,"['1', 'The constant value', '0', 'Undefined']",0,The variance measures spread around the mean. ...
4,B.Sc. in Mathematics,Risk Analyst,Applied Mathematics,Medium,In Value at Risk (VaR) calculation what confid...,"['90%', '95%', '99%', '99.9%']",99%,99% is the standard confidence level for VaR i...


In [11]:
model_name = "gpt2"
model, tokenizer = load_model_and_tokenizer(model_name)
model

Using pad_token, but it is not set yet.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:
tokenizer

PreTrainedTokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'})

In [13]:
fine_tuned_model_dir = "fine_tuned_gpt2_model"
tuned_model=fine_tune_model(input_data, tokenizer, model, output_dir=fine_tuned_model_dir)
tuned_model

Map: 100%|██████████| 45/45 [00:00<00:00, 1635.74 examples/s]
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Role, Options, Degree, Question, Section, Explanation, Proficiency Level, Correct Answer. If Role, Options, Degree, Question, Section, Explanation, Proficiency Level, Correct Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 36
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15
  Number of trainable parameters = 124439808
  0%|          | 0/15 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padde

{'eval_loss': 10.257019996643066, 'eval_runtime': 5.4028, 'eval_samples_per_second': 1.666, 'eval_steps_per_second': 0.37, 'epoch': 1.0}


 67%|██████▋   | 10/15 [04:05<01:47, 21.40s/it]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Role, Options, Degree, Question, Section, Explanation, Proficiency Level, Correct Answer. If Role, Options, Degree, Question, Section, Explanation, Proficiency Level, Correct Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9
  Batch size = 8


{'loss': 9.6686, 'learning_rate': 1.0000000000000002e-06, 'epoch': 2.0}


                                               
 67%|██████▋   | 10/15 [04:10<01:47, 21.40s/it]

{'eval_loss': 9.881877899169922, 'eval_runtime': 5.3767, 'eval_samples_per_second': 1.674, 'eval_steps_per_second': 0.372, 'epoch': 2.0}


100%|██████████| 15/15 [05:46<00:00, 18.17s/it]The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Role, Options, Degree, Question, Section, Explanation, Proficiency Level, Correct Answer. If Role, Options, Degree, Question, Section, Explanation, Proficiency Level, Correct Answer are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9
  Batch size = 8
                                               
100%|██████████| 15/15 [05:51<00:00, 18.17s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 15/15 [05:52<00:00, 23.47s/it]
Saving model checkpoint to fine_tuned_gpt2_model
Configuration saved in fine_tuned_gpt2_model\config.json


{'eval_loss': 9.235095977783203, 'eval_runtime': 5.4073, 'eval_samples_per_second': 1.664, 'eval_steps_per_second': 0.37, 'epoch': 3.0}
{'train_runtime': 352.1129, 'train_samples_per_second': 0.307, 'train_steps_per_second': 0.043, 'train_loss': 9.48508809407552, 'epoch': 3.0}


Model weights saved in fine_tuned_gpt2_model\pytorch_model.bin
tokenizer config file saved in fine_tuned_gpt2_model\tokenizer_config.json
Special tokens file saved in fine_tuned_gpt2_model\special_tokens_map.json


Model fine-tuned and saved to fine_tuned_gpt2_model


In [14]:
expanded_questions = expand_questions(input_data, model, tokenizer)
expanded_questions

<function __main__.expand_questions(input_data: pandas.core.frame.DataFrame, model, tokenizer) -> pandas.core.frame.DataFrame>

In [18]:
expanded_questions.to_csv("expanded_questions.csv", index=False)
print("Expanded questions saved to expanded_questions.csv")

Expanded questions saved to expanded_questions.csv
