# Importing Libraries

In [None]:
import json
import pandas as pd
from datasets import Dataset
from transformers import DataCollatorForLanguageModeling, GPT2LMHeadModel, Trainer, TrainingArguments, AutoTokenizer

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# notebook_login() is interactive: it asks for an access token, which the
# push_to_hub() calls at the end of the notebook rely on.
from huggingface_hub import notebook_login
notebook_login()

# Importing Data

In [None]:
# Location of the training file. The path points inside a mounted Google
# Drive, so Drive has to be mounted at /content/drive before this cell runs.
path = '/content/drive/MyDrive/data (1)/train.json'

# The file is read as JSON Lines: every non-empty line is parsed as one JSON
# object. The encoding is given explicitly (JSON text is UTF-8 per RFC 8259)
# so decoding does not depend on the platform's default locale, and blank
# lines are skipped because json.loads() raises on an empty string.
data_json = []
with open(path, 'r', encoding='utf-8') as file:
    for line in file:
        if line.strip():
            data_json.append(json.loads(line))

# Keep only the fields used downstream; a missing key raises KeyError, as a
# direct entry[...] lookup would.
data_extracted = [
    {
        key: entry[key]
        for key in ('question', 'cop', 'subject_name', 'topic_name', 'exp',
                    'opa', 'opb', 'opc', 'opd')
    }
    for entry in data_json
]

data_df = pd.DataFrame(data_extracted)

# Replace missing values with explicit placeholders so later string handling
# never sees NaN/None in these two columns.
data_df['topic_name'] = data_df['topic_name'].fillna('Unknown')
data_df['exp'] = data_df['exp'].fillna('')

                                            question  cop   subject_name  \
0  Chronic urethral obstruction due to benign pri...    3        Anatomy   
1  Which vitamin is supplied from only animal sou...    3   Biochemistry   
2  All of the following are surgical options for ...    4        Surgery   
3  Following endaerectomy on the right common car...    1  Ophthalmology   
4   Growth hormone has its effect on growth through?    2     Physiology   

                   topic_name  \
0               Urinary tract   
1       Vitamins and Minerals   
2  Surgical Treatment Obesity   
3                     Unknown   
4                     Unknown   

                                                 exp  \
0  Chronic urethral obstruction because of urinar...   
1  Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...   
2  Ans. is 'd' i.e., Roux en Y Duodenal Bypass Ba...   
3  The central aery of the retina is a branch of ...   
4  Ans. is 'b' i.e., IGI-1GH has two major functi...   

       

In [None]:
# Work on an independent (deep) copy so that edits to `df` never touch the
# freshly loaded `data_df`.
df = data_df.copy(deep=True)

In [None]:
# Build one plain-dict record per question: the question text, the four
# options, and the text of the correct option under 'answer'.
#
# 'cop' is used as a 1-based option index: chr(96 + 1) == 'a', so
# 1 -> 'opa', 2 -> 'opb', 3 -> 'opc', 4 -> 'opd'.
#
# Rows are iterated as dict records rather than with DataFrame.iterrows(),
# which constructs a pandas Series for every row and is needlessly slow for a
# frame of this size.
formatted_data = []

for row in df.to_dict('records'):
    answer = row[f'op{chr(96 + row["cop"])}']
    formatted_data.append({
        'question': row['question'],
        'opa': row['opa'],
        'opb': row['opb'],
        'opc': row['opc'],
        'opd': row['opd'],
        'answer': answer
    })

In [None]:
# Wrap the list of record dicts in a Hugging Face `Dataset`; each dict in
# `formatted_data` becomes one row, and its keys become the columns
# (question, opa, opb, opc, opd, answer).
dataset = Dataset.from_list(formatted_data)

# Tokenizing the Data and Loading the Model


In [None]:
# Load the pretrained tokenizer that belongs to the "openai-community/gpt2"
# checkpoint (downloaded from the Hugging Face Hub on first use).
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
# Reuse the end-of-sequence token as the padding token so that
# padding='max_length' below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Render one multiple-choice record as training text and tokenize it.

    The text is the question, the four lettered options, and then the
    correct answer after ``Answer:``, terminated by the tokenizer's EOS
    token. Including the answer is essential: a causal language model only
    learns what is in its training text, so a prompt that stops at
    ``Answer:`` would teach the model to reproduce questions, not to answer
    them.

    Args:
        examples: A single dataset row (``batched=False``) with the keys
            ``question``, ``opa``, ``opb``, ``opc``, ``opd`` and ``answer``.

    Returns:
        The tokenizer output for the rendered text, truncated or padded to
        exactly 128 tokens.
    """
    input_text = (
        f"Question: {examples['question']}\n"
        f"Options:\nA) {examples['opa']}\nB) {examples['opb']}\nC) {examples['opc']}\nD) {examples['opd']}\n"
        f"Answer: {examples['answer']}{tokenizer.eos_token}"
    )

    # NOTE(review): texts longer than 128 tokens are truncated, which can cut
    # off the answer for very long questions - consider a larger max_length.
    # NOTE(review): because pad == EOS, a collator that ignores pad positions
    # in the loss will presumably ignore the real EOS too - confirm if the
    # model is expected to learn where to stop.
    return tokenizer(
        input_text,
        truncation=True,
        padding='max_length',
        max_length=128
    )


# Tokenize row by row (batched=False), matching the single-row signature of
# tokenize_function above.
tokenized_datasets = dataset.map(tokenize_function, batched=False)

Map:   0%|          | 0/182822 [00:00<?, ? examples/s]

In [None]:
# Batch collator for language-model training. mlm=False turns off masked-LM
# token masking; per the transformers documentation the labels are then a
# copy of the input ids with padding positions ignored, i.e. causal LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [None]:
# Hold out 10% of the tokenized examples for evaluation. The split shuffles
# the data, so a fixed seed is passed to keep the train/eval partition (and
# therefore the reported validation loss) reproducible across kernel restarts.
train_test_split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [None]:
# Load the pretrained GPT-2 weights with a language-modeling head from the
# "openai-community/gpt2" checkpoint; this is the model that gets fine-tuned.
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Fine-tuning GPT-2


In [None]:
# Training configuration (the original author sized these values for an A100).
training_args = TrainingArguments(
    # --- where checkpoints and logs go ---
    output_dir="./gpt2-finetuned-options",
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=100,

    # --- optimisation schedule ---
    learning_rate=1e-5,
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,

    # --- throughput ---
    fp16=True,
    dataloader_num_workers=4,

    # --- evaluate and checkpoint once per epoch, keep only two checkpoints,
    #     and restore the best one when training finishes ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    prediction_loss_only=True,
)

# Tie together the model, its configuration, both dataset splits and the
# batch collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run fine-tuning, then write the resulting model to the same directory that
# was configured as the training output_dir.
trainer.train()
trainer.save_model("./gpt2-finetuned-options")

Epoch,Training Loss,Validation Loss
1,1.8577,1.869174
2,1.8446,1.851821
3,1.8657,1.833454
4,1.8316,1.820681
5,1.8097,1.808119
6,1.7971,1.798098
7,1.7868,1.789116
8,1.7653,1.780307
9,1.761,1.7732
10,1.7444,1.766796


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [None]:
# Save the final model and its tokenizer side by side in one directory, so
# the directory holds both the model and the tokenizer files.
trainer.save_model("./gpt2-finetuned-med-exam")
tokenizer.save_pretrained("./gpt2-finetuned-med-exam")

('./gpt2-finetuned-med-exam/tokenizer_config.json',
 './gpt2-finetuned-med-exam/special_tokens_map.json',
 './gpt2-finetuned-med-exam/vocab.json',
 './gpt2-finetuned-med-exam/merges.txt',
 './gpt2-finetuned-med-exam/added_tokens.json',
 './gpt2-finetuned-med-exam/tokenizer.json')

In [None]:
# Publish the fine-tuned model and the tokenizer to the same Hugging Face Hub
# repository. Requires the authenticated session from notebook_login().
model.push_to_hub("Jaiminshahh/finetuned-gpt2", use_temp_dir=True)
tokenizer.push_to_hub("Jaiminshahh/finetuned-gpt2", use_temp_dir=True)

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jaiminshahh/finetuned-gpt2/commit/03f5f26212664695c435da386077ee106d3835b7', commit_message='Upload tokenizer', commit_description='', oid='03f5f26212664695c435da386077ee106d3835b7', pr_url=None, pr_revision=None, pr_num=None)