In [None]:
!pip install datasets
!pip install transformers
!pip install huggingface_hub

In [2]:
from datasets import load_dataset

data_en = load_dataset('head_qa', 'en')

In [3]:
train_data=data_en["train"]
val_data=data_en["validation"]

## Fine tuning GPT3 on topic

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125m")

In [5]:
topics=["biology","nursery","psychology","chemistry","pharmacology"]

In [6]:
qa_dct={}
qa_dct_val={}
def all_topics(topic):
    def topic_qa(example):
        return example["category"]==topic
    
    data_train_topic=train_data.filter(topic_qa)
    data_val_topic=val_data.filter(topic_qa)
    print(f"(Topic-{topic.capitalize()})")

    
    for example in data_train_topic:
        answer=[x["atext"] for x in example["answers"] if x["aid"]==example["ra"]][0]
        k=0
        for e in example["answers"]:
            if e["aid"]==example["ra"]:
                correct_char=chr(65+k)
            k+=1
        ques=topic.capitalize()+":\n"+example["qtext"]
        qa_dct[ques]="\n".join([f"{chr(65+i)}) "+x["atext"] for i,x in enumerate(example["answers"])])+f"\n Correct:{correct_char}) "+answer
        
    
    for example in data_val_topic:
        answer=[x["atext"] for x in example["answers"] if x["aid"]==example["ra"]][0]
        k=0
        for e in example["answers"]:
            if e["aid"]==example["ra"]:
                correct_char=chr(65+k)
            k+=1
        ques=topic.capitalize()+":\n"+example["qtext"]
        qa_dct_val[ques]="\n".join([f"{chr(65+i)}) "+x["atext"] for i,x in enumerate(example["answers"])])+f"\n Correct:{correct_char}) "+answer
    

In [7]:
for topic in topics:
    all_topics(topic)
    
    


(Topic-Biology)
(Topic-Nursery)
(Topic-Psychology)
(Topic-Chemistry)
(Topic-Pharmacology)


In [8]:
import random
items=list(qa_dct.items())
random.shuffle(items)
qa_dct=dict(items)

items=list(qa_dct_val.items())
random.shuffle(items)
qa_dct_val=dict(items)


In [9]:
import re

def build_text_files(qa_dct,dest_path):
    f = open(dest_path, 'w',encoding="utf-8")
    data = ''
    for key,value in qa_dct.items():

        summary =key+"\n"+value
    
        data += summary + "  "+"\n\n"
    data=data.encode("utf-8").decode("utf-8")
    f.write(data)



build_text_files(qa_dct,'train_dataset.txt')
build_text_files(qa_dct_val,'test_dataset.txt')

In [10]:
from transformers import TextDataset,DataCollatorForLanguageModeling
train_path=f'train_dataset.txt'
test_path=f'test_dataset.txt'
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=180)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=180)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)


2023-11-06 13:26:44.429621: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-06 13:26:44.473325: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
import torch
with torch.no_grad():
    torch.cuda.empty_cache()

In [13]:
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead


training_args = TrainingArguments(
    output_dir=f"./gpt-all", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=8,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=8000, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,

)


In [14]:
trainer.train()

Step,Training Loss
500,2.1434
1000,1.6433
1500,1.2832
2000,0.9875
2500,0.7817
3000,0.6408


TrainOutput(global_step=3050, training_loss=1.236273718036589, metrics={'train_runtime': 679.0518, 'train_samples_per_second': 17.952, 'train_steps_per_second': 4.492, 'total_flos': 1119415259750400.0, 'train_loss': 1.236273718036589, 'epoch': 10.0})

In [15]:
trainer.evaluate()

{'eval_loss': 2.5818235874176025,
 'eval_runtime': 8.6375,
 'eval_samples_per_second': 65.875,
 'eval_steps_per_second': 8.336,
 'epoch': 10.0}

In [16]:
trainer.save_model()


In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m")

model_trained = AutoModelForCausalLM.from_pretrained(f"./gpt-all")

In [48]:
from transformers import pipeline
generator=pipeline('text-generation',model=model_trained, tokenizer=tokenizer,
                   device=-1,  # GPU index to use, -1 for CPU
   
                    max_length=148,
                    do_sample=True,
                    top_p=0.9,
                    top_k=0,
                    num_return_sequences=1,)

In [56]:
topic="chemistry"

In [57]:
prompt=f"{topic.capitalize()}:"
result=generator(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [58]:
print(result[0]["generated_text"].split("Correct")[0],"Correct",result[0]["generated_text"].split("Correct")[1].split("\n")[0].strip())

Chemistry:
Vasioligo is an acid:
A) To the iodo. acid.
B) To the iodine.
C) To the iodide.
D) With the chloride.
E) With the iodide.
  Correct :D) With the chloride.


In [None]:
token="your_token"
from huggingface_hub import notebook_login

notebook_login()

In [None]:


model_trained.push_to_hub("gpt_head_qa")
tokenizer.push_to_hub("gpt_head_qa")