## BIOGPT
- Use the pretrained BioGPT model.
- Fine-tune BioGPT on a medical dataset using distributed training.
 

# Import Libraries

In [None]:
!pip install transformers==4.27.4
!pip install sacremoses
from transformers import BioGptTokenizer, BioGptForCausalLM, TrainerCallback
from transformers import Trainer, TrainingArguments

# print(transformers.__version__)

In [None]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

print(f"PyTorch version: {torch.__version__}")

In [None]:
!nvidia-smi

### Configurations

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration. Every later cell reads these module-level names.
# ---------------------------------------------------------------------------

# NOTE(review): no visible cell reads DEBUG or INPUT_DIR — confirm they are
# still needed.
DEBUG           = False

INPUT_DIR       = 'articles'

# APEX_OPT_LEVEL is forwarded to TrainingArguments as fp16_opt_level;
# USE_APEX only selects the batch-size preset below.
USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

# Hugging Face checkpoint id used when loading the tokenizer and model.
MODEL           = 'microsoft/biogpt'

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

# Marker strings wrapped around each example as
# <bos> title <sep> text <eos>, and optionally registered on the tokenizer.
SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}

# Token length every example is truncated/padded to; also the generation cap.
MAXLEN          = 256

TRAIN_SIZE      = 0.8

# Per-device batch size and gradient-accumulation steps.
if USE_APEX:
    TRAIN_BATCHSIZE = 8
    BATCH_UPDATE    = 2
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 4

EPOCHS          = 3
LR              = 5e-4
EPS             = 1e-8
# An integer step count (was the float literal 1e2).
WARMUP_STEPS    = 100

SEED            = 2020


# Divisor applied when down-sampling the test split.
DEVIDE_BY = 5

os.environ['WANDB_DISABLED'] = 'true'

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of
    # the running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one (safe to call without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run, which can
    # pick different kernels between runs and defeat deterministic mode.
    torch.backends.cudnn.benchmark = False

# Seed the RNGs once, before any data sampling or training below.
seed_everything(SEED)

### Using medical dataset

In [None]:
# Read the train and test splits of the conversational dataset in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/conversational-desc/train.csv',
        '/kaggle/input/conversational-desc/test.csv',
    )
)

In [None]:
# Drop rows with missing values, then cast every column to str so the
# columns can be concatenated as text later on.
train_df = train_df.dropna().astype('str')
test_df = test_df.dropna().astype('str')

In [None]:
# Preview the first rows of the cleaned training frame.
train_df.head()

In [None]:
# Print column dtypes and non-null counts for the cleaned training frame.
train_df.info()

In [None]:
# Estimate the average number of space-separated words per entry in the
# second column, from a random sample of up to `sample_num` rows.
sample_num = 500
# Never ask for more rows than exist; DataFrame.sample raises otherwise.
sample_size = min(sample_num, len(train_df))
# Accumulate under a name that does not shadow the built-in sum().
word_total = sum(
    len(entry.split(' '))
    for entry in train_df.sample(sample_size).iloc[:, 1]
)
print(word_total / sample_size)

In [None]:
# Down-sample both splits to keep the run short.
# NOTE(review): this runs unconditionally; the DEBUG flag is not consulted.
# Cap the request at the number of available rows, because
# DataFrame.sample raises ValueError when asked for more rows than exist.
train_df = train_df.sample(min(10000, len(train_df)))
test_df = test_df.sample(int(len(test_df) / DEVIDE_BY / 5))
f'There are {len(train_df) :,} samples for training, and {len(test_df) :,} samples for validation testing'

### Datasets and loaders

In [None]:
class myDataset(Dataset):
    """Map-style dataset that tokenizes (title, text) pairs for causal-LM training.

    Each example is the string ``<bos> title <sep> text <eos>`` (marker
    strings taken from ``SPECIAL_TOKENS``), tokenized and truncated/padded to
    ``MAXLEN`` tokens.

    Args:
        data: DataFrame whose first column holds the title/prompt and whose
            second column holds the text to learn.
        tokenizer: Callable tokenizer whose result provides ``'input_ids'``
            and ``'attention_mask'``.
        randomize: Stored on the instance; no method of this class reads it.
    """

    def __init__(self, data, tokenizer, randomize=True):
        self.randomize = randomize
        self.tokenizer = tokenizer
        self.title     = data.iloc[:, 0].tolist()
        self.text      = data.iloc[:, 1].tolist()

    #---------------------------------------------#

    def __len__(self):
        """Return the number of examples."""
        return len(self.text)

    #---------------------------------------------#

    def __getitem__(self, i):
        """Return the tokenized example at position ``i``.

        Returns:
            dict with ``'label'``, ``'input_ids'`` and ``'attention_mask'``
            tensors; ``'label'`` is a copy of ``'input_ids'``.
        """
        example_text = (SPECIAL_TOKENS['bos_token'] + self.title[i]
                        + SPECIAL_TOKENS['sep_token'] + self.text[i]
                        + SPECIAL_TOKENS['eos_token'])

        # Use the tokenizer bound to this dataset, not whatever the global
        # name `tokenizer` currently points to (later cells rebind it).
        encodings_dict = self.tokenizer(example_text,
                                        truncation=True,
                                        max_length=MAXLEN,
                                        padding="max_length")

        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']

        # NOTE(review): padded positions are included in 'label' unmasked, and
        # the key is 'label' rather than 'labels' — presumably the collator
        # renames it; confirm against the Trainer's data collator.
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids),
                'attention_mask': torch.tensor(attention_mask)}

In [None]:
def split_data(data, S=TRAIN_SIZE):
    """Randomly split a DataFrame into training and validation parts.

    Args:
        data: DataFrame to split.
        S: Fraction of rows (0-1) placed in the training part. Defaults to
            ``TRAIN_SIZE``.

    Returns:
        Tuple ``(train_data, val_data)``; ``val_data`` holds the rows whose
        index labels were not drawn for ``train_data``.
    """
    # Honour the caller's fraction; the body previously ignored S and always
    # read the global TRAIN_SIZE.
    train_data = data.sample(frac=S)
    # Dropping by index label assumes the index is unique; duplicated labels
    # would remove every row sharing a sampled label.
    val_data = data.drop(train_data.index)

    return train_data, val_data

### Loading Tokenizer, Config and Model

In [None]:
def get_tokenier(special_tokens=None):
    """Load the tokenizer for ``MODEL``, optionally registering special tokens.

    Args:
        special_tokens: Optional mapping passed to
            ``tokenizer.add_special_tokens``; skipped when falsy.

    Returns:
        The loaded tokenizer.
    """
    tok = AutoTokenizer.from_pretrained(MODEL)

    if not special_tokens:
        return tok

    tok.add_special_tokens(special_tokens)
    print("Special tokens added")
    return tok

def get_model(tokenizer, special_tokens=None, load_model_path=None):
    """Build the causal language model for ``MODEL`` and move it to the GPU.

    Args:
        tokenizer: Tokenizer whose token ids are copied into the model config
            and whose length sizes the embedding matrix.
        special_tokens: When truthy, the config takes bos/eos/sep/pad ids from
            ``tokenizer`` and the embeddings are resized to ``len(tokenizer)``.
            When falsy, only ``pad_token_id`` is set, to the tokenizer's eos id.
        load_model_path: Optional path to a saved ``state_dict`` loaded on top
            of the pretrained weights.

    Returns:
        The model, on CUDA when available and on CPU otherwise.
    """
    # Local import: AutoModelForCausalLM is not among the file-level imports.
    from transformers import AutoModelForCausalLM

    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL,
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else:
        config = AutoConfig.from_pretrained(MODEL,
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)

    #----------------------------------------------------------------#
    # Callers use model.generate(), so load through the causal-LM auto class,
    # which resolves BioGPT to BioGptForCausalLM (and GPT-2 to GPT2LMHeadModel).
    model = AutoModelForCausalLM.from_pretrained(MODEL, config=config)

    if special_tokens:
        # Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        # Load onto CPU first so a checkpoint saved on a GPU still opens;
        # the model is moved to its final device below.
        model.load_state_dict(torch.load(load_model_path, map_location='cpu'))

    model.to('cuda' if torch.cuda.is_available() else 'cpu')
    return model

# Load Pretrained Model

In [None]:
# Load tokenizer and model from the same checkpoint id so they cannot drift
# apart if MODEL is changed.
# NOTE(review): SPECIAL_TOKENS are not registered on this tokenizer, so the
# marker strings added by the dataset are tokenized as ordinary text.
tokenizer = BioGptTokenizer.from_pretrained(MODEL)
model = BioGptForCausalLM.from_pretrained(MODEL)

In [None]:
# Wrap both frames as tokenizing datasets; the test split serves as the
# evaluation set during training.
train_dataset = myDataset(data=train_df, tokenizer=tokenizer)
val_dataset = myDataset(data=test_df, tokenizer=tokenizer, randomize=False)
f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

### Fine-tune Biogpt using Trainer

In [None]:
%%time

# Fine-tune with the Hugging Face Trainer: evaluate and checkpoint once per
# epoch, keep a single checkpoint, and reload the best one when training ends.
training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    # load_best_model_at_end needs matching evaluation and save strategies.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    # warmup_steps is an integer step count; coerce in case the constant is
    # written as a float literal such as 1e2.
    warmup_steps=int(WARMUP_STEPS),
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    # The string "none" disables every reporting integration; passing None
    # falls back to the library default instead.
    report_to="none",
)

#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
trainer.train()
# Write the final model to output_dir.
trainer.save_model()

### Generating text with Fine-tuned Biogpt model

In [None]:
# !cp -r '/content/drive/MyDrive/Colab Notebooks/Text Generation/pytorch_model_V2.bin' 'pytorch_model.bin' 

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [None]:
# Fine-tuned weights come from the Kaggle working directory; the tokenizer is
# the stock one from the hub.
model = BioGptForCausalLM.from_pretrained('/kaggle/working/')
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

In [None]:
# Build the prompt in the training layout (<bos> title <sep>) so the model
# continues with the answer text.
title = "i am 24 year old male recently i have been suffering from knee pain what can be the cause of it?"
prompt = "".join([SPECIAL_TOKENS['bos_token'], title, SPECIAL_TOKENS['sep_token']])

# Encode as a batch of one and place the ids and the model on the GPU.
device = torch.device("cuda")
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)
model.cuda()
# The trailing semicolon stops the notebook from echoing the model repr.
model.eval();

In [None]:
# Top-p (nucleus) text generation (10 samples):
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=10,                                 
                                top_p=0.9,        
                                temperature=0.9,
                                repetition_penalty=2.0,
                                num_return_sequences=10
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(title)  
    print("{}: {}\n\n".format(i+1,  text[a:]))

## Applying Beam search technique

Beam search is a decoding algorithm used in natural language processing (NLP) to generate a likely sequence of words from a language model. At each step it explores several candidate continuations and keeps only a limited set (the beam width) of the most promising partial sequences. This helps find higher-quality sequences in tasks such as machine translation and open-ended text generation. Note that the cell below also sets `do_sample=True`, so tokens are sampled within the beams (beam-search multinomial sampling) rather than chosen deterministically.


In [None]:
# Beam-search text generation:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(title) 
    print("{}: {}\n\n".format(i+1,  text[a:]))

### Comparison with raw (not fine-tuned) BioGPT

In [None]:
# Baseline: stock tokenizer and pretrained weights for MODEL, with no special
# tokens and no fine-tuned state dict.
tokenizer = get_tokenier(special_tokens=None)
model = get_model(tokenizer=tokenizer)

In [None]:
# The baseline is prompted with the bare question (no marker strings).
prompt = title

device = torch.device("cuda")
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)

model.eval()
# Same decoding settings as the fine-tuned beam run, for a like-for-like
# comparison.
sample_outputs = model.generate(
    generated,
    do_sample=True,
    max_length=MAXLEN,
    num_beams=5,
    repetition_penalty=5.0,
    early_stopping=True,
    num_return_sequences=1,
)

# Print the full decoded text (prompt included), numbered from 0.
for i, sample_output in enumerate(sample_outputs):
    decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {}\n\n".format(i, decoded))