In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm import tqdm
from torch.utils.data import DataLoader
tqdm.pandas()

**Data Preprocessing**

In [3]:
# Load the question/answer dataset from a JSON file; the path is relative,
# so it resolves against the kernel's current working directory.
medical_data = pd.read_json("datasets/final.json")

In [4]:
# Render the loaded frame to inspect its question and answer columns.
display(medical_data)

Unnamed: 0,question,answer
0,My wife Misscarried after 8 week and doctor as...,helloyour semen analysis suggests low sperm co...
1,What information can be obtained regarding Acy...,Before taking acyclovir:\ntell your doctor and...
2,Are low plasma adiponectin levels associated w...,These results demonstrate for the first time a...
3,Does helicobacter pylori infection reduce syst...,H. pylori substantially impairs the bio-availa...
4,Does g-protein-coupled bile acid receptor play...,This study suggests that TGR5 may play a role ...
...,...,...
473971,"helo sir, my 2 year old daughter have constant...","hello fatima, welcome to chatbot forum. your h..."
473972,I had an MRI scan of my brain last night at 6,"hi, thank you for posting your query. we need ..."
473973,Does vitamin D Status be Related to Oxidative ...,Our findings show lower levels of TAC and 25(O...
473974,Do serum interleukin 8 and 12 levels predict s...,Serum IL-8 and -12 levels were markedly elevat...


In [5]:
# Build one text sample per row: the question followed by its answer,
# each introduced by a literal marker.
medical_data['combined'] = (
    "Question : "
    + medical_data['question']
    + " Answer : "
    + medical_data['answer']
)

**Initialization of the model and tokenizer**

In [6]:
# Single source of truth for the checkpoint id, so the tokenizer and the model
# weights are always loaded from the same pretrained release.
MODEL_CHECKPOINT = "microsoft/biogpt"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

In [7]:
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Splitting on arbitrary whitespace (rather than on a single space) means
    that runs of spaces, leading/trailing spaces, newlines and tabs do not
    produce empty "words", and an empty string counts as zero words.

    Args:
        text: The string to measure.

    Returns:
        The number of non-empty whitespace-delimited tokens in ``text``.
    """
    return len(text.split())

# Store the per-row word total of the combined text in its own column
combined_text = medical_data['combined']
medical_data['word_count'] = combined_text.apply(word_count)


In [8]:
# Row count of the frame at this point, before the length filter below runs.
len(medical_data)

473976

**Keep samples whose combined text has fewer than 1020 words**

In [9]:
# Keep only the rows whose combined text is below the 1020-word cutoff.
is_short_enough = medical_data['word_count'] < 1020
medical_data = medical_data[is_short_enough]

**Splitting the dataframe into training, validation, and test sets**

In [10]:
# Split the dataset 80/10/10 into train / validation / test.
# A fixed seed makes the shuffling deterministic, so the three splits are the
# same after every kernel restart and results stay comparable between runs.
SPLIT_SEED = 42
train_med, holdout_med = train_test_split(medical_data, test_size=0.2, random_state=SPLIT_SEED)
# Halve the 20% hold-out into equally sized validation and test sets.
val_med, test_med = train_test_split(holdout_med, test_size=0.5, random_state=SPLIT_SEED)

In [11]:
def tokenize_dataset(dataset, max_samples=1000, max_length=1024):
    """Tokenize the ``combined`` text column of ``dataset``, one sample at a time.

    Each text is lower-cased, then truncated/padded to exactly ``max_length``
    tokens by the module-level ``tokenizer``.

    Args:
        dataset: Frame-like object exposing a ``'combined'`` column of strings.
        max_samples: Only the first ``max_samples`` rows are tokenized.
            Defaults to 1000 (the previously hard-coded cap); pass ``None``
            to tokenize every row.
        max_length: Fixed token length each sample is padded/truncated to.

    Returns:
        A list with one tokenizer output per processed row; because
        ``return_tensors='pt'`` is used, each tensor in it carries a leading
        batch dimension of size 1.

    Raises:
        ValueError: If ``max_samples`` is negative (a negative slice bound
            would silently drop rows from the end instead of capping).
    """
    if max_samples is not None and max_samples < 0:
        raise ValueError("max_samples must be None or a non-negative integer")

    # Slicing with None keeps the whole list, so one expression covers both modes.
    texts = dataset['combined'].tolist()[:max_samples]
    return [
        tokenizer(
            text.lower(),
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )
        for text in tqdm(texts, desc="Tokenizing")
    ]

In [12]:
# Tokenize the validation split first, then the test split.
tokenized_val, tokenized_test = [
    tokenize_dataset(split) for split in (val_med, test_med)
]

Tokenizing: 100%|██████████| 1000/1000 [00:01<00:00, 989.16it/s]
Tokenizing: 100%|██████████| 1000/1000 [00:00<00:00, 1135.44it/s]


In [30]:
class BioGPTDataset(Dataset):
    """Map-style dataset over pre-tokenized samples for causal-LM fine-tuning.

    Args:
        encodings: Sequence with one tokenizer output per sample. Each output
            maps field names (``input_ids``, normally ``attention_mask``) to
            tensors, optionally with a leading batch dimension of size 1.
        tokenizer: Object exposing ``pad_token_id``; used to find padding
            when a sample has no ``attention_mask``.
    """

    # Label value that the cross-entropy loss skips.
    IGNORE_INDEX = -100

    def __init__(self, encodings, tokenizer):
        self.encodings = encodings
        self.pad_token_id = tokenizer.pad_token_id

    def __getitem__(self, idx):
        """Return the tensors of sample ``idx`` plus a ``labels`` tensor."""
        # Drop only the leading batch dimension; a bare squeeze() would also
        # collapse a length-1 sequence dimension.
        item = {
            key: (val.squeeze(0) if val.dim() > 1 else val)
            for key, val in self.encodings[idx].items()
        }

        input_ids = item['input_ids']
        # Labels are an unshifted copy of the inputs: the causal-LM head
        # shifts them internally, so shifting here as well would train the
        # model to predict two tokens ahead.
        labels = input_ids.clone()

        # Exclude padding positions from the loss.
        if 'attention_mask' in item:
            labels[item['attention_mask'] == 0] = self.IGNORE_INDEX
        elif self.pad_token_id is not None:
            labels[input_ids == self.pad_token_id] = self.IGNORE_INDEX

        item['labels'] = labels
        return item

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.encodings)

def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Args:
        batch: List of dicts, each holding ``input_ids``, ``attention_mask``
            and ``labels`` tensors of matching shapes.

    Returns:
        A dict with the same three keys, each mapped to the samples' tensors
        stacked along a new leading dimension.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in fields
    }

In [28]:
# Wrap the tokenized validation and test splits as torch datasets.
val_dataset, test_dataset = [
    BioGPTDataset(encodings, tokenizer)
    for encodings in (tokenized_val, tokenized_test)
]

In [173]:
# Only the validation and test splits were tokenized in the earlier cells, so
# the training split is prepared here. Fitting on the validation split and
# selecting checkpoints on the test split would leave no unseen data to
# report final numbers on.
train_dataset = BioGPTDataset(tokenize_dataset(train_med), tokenizer)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for model checkpoints
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    # Warm up over the first 10% of steps. A fixed 500 warmup steps exceeds
    # the whole run when only ~1000 samples are tokenized (about 189 steps on
    # one device), so the learning rate would never reach its peak.
    warmup_ratio=0.1,
    weight_decay=0.01,               # Weight decay if we apply some
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every X updates steps
    evaluation_strategy="steps",     # Evaluation is done (and logged) every X steps
    eval_steps=10,                   # Explicit; must divide save_steps for best-model loading
    save_steps=10,                   # Model is saved every X steps
    save_total_limit=2,              # Cap the number of checkpoints kept on disk
    load_best_model_at_end=True      # Load the best model at the end of training
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,     # Training dataset
    eval_dataset=val_dataset,        # Validation split; test_dataset stays held out
    data_collator=collate_fn,        # Stack the per-sample tensors into batches
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()