<a href="https://colab.research.google.com/github/idowujames/-Kenya-Clinical-Reasoning-Challenge/blob/main/Kenya_Clinical_Reasoning_Challenge_Training(BioGPT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training our model: we start by fine-tuning BioGPT

In [1]:
# Install sacremoses for BioGPT
!pip install sacremoses --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m686.1/897.5 kB[0m [31m25.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m890.9/897.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM
    )

### Loading the dataset

In [3]:
# Read the cleaned training data and wrap it as a Hugging Face Dataset
# so it can be split and tokenized with the `datasets` API.
DATASET_CSV = 'cleaned_dataset.csv'
df = pd.read_csv(DATASET_CSV)

# Quick sanity check on the number of rows and columns that were loaded.
print(df.shape)

dataset = Dataset.from_pandas(df)

(386, 14)


### Dataset splitting

In [4]:
# Hold out 10% of the rows for validation and train on the remaining 90%.
# The fixed seed keeps the split identical across runs.
splits = dataset.train_test_split(test_size=0.1, seed=42)
train = splits['train']
val = splits['test']

# Show the (rows, columns) shape of each split, training first.
print(train.shape, val.shape, sep='\n')

(347, 14)
(39, 14)


### Tokenization

In [None]:
# Identifier of the base checkpoint; reused later when the model weights are loaded.
model_name = 'microsoft/biogpt'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Batches need a padding token: if the tokenizer does not define one,
# reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenizer function used with Dataset.map(batched=True)
def tokenize_function(examples):
    """Tokenize a batch of prompt/response pairs as single training sequences.

    Every sequence is laid out as ``prompt + EOS + response + EOS``: the first
    EOS token separates the prompt from the clinician response and the second
    one marks the end of the response, so the model sees the full structure of
    a prompt-response pair.

    Args:
        examples: A batch with 'Prompt' and 'Clinician' columns. The values
            are concatenated with the EOS token, so they must be strings.

    Returns:
        The output of the module-level ``tokenizer`` for the combined texts,
        truncated to at most 256 tokens per sequence.
    """
    eos = tokenizer.eos_token

    text_with_format = []
    for prompt, response in zip(examples['Prompt'], examples['Clinician']):
        text_with_format.append(prompt + eos + response + eos)

    return tokenizer(text_with_format, truncation=True, max_length=256)


def _tokenize_split(split):
    """Tokenize one dataset split, dropping its original columns so only tokenizer outputs remain."""
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
    )


# Applying the function to both splits.
# Note: `tokenized_test_dataset` holds the validation split (`val`).
tokenized_train_dataset = _tokenize_split(train)
tokenized_test_dataset = _tokenize_split(val)

print("Tokenization Complete")
print(tokenized_train_dataset[0])

### Training our model

In [None]:
# Fine-tuning the BioGPT model on our data

# Load the pretrained causal language model that will be fine-tuned.
model = AutoModelForCausalLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=50,
    weight_decay=0.01,               # Regularization to prevent overfitting
    logging_steps=10,

    # Evaluate and checkpoint on the same schedule so that every evaluation
    # has a matching checkpoint, which load_best_model_at_end relies on.
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    # Without a limit, a checkpoint is written every 50 steps and kept forever,
    # which can fill the disk. Only the best and the most recent are needed.
    save_total_limit=2,
    load_best_model_at_end=True,

    report_to="none",
)

# The data collator takes the tokenized samples and batches them together,
# handling padding dynamically. mlm=False selects the causal (next-token)
# language-modeling objective rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initializing the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,   # the validation split
    data_collator=data_collator,           # The dynamic padding helper
)

# Starting the Fine-Tuning
print("--- Starting Fine-Tuning ---")
trainer.train()

# Save the fine-tuned model together with its tokenizer so that both can be
# reloaded from the same directory for inference.
final_model_path = "./final_biogpt_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final model saved to {final_model_path}")

# Compressing and downloading the fine-tuned model as a ZIP file

In [None]:
import os
import zipfile
from google.colab import files
import shutil

def download_model_as_zip(model_folder="./final_biogpt_model", zip_filename="biogpt_model.zip"):
    """Compress a saved model directory into a zip archive and download it from Colab.

    Args:
        model_folder: Directory that contains the saved model files.
        zip_filename: Name of the zip archive to create. If it does not end in
            ".zip", the suffix is appended by ``shutil.make_archive``.

    Raises:
        FileNotFoundError: If ``model_folder`` does not exist, e.g. because the
            training cell has not been run in this session.
    """
    if not os.path.isdir(model_folder):
        raise FileNotFoundError(
            f"Model folder not found: {model_folder}. Run the training cell first."
        )

    # make_archive appends the ".zip" suffix itself, so pass it the name without the extension.
    if zip_filename.lower().endswith(".zip"):
        archive_base = zip_filename[:-len(".zip")]
    else:
        archive_base = zip_filename

    # Create zip file
    print(f"Compressing {model_folder} into {zip_filename}")
    # Use the path returned by make_archive so the size check and the download
    # always refer to the file that was actually written.
    archive_path = shutil.make_archive(archive_base, 'zip', model_folder)

    # Checking file size (bytes -> MB)
    zip_size = os.path.getsize(archive_path) / (1024 * 1024)
    print(f"Zip file size: {zip_size:.2f} MB")

    # Downloading the zip file through the Colab browser session
    print("Starting download")
    files.download(archive_path)
    print("Download complete! Check your Downloads folder.")

# Run this method
download_model_as_zip()

# Running Inference On The Test Data

In [10]:
import torch

final_model_path = "./final_biogpt_model"

# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Loading the fine-tuned model and moving it to the selected device
model = AutoModelForCausalLM.from_pretrained(final_model_path).to(device)

# Make inference mode explicit (disables dropout) before generating text.
model.eval()

# Loading the tokenizer of our fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(final_model_path)

In [None]:
# Loading the test data
test_df = pd.read_csv('test.csv')
test_df.head()

### Starting the inference

In [None]:
generated_responses = []

# Budget for the generated response only. Using max_new_tokens (instead of
# max_length, which also counts the prompt tokens) guarantees that long prompts
# still leave room for an answer.
max_new_tokens = 256
total_prompts = len(test_df)

print("Starting Inference on Test Data")

# Loop through each prompt in the test DataFrame. `position` is a 1-based
# counter, so progress reporting does not depend on the DataFrame's index labels.
for position, prompt_text in enumerate(test_df['Prompt'], start=1):

    # Formatting the input just like in training:
    # the eos_token signals to the model where the prompt ends and generation should begin.
    formatted_prompt = prompt_text + tokenizer.eos_token

    # Tokenizing the formatted prompt and moving it to the device the model lives on
    inputs = tokenizer(formatted_prompt, return_tensors='pt').to(model.device)
    prompt_length = inputs['input_ids'].shape[1]

    # Generate the output from the model
    output_sequences = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,  # Max length for the generated response
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    # --- Post-processing: Cleaning the output ---
    # The generated sequence starts with the prompt tokens. Drop them by token
    # count rather than by character count: the decoded text is not guaranteed
    # to reproduce the original prompt string character for character, so a
    # character-offset slice could cut the response in the wrong place.
    response_ids = output_sequences[0][prompt_length:]
    final_response = tokenizer.decode(response_ids, skip_special_tokens=True).strip()

    # Add the cleaned response to our list
    generated_responses.append(final_response)

    # Print progress every 10 prompts
    if position % 10 == 0:
        print(f"Generated response for {position}/{total_prompts} prompts.")

print("Inference Complete")

In [None]:
# Adding the generated text to the test dataframe to see how well the text
# accurately answers the prompts
test_df['Generated Response'] = generated_responses

test_df.sample(5)

### From the outputs, it appears the fine-tuned model is prone to hallucination. The model's underlying medical knowledge learned from our small dataset is not robust enough, causing it to generate plausible-sounding but incorrect information.

### The next step, after some hyper-parameter tuning, is to try fine-tuning another base model that may work better for our dataset.
