In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![Gemma+lora](https://storage.googleapis.com/gweb-uniblog-publish-prod/images/gemma-header.width-1200.format-webp.webp)

## Setup

### Get access to Gemma

Step 1: The Gemma setup instructions show how to do the following:

Gemma models are hosted by Kaggle. To use Gemma, request access on Kaggle:

* Sign in or register at [kaggle.com](https://www.kaggle.com/)
* Open the [Gemma model card](https://www.kaggle.com/models/google/gemma) and select "Request Access"
* Complete the consent form and accept the terms and conditions

### Install dependencies
Install Keras, KerasNLP, and other dependencies.

In [None]:
# Install Keras 3 last. See https://keras.io/getting_started/ for more details.
!pip install -q -U keras-nlp
!pip install -q -U keras>=3
# !pip install -U datasets huggingface_hub fsspec

### Select a backend
Keras is a high-level, multi-framework deep learning API designed for simplicity and ease of use. Using Keras 3, we can run workflows on one of three backends: TensorFlow, JAX, or PyTorch.

For this notebook, configure the backend for JAX.

In [None]:
import os

# Select the Keras backend ("jax" here; "torch" or "tensorflow" also work) and
# avoid memory fragmentation on the JAX backend.
os.environ.update(
    {
        "KERAS_BACKEND": "jax",
        "XLA_PYTHON_CLIENT_MEM_FRACTION": "1.00",
    }
)

### Import packages
Import Keras and KerasNLP.

In [None]:
import keras
import keras_nlp

### Dataset

This ["MT Samples"](https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions) dataset provides a solution to the challenge of acquiring medical data due to HIPAA privacy regulations by offering sample medical transcriptions from various specialties. Comprising 5,000 rows, it enables the classification of medical specialties based on the transcription text. The data was sourced from mtsamples.com, and it serves as a valuable resource for researchers and developers aiming to enhance their understanding of medical language processing and improve classification models in healthcare applications.

The ["Medical Transcripts"](https://huggingface.co/datasets/DataFog/medical-transcription-instruct) dataset contains 38,924 samples of instruct-input-output data tailored for training instruction-following models in the medical field. It is sourced from original medical transcriptions and formatted in CSV, featuring instruction-output pairs. Each row includes an instruction, task output, transcription text, description, medical specialty, sample name, and both original and derived keywords. Additional columns provide transcription length, normalized length, and a complexity score. Tasks include identifying medical specialties, summarizing transcriptions, extracting keywords, assessing text complexity, and suggesting follow-up questions. The dataset focuses on the medical domain, supporting diverse instruction-following tasks in healthcare.


### Load Dataset

In [None]:
import pyarrow.parquet as pq
import pandas as pd

# Load the MT Samples CSV from the attached Kaggle dataset into a DataFrame.
# NOTE(review): `pq` is not used in this cell; the data is read as CSV, not Parquet.
df = pd.read_csv('/kaggle/input/medicaltranscriptions/mtsamples.csv')

# Alternative source (disabled): the DataFog instruct dataset on the Hugging Face Hub.
# df = pd.read_csv("hf://datasets/DataFog/medical-transcription-instruct/datafog-medical-transcription-instruct.csv")

# Show the first rows as the cell output to check that the load worked.
df.head()  # Display first few rows of the DataFrame


In [None]:
# from sklearn.model_selection import train_test_split

# train, test = train_test_split(df, test_size=0.8)

# # Further split the train set to keep exactly 27,000 in the train set
# train_final, extra_for_test = train_test_split(train, test_size=1677/len(train), random_state=42)

# # Use pd.concat to combine the extra_for_test with the test set
# test_final = pd.concat([test, extra_for_test])

In [None]:
# Build one training example (a single formatted string) per usable row.
#
# pandas reads empty CSV cells as NaN, and an f-string renders NaN as the literal
# text "nan". To keep that out of the training data, rows without a transcription
# are skipped and any other missing field is replaced with an empty string.
TEXT_COLUMNS = ['transcription', 'keywords', 'description', 'medical_specialty']

examples_df = df.dropna(subset=['transcription'])[TEXT_COLUMNS].fillna('')

# itertuples(name=None) yields plain tuples in TEXT_COLUMNS order.
formatted_data = [
    f'Transcription:\n{transcription}\n\nKeywords:\n{keywords}\n\nMedical Specialty:{medical_specialty}\n\nDescription: {description}'
    for transcription, keywords, description, medical_specialty
    in examples_df.itertuples(index=False, name=None)
]

# Preview the first formatted example as a sanity check.
if formatted_data:
    print(formatted_data[0])
    


In [None]:
# Optional: uncomment to train on a small subset while experimenting.
# formatted_data = formatted_data[:200]

# Report how many formatted training examples are available.
num_examples = len(formatted_data)
print(num_examples)

### Load Model

KerasNLP provides implementations of many popular model architectures. In this, the model is created using GemmaCausalLM, an end-to-end Gemma model for causal language modeling. A causal language model predicts the next token based on previous tokens.

Create the model using the from_preset method:

In [None]:
# import shutil

# # Zip the file so it can be downloaded
# shutil.make_archive('/kaggle/working/gemma-medtr-2b-v2', 'zip', '/kaggle/working/gemma-medtr-2b-v2')

# # The file will be available for download from the output section of your notebook


In [None]:
# Instantiate the Gemma causal LM from the preset directory attached as Kaggle
# input, then print its layer and parameter summary.
GEMMA_PRESET_PATH = "/kaggle/input/gemma/keras/gemma_2b_en/2"
gemma_model = keras_nlp.models.GemmaCausalLM.from_preset(GEMMA_PRESET_PATH)
gemma_model.summary()


The from_preset method instantiates the model from a preset architecture and weights. In the code above, the string "gemma_2b_en" specifies the preset architecture — a Gemma model with 2 billion parameters.

### Inference before fine tuning
In this section, we will query the model with various prompts to see how it responds.

#### laparoscopic gastric bypass Prompt
Query the model for laparoscopic gastric bypass transcription.

In [None]:
# Baseline: query the model before any fine-tuning has been applied.
prompt = "tell me something about treating laparoscopic gastric bypass with an example"
baseline_response = gemma_model.generate(prompt, max_length=256)
print(baseline_response)

### LoRA Fine-tuning

To get better responses from the model, fine-tune the model with Low Rank Adaptation (LoRA) using the mt samples dataset.

LoRA (Low-Rank Adaptation of Large Language Models) has gained popularity as a lightweight training method that slashes the number of parameters to train. Instead of adjusting every single weight in the model, LoRA adds a smaller set of new weights and focuses training solely on them. This approach speeds up training, saves memory, and generates smaller model sizes (just a few 100 MBs), making them simpler to store and share. To speed up training more, it can also be combined with other training techniques like dreambooth.

Having a higher rank allows for more detailed adjustments, which can enhance precision but also increases the number of parameters to train. On the other hand, a lower rank reduces computational burden but might result in less precise adaptation.

For this, a LoRA rank of 8 is utilized. It's recommended to start with a relatively small rank, like 4, 8, or 16, for computational efficiency during experimentation. Train the model with a small rank initially and assess its performance improvement on the specific task. As we progress, we can gradually increase the rank in subsequent experiments to observe if it leads to further enhancements in performance.

In [None]:
# Turn on LoRA for the backbone with a rank of 8, then print the summary again
# so the change in trainable parameters is visible.
LORA_RANK = 8
gemma_model.backbone.enable_lora(rank=LORA_RANK)
gemma_model.summary()

> Note that enabling LoRA reduces the number of trainable parameters significantly (from 2.5 billion to roughly 2.7 million at rank 8; rank 4 would give about 1.3 million).

### Hyperparameters for lora
We are defining only rank, rest will be default hyperparameters.

Rank (r): 8
The code above uses a rank of 8 for the decomposition matrices. A rank of 4 already maintains the efficiency gains while still achieving solid performance, and testing higher ranks, like 8 or 16, showed only minimal performance improvements. Still, a rank of 8 is recommended if the hardware allows it, as it keeps checkpoint sizes manageable without sacrificing too much accuracy.

Alpha (lora_alpha):
Alpha scales the learned weights. Based on existing [literature](https://arxiv.org/pdf/2308.07317v1.pdf) and recommendations from the original [LoRA paper](https://arxiv.org/abs/2106.09685), keeping Alpha fixed rather than treating it as a tunable hyperparameter is a common practice in the LLM community; the suggested value is 16.

Target Modules: Query and Value Dense Layers
While the original LoRA paper focused on fine-tuning only the "Q" and "V" attention matrices, subsequent research suggested that targeting [additional layers](https://arxiv.org/pdf/2110.04366.pdf), or even [all layers](https://arxiv.org/abs/2305.14314), could yield better results. [In the backend](https://github.com/keras-team/keras-nlp/blob/v0.8.1/keras_nlp/models/backbone.py#L156), the KerasNLP code targets the "query_dense" and "value_dense" layers in the attention layers when enabling LoRA.

Base Learning Rate: 1e-4
A base learning rate of 1e-4 has become the standard for fine-tuning LLMs with LoRA. [Some](https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2) have encountered occasional training loss instabilities, and lowering the learning rate to values like 3e-5 helped them stabilize the process. This notebook uses a more conservative 5e-5 in the optimizer configuration below.

In [None]:
import os

import numpy as np

# Release PyTorch's cached GPU memory before training, but only when PyTorch is
# the selected Keras backend. With the JAX backend configured earlier in this
# notebook, torch does not hold the GPU memory, so the call would only pull
# torch into the process.
if os.environ.get("KERAS_BACKEND") == "torch":
    import torch
    torch.cuda.empty_cache()

# Number of training examples, displayed as a 1-tuple like an array shape.
# Counting the list directly avoids building a fixed-width NumPy string array
# (every element padded to the longest example) just to read its length.
(len(formatted_data),)

In [None]:
# Configure and run LoRA fine-tuning on the formatted examples.

# Limit the input sequence length to 512 (to control memory usage).
# This must be set before fit() so it applies to the training inputs.
gemma_model.preprocessor.sequence_length = 512
# Use AdamW (a common optimizer for transformer models).
optimizer = keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)
# Exclude layernorm and bias terms from decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# from_logits=True: the loss is configured to receive raw logits rather than
# probabilities. Accuracy is registered under weighted_metrics.
gemma_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
# Train on the raw list of formatted strings for 4 epochs, one example per batch.
# NOTE(review): presumably the model's attached preprocessor tokenizes the strings; confirm.
gemma_model.fit(formatted_data, epochs=4, batch_size=1)

In [None]:
# Upgrade kagglehub before the upload cells below.
# NOTE(review): presumably required by keras_nlp.upload_preset for kaggle:// URIs; confirm.
!pip install --upgrade kagglehub

In [None]:
# Write the fine-tuned model to a local preset directory; the Kaggle upload
# cell below reads from this same path.
gemma_model.save_to_preset("./gemma_medtr_2b_v5")

In [None]:
# Upload the saved preset directory to Kaggle Models under the given kaggle:// URI.
# NOTE(review): URI segments look like <user>/<model>/<framework>/<variation>; confirm against the KerasNLP docs.
kaggle_uri = "kaggle://harishiker99/gemma_medtr_2b_v5/keras/gemma_medtr_2b_v5"
keras_nlp.upload_preset(kaggle_uri, "./gemma_medtr_2b_v5")

In [None]:
import os

import huggingface_hub

# Target repository on the Hugging Face Hub.
hf_username = "harishnair04"
model_variant_name = "gemma-medtr-2b-v5"

uri = f"hf://{hf_username}/{model_variant_name}"
print(uri)

# Never hardcode access tokens in a notebook: anyone who can read the file can
# use them. Read the token from the environment instead. The token that was
# previously embedded here should be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face access token with write "
        "permission as the HF_TOKEN environment variable before running this cell."
    )
huggingface_hub.login(token=hf_token, add_to_git_credential=True)

# Upload the directory written by save_to_preset. The previous path
# ("/kaggle/working/gemma_medtr-2b-v5") did not match the saved directory name.
keras_nlp.upload_preset(uri, "./gemma_medtr_2b_v5")

### Inference after fine-tuning

We're training the model on a fairly small dataset for only a few epochs and with a low LoRA rank setting. If improved responses are required from the fine-tuned model, we might want to try out the following:

- Expanding the size of the dataset used for fine-tuning.
- Adjusting the LoRA rank to a higher value.
- Increasing the number of training steps (epochs).
- Tweaking the hyperparameter values like learning_rate and weight_decay.

In [None]:
# Repeat the earlier query so the fine-tuned output can be compared with the baseline.
prompt = "tell me something about treating laparoscopic gastric bypass with an example"
finetuned_response = gemma_model.generate(prompt, max_length=256)
print(finetuned_response)