In [2]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/gemma/keras/gemma_2b_en/2/config.json
/kaggle/input/gemma/keras/gemma_2b_en/2/tokenizer.json
/kaggle/input/gemma/keras/gemma_2b_en/2/metadata.json
/kaggle/input/gemma/keras/gemma_2b_en/2/model.weights.h5
/kaggle/input/gemma/keras/gemma_2b_en/2/assets/tokenizer/vocabulary.spm


# Finetune Cleaned Data

The goal of this notebook is to fine-tune Gemma on the cleaned dataset using Keras and JAX, and then push the fine-tuned model to a Google Cloud Storage bucket using a service account. The green sections of the diagram below are covered in this notebook.

![](https://raw.githubusercontent.com/komus/MedQuAD/refs/heads/master/kaggleX%20Chatbot.drawio%20(1).png)

In [None]:
# Install/upgrade the libraries used below: KerasNLP and Keras for the Gemma
# model, the Hugging Face stack (accelerate, sentencepiece, transformers) for
# the converted model, and the Google Cloud client libraries.
# NOTE(review): versions are unpinned, so a re-run may pick up different releases.
%pip install --upgrade --quiet keras-nlp
%pip install --upgrade --quiet keras
%pip install --upgrade --quiet accelerate sentencepiece transformers
%pip install --quiet google-cloud-secret-manager
%pip install --upgrade --quiet google-cloud-aiplatform

In [4]:
from kaggle_secrets import UserSecretsClient

# These environment variables are set here, before the later cell that imports
# keras, so the backend choice is already in place at import time.
os.environ["KERAS_BACKEND"] = "jax"
# NOTE(review): presumably lets JAX/XLA use up to 90% of GPU memory — confirm
# against the JAX GPU memory allocation docs.
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "0.9"

# One client is enough; reuse it for every secret instead of building a new
# client per lookup.
user_secrets = UserSecretsClient()
os.environ['KAGGLE_KEY'] = user_secrets.get_secret("KAGGLE_KEY")
os.environ['KAGGLE_USERNAME'] = user_secrets.get_secret("KAGGLE_USERNAME")
user_credential = user_secrets.get_secret("KEYS")

# Write the service-account key to disk for the gcloud CLI. The file holds a
# credential, so create it readable/writable by the owner only (0o600)
# instead of relying on the process umask.
s_auth = "key.json"
key_fd = os.open(s_auth, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(key_fd, "w") as f:
    f.write(user_credential)

# Shell cells read the key-file path from this environment variable.
os.environ['AUTHS'] = s_auth

In [5]:
# Delete the on-disk copy of the service-account key written by the previous cell.
# NOTE(review): a later cell passes this same path (via $AUTHS) to
# `gcloud auth activate-service-account --key-file`; the file has to exist
# again by the time that cell runs.
os.unlink("key.json")

In [None]:
import datetime
import json
import locale

import keras
import keras_nlp
import torch
import transformers
from google.cloud import aiplatform
from numba import cuda

In [None]:
# --- Model ------------------------------------------------------------------
MODEL_NAME = "gemma_2b_en"
# Second-to-last "_"-separated token of the preset name; "2b" for "gemma_2b_en".
MODEL_SIZE = MODEL_NAME.split("_")[-2]
assert MODEL_SIZE in ("2b", "7b"), f"Unsupported Gemma size: {MODEL_SIZE!r}"

# --- Dataset ----------------------------------------------------------------
# Percentage (1-100) of the shuffled dataset used for fine-tuning.
TRAIN_RATIO = 30
DATASET_NAME = "output_medplus"
DATASET_PATH = f"{DATASET_NAME}.jsonl"
# Derived from DATASET_PATH so the local file name and the remote file name
# cannot drift apart.
DATASET_URL = f"https://raw.githubusercontent.com/komus/MedQuAD/refs/heads/master/{DATASET_PATH}"

# --- Local output locations -------------------------------------------------
FINETUNED_MODEL_DIR = f"./{MODEL_NAME}_{DATASET_NAME}"
FINETUNED_WEIGHTS_PATH = f"{FINETUNED_MODEL_DIR}/model.weights.h5"
FINETUNED_VOCAB_PATH = f"{FINETUNED_MODEL_DIR}/vocabulary.spm"

HUGGINGFACE_MODEL_DIR = f"./{MODEL_NAME}_huggingface"

# --- Google Cloud -----------------------------------------------------------
# Project-specific values come from Kaggle secrets; a single client serves
# all lookups.
secrets_client = UserSecretsClient()
PROJECT_ID = secrets_client.get_secret("PROJECT_ID")
REGION = "us-central1"
BUCKET_URI = secrets_client.get_secret("BUCKET_URI")
SERVICE_ACCOUNT = secrets_client.get_secret("SERVICE_ACCT")
# Strip a trailing "/" so the joined URI never contains "//".
DEPLOYED_MODEL_URI = f"{BUCKET_URI.rstrip('/')}/{MODEL_NAME}"

In [None]:
!wget -nv -nc -O $DATASET_PATH $DATASET_URL

In [None]:
# Make bfloat16 the Keras default float dtype; this runs before the model is
# created in the next cell.
keras.config.set_floatx("bfloat16")

In [None]:
# Load the Gemma causal language model for the preset named by MODEL_NAME.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(MODEL_NAME)

In [None]:
# Model summary before LoRA is enabled (LoRA is turned on in a later cell).
gemma_lm.summary()

In [None]:
# Prompt template for the training data and the finetuning tests
PROMPT_TEMPLATE = "Instruction:\n{instruction}\n\nResponse:\n{answer}"

# Sample questions used to spot-check the model's generations.
TEST_EXAMPLES = [
    'As a healthcare fellow learning diagnosis, What is (are) Adhesions?',
    'As a healthcare fellow learning diagnosis, what research (or clinical trials) is being done for Miller Fisher Syndrome ?',
    'As a healthcare fellow learning diagnosis, What to do for Henoch-Schnlein Purpura ',
]

# Each test prompt is the template with the answer left empty, so the model
# is asked to continue right after "Response:".
TEST_PROMPTS = [
    PROMPT_TEMPLATE.format_map({"instruction": question, "answer": ""})
    for question in TEST_EXAMPLES
]

In [None]:
# Compile the model with the greedy sampler for text generation.
gemma_lm.compile(sampler="greedy")

In [None]:
import random

RANDOM_SEED = 3456


def generate_training_data(training_ratio: int = 100) -> list[str]:
    """Build the shuffled list of training prompts from the JSONL dataset.

    Every non-blank line of ``DATASET_PATH`` is parsed as a JSON object and
    rendered through ``PROMPT_TEMPLATE``. The prompts are shuffled with a
    fixed seed and the first ``training_ratio`` percent are returned.

    Args:
        training_ratio: Percentage of the dataset to keep, in ``(0, 100]``.

    Returns:
        The selected prompts, in shuffled order.

    Raises:
        AssertionError: If ``training_ratio`` is outside ``(0, 100]``.
        KeyError: If a record lacks a field that ``PROMPT_TEMPLATE`` needs.
    """
    assert 0 < training_ratio <= 100
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(DATASET_PATH, encoding="utf-8") as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            features = json.loads(line)
            data.append(PROMPT_TEMPLATE.format(**features))
    total_data_count = len(data)
    # A private Random instance gives the same seeded shuffle without
    # reseeding the process-wide `random` state as a side effect.
    random.Random(RANDOM_SEED).shuffle(data)
    training_data_count = total_data_count * training_ratio // 100
    print(f"Training: {training_data_count}/{total_data_count}")

    return data[:training_data_count]

# Build the fine-tuning set from TRAIN_RATIO percent of the shuffled dataset.
training_data = generate_training_data(training_ratio=TRAIN_RATIO)

In [None]:
# Enable LoRA on the model backbone with rank 4.
gemma_lm.backbone.enable_lora(rank=4)

In [None]:
# Model summary again, now that LoRA has been enabled on the backbone.
gemma_lm.summary()

In [None]:
def finetune_gemma(
    model: keras_nlp.models.GemmaCausalLM,
    data: list[str],
    *,
    sequence_length: int = 256,
    learning_rate: float = 5e-5,
    weight_decay: float = 0.01,
    epochs: int = 1,
    batch_size: int = 1,
) -> None:
    """Fine-tune ``model`` in place on a list of prompt strings.

    The defaults reproduce the original hard-coded settings, so existing
    two-argument calls behave exactly as before.

    Args:
        model: Gemma causal LM to train; its preprocessor's
            ``sequence_length`` is overwritten and the model is recompiled.
        data: Training prompts passed straight to ``model.fit``.
        sequence_length: Value assigned to ``model.preprocessor.sequence_length``.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        epochs: Number of passes over ``data``.
        batch_size: Batch size passed to ``model.fit``.
    """
    model.preprocessor.sequence_length = sequence_length

    optimizer = keras.optimizers.AdamW(
        learning_rate=learning_rate,
        weight_decay=weight_decay,
    )
    # Variables whose names contain "bias" or "scale" are left out of weight decay.
    optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizer,
        weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
        sampler="greedy",
    )
    model.fit(data, epochs=epochs, batch_size=batch_size)


# Fine-tune the LoRA-enabled model on the sampled training data.
finetune_gemma(gemma_lm, training_data)

In [None]:
# Generate a response for every test prompt with the fine-tuned model and
# print each one followed by a dashed separator line.
separator = '- ' * 40
for test_prompt in TEST_PROMPTS:
    generated_text = gemma_lm.generate(test_prompt, max_length=None)
    print(f"{generated_text}\n{separator}")

In [None]:
# Create the output directory in pure Python: unlike the `%mkdir` shell alias,
# os.makedirs raises on failure instead of letting the cell continue and fail
# later inside save_weights.
os.makedirs(FINETUNED_MODEL_DIR, exist_ok=True)

# Save the model weights after fine-tuning.
gemma_lm.save_weights(FINETUNED_WEIGHTS_PATH)

# Save the tokenizer assets next to the weights.
gemma_lm.preprocessor.tokenizer.save_assets(FINETUNED_MODEL_DIR)

In [None]:
# Show the size of each saved file plus a grand total, in human-readable units.
!du -shc $FINETUNED_MODEL_DIR/*

In [None]:
# Drop the notebook's reference to the Keras model; its weights are already on disk.
del gemma_lm

# Close numba's CUDA context for the current device.
# NOTE(review): presumably done to release GPU memory held by the JAX backend
# before the PyTorch-based conversion below — confirm.
device = cuda.get_current_device()
cuda.select_device(device.id)
cuda.close()

In [None]:
!pip install --quiet keras_hub

## Convert model to Hugging Face

In [None]:
# Fetch the Gemma export script; -nc skips the download if the file already exists.
# NOTE(review): the script is pulled from the `master` branch, so its contents
# (and dependencies) can change between runs — consider pinning a commit.
!wget -nv -nc https://raw.githubusercontent.com/keras-team/keras-nlp/master/tools/gemma/export_gemma_to_hf.py

# Run the export in a separate process with the torch Keras backend, reading the
# saved weights and vocabulary and writing the result to HUGGINGFACE_MODEL_DIR.
!KERAS_BACKEND=torch python export_gemma_to_hf.py \
    --weights_file $FINETUNED_WEIGHTS_PATH \
    --size $MODEL_SIZE \
    --vocab_path $FINETUNED_VOCAB_PATH \
    --output_dir $HUGGINGFACE_MODEL_DIR

In [None]:
# Both artifacts are read from the local export directory only; nothing is
# fetched from the Hugging Face Hub.
hf_load_options = {"local_files_only": True}

tokenizer = transformers.GemmaTokenizer.from_pretrained(
    HUGGINGFACE_MODEL_DIR, **hf_load_options
)
model = transformers.GemmaForCausalLM.from_pretrained(
    HUGGINGFACE_MODEL_DIR,
    device_map="auto",  # Library "accelerate" to auto-select GPU
    **hf_load_options,
)

In [None]:
def test_transformers_model(
    model: transformers.GemmaForCausalLM,
    tokenizer: transformers.GemmaTokenizer,
) -> None:
    """Print the converted model's generation for each prompt in TEST_PROMPTS.

    Each prompt is tokenized, moved to the model's device, generated with a
    maximum length of 500, decoded without special tokens and printed,
    followed by a dashed separator line.
    """
    divider = '- ' * 40
    for text in TEST_PROMPTS:
        encoded = tokenizer([text], return_tensors="pt")
        encoded = encoded.to(model.device)
        generated_ids = model.generate(**encoded, max_length=500)

        decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        print(f"{decoded}\n{divider}")


# Spot-check the converted Hugging Face model on the same test prompts.
test_transformers_model(model, tokenizer)

In [None]:
# Drop the Hugging Face model and tokenizer now that the check is done.
del model, tokenizer

# Ask PyTorch to release its cached GPU memory.
torch.cuda.empty_cache()

# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably a workaround so the `!` shell cells below do not
# fail on a non-UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

## Deploy Gemma to Bucket


In [None]:
# Initialise the Vertex AI SDK with the project, region and staging bucket
# configured above.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [None]:
!gcloud auth activate-service-account --key-file $AUTHS

In [None]:
# Sync the converted Hugging Face model directory to DEPLOYED_MODEL_URI in the
# bucket, printing errors only.
!gcloud storage rsync --verbosity error $HUGGINGFACE_MODEL_DIR $DEPLOYED_MODEL_URI

In [None]:
# Report the size of what was uploaded, in human-readable units.
!gcloud storage du $DEPLOYED_MODEL_URI --readable-sizes