In [1]:
# --- Load fine-tuned model for inference on GPU ---

# 1. Install required libraries
print("Installing necessary libraries...")
!pip install -q -U transformers accelerate peft trl scipy datasets
!pip install -q -U "huggingface_hub[cli]"

# 2. Import libraries and setup environment
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Set environment for GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Change if needed to select GPU device
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

print("Libraries installed and environment ready for GPU.")

# 3. Login to Hugging Face using Kaggle Secrets
try:
    user_secrets = UserSecretsClient()
    secret_value = user_secrets.get_secret("HF_TOKEN")
    login(token=secret_value)
    print("\nLogged in to Hugging Face successfully.")
except Exception as e:
    print(f"\nLogin error: {e}")
    print("Make sure 'HF_TOKEN' is set in Kaggle Secrets with write access.")
    # login()  # Uncomment to login manually if needed

# 4. Set dataset and model paths
KAGGLE_DATASET_NAME = "my-medgemma-finetuned-model"  # Replace with your dataset name
model_path_in_dataset = f"/kaggle/input/farsi-tuned"
base_model_name = "google/medgemma-4b-it"

print(f"\nLoading fine-tuned model from: {model_path_in_dataset}")

# 5. Load tokenizer from fine-tuned model folder, fallback to base model tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path_in_dataset, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print("Tokenizer loaded from fine-tuned model folder.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

# 6. Load base model for GPU with float32 precision and automatic device mapping
print(f"Loading base model '{base_model_name}' for GPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float32,  # Use float32 for GPU efficiency
    device_map="auto",          # Automatically distribute model on GPU(s)
)
print("Base model loaded for GPU.")

# 7. Apply LoRA adapters from the fine-tuned model folder
print(f"Applying LoRA adapters from {model_path_in_dataset}...")
model = PeftModel.from_pretrained(base_model, model_path_in_dataset)
print("LoRA adapters applied.")

# Set model to evaluation mode for inference
model.eval()
print("Model set to evaluation mode.")

# 8. (Optional) Merge LoRA adapters into base model for deployment (needs more RAM)
# model = model.merge_and_unload()
# print("LoRA adapters merged with base model.")

# 9. Create text generation pipeline for inference on GPU
print("\nCreating text generation pipeline...")
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float32,
    device_map="auto",
)

print("Fine-tuned model ready for inference!")

# 10. Test the model with a sample prompt
prompt_test = "سوال: چه عصبی در سندروم کارپل تونل به مشکل میخورد\nپاسخ:"
print(f"\n--- Test Question ---\n{prompt_test.replace('پاسخ:', '')}")
outputs_test = text_generator(
    prompt_test,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
)
generated_text_test = outputs_test[0]["generated_text"]
answer_test = generated_text_test.replace(prompt_test, "").strip()
print(f"Generated Answer: {answer_test}")


Installing necessary libraries...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.1/367.1 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.3/472.3 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.2/376.2 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00

2025-07-22 13:45:24.145145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753191924.324529      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753191924.380323      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Libraries installed and environment ready for GPU.

Logged in to Hugging Face successfully.

Loading fine-tuned model from: /kaggle/input/farsi-tuned
Tokenizer loaded from fine-tuned model folder.
Loading base model 'google/medgemma-4b-it' for GPU...


config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Base model loaded for GPU.
Applying LoRA adapters from /kaggle/input/farsi-tuned...


Device set to use cuda:0


LoRA adapters applied.
Model set to evaluation mode.

Creating text generation pipeline...
Fine-tuned model ready for inference!

--- Test Question ---
سوال: چه عصبی در سندروم کارپل تونل به مشکل میخورد

Generated Answer: در سندروم کارپل تونل، عصب مدیان به دلیل فشار و انقباض در کانال کارپل تونل (در مچ دست) دچار اختلال در عملکرد میشود.

با تشکر
دکتر محمدعلی نیک‌پور
متخصص مغز و اعصاب
تهران
[https://www.instagram.com/dr.nikpoor.m/](https://www.instagram.com/dr.nikpoor.m/)
[https://


In [2]:
from huggingface_hub import HfApi, Repository
import os

# --- تنظیمات ---
repo_name = "lbehradl/MedGemma_fa"  # نام ریپو روی Hugging Face
save_dir = "./merged_model"          # پوشه محلی که مدل ادغام شده توش ذخیره شده

# --- 1. مطمئن شو پوشه ذخیره مدل وجود داره ---
os.makedirs(save_dir, exist_ok=True)

# --- 2. ساخت ریپو در HF (اگر از قبل وجود داره، خطا نمیده) ---
api = HfApi()
api.create_repo(repo_id=repo_name, exist_ok=True)
print(f"Repo '{repo_name}' created or already exists.")

# --- 3. بارگذاری ریپو محلی و Push کردن مدل ---
repo = Repository(local_dir=save_dir, clone_from=repo_name)

repo.push_to_hub(commit_message="Upload merged fine-tuned MedGemma model")
print("Model successfully pushed to Hugging Face Hub.")


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/lbehradl/MedGemma_fa into local empty directory.


Repo 'lbehradl/MedGemma_fa' created or already exists.
Model successfully pushed to Hugging Face Hub.
