In [1]:
# --- Log in to the Hugging Face account ---
# Run this in its own cell after adding 'HF_TOKEN' to Kaggle Secrets.
from huggingface_hub import login
import os
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
try:
    # The secret lookup lives inside the try so that a failure to read
    # HF_TOKEN is reported by the handler below instead of aborting the
    # cell with an unhandled traceback before the try is even entered.
    secret_value = user_secrets.get_secret("HF_TOKEN")
    login(token=secret_value)
    print("\nSuccessfully logged in to Hugging Face using Kaggle Secret.")
except Exception as exc:
    # `Exception` rather than a bare `except:` so kernel interrupts
    # (KeyboardInterrupt / SystemExit) still propagate.
    print("\nError: HF_TOKEN Kaggle Secret not found. Please ensure it's created and attached.")
    # Surface the real cause: the login call itself may also be what failed.
    print(f"Underlying error: {type(exc).__name__}: {exc}")
    print("You can try manual login by uncommenting the line below and running again.")
    # login() # uncomment this line to try manual login if secret method fails



Successfully logged in to Hugging Face using Kaggle Secret.


In [2]:
from datasets import load_dataset


def format_example(example):
    """Build the instruction-tuning text for one dataset record.

    Args:
        example: mapping with 'Question' and 'Answer' keys (capitalised,
            matching the dataset's column names).

    Returns:
        ``{"text": "سوال: <question>\\nپاسخ: <answer>"}`` with both fields
        stripped of surrounding whitespace. A missing or ``None`` field is
        rendered as an empty string.
    """
    def _clean(value):
        # str(None) would put the literal word "None" into the training text.
        return "" if value is None else str(value).strip()

    question = _clean(example.get('Question'))
    answer = _clean(example.get('Answer'))
    return {"text": f"سوال: {question}\nپاسخ: {answer}"}


print("Loading the gaokerena/MF3QA dataset from Hugging Face Hub...")
try:
    dataset = load_dataset("gaokerena/MF3QA")
    print("Dataset loaded successfully.")
    print("\nDataset structure:")
    print(dataset)

    # Prefer the 'train' split; otherwise fall back to the first split present.
    split_name = 'train' if 'train' in dataset else next(iter(dataset))
    train_dataset = dataset[split_name]

    print(f"\nNumber of examples in the training split: {len(train_dataset)}")
    print("\nFirst 5 raw examples from the dataset:")
    for i in range(min(5, len(train_dataset))):
        row = train_dataset[i]  # fetch the record once, then read both fields
        print(f"--- Example {i+1} ---")
        print(f"Question: {row['Question']}")
        print(f"Answer: {row['Answer']}")

    # --- Format the dataset for instruction tuning ---
    print("\nFormatting the dataset into 'text' column...")
    # remove_columns must name the dataset's original columns so that only
    # the new 'text' column remains after mapping.
    formatted_dataset = train_dataset.map(format_example, remove_columns=train_dataset.column_names)

    print("\nFirst 3 formatted examples:")
    for i in range(min(3, len(formatted_dataset))):
        print(f"--- Formatted Example {i+1} ---")
        print(formatted_dataset[i]['text'])

    print("\nDataset preparation for fine-tuning is complete. Ready for model loading and tokenization.")

except Exception as e:
    print(f"An error occurred during dataset loading or preparation: {e}")
    print("Please ensure you have successfully logged in to Hugging Face and your internet connection is stable.")
    # Re-raise: without the dataset, `formatted_dataset` is undefined and any
    # later cell that uses it would fail with a confusing NameError.
    raise

Loading the gaokerena/MF3QA dataset from Hugging Face Hub...
Dataset loaded successfully.

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer', 'Source'],
        num_rows: 20000
    })
    dev: Dataset({
        features: ['Question', 'Answer', 'Source'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['Question', 'Answer', 'Source'],
        num_rows: 2000
    })
})

Number of examples in the training split: 20000

First 5 raw examples from the dataset:
--- Example 1 ---
Question: ۱ماهپیش از خواب پریدم از ترس شدید بعداز اون ترس،فشارم بالا میرفت ۱۴ ۱۵ با سر درد و درد قفسه سینه همراه بود با پرانول کنترش میکردم ولی الان ۲ ۳ روزه فشارم میاد پاین ۸ ۹ روی ۳ ۴ وقتایی که میرم پیاده روی یا کلا فعالیتی دارم بعدش ضربانم تا ۶ ۷ ساعت حتی بیشتر تنده و کند و طبیعی نمیشه خستهه شدم دیگهمیترسم اتفاقی برام بیوفته فشار ۸ ۹ روری ۴ ۵ خطرناکه؟
Answer: تجربه‌ای که شرح داده‌اید، نشان‌دهنده‌ی رخدادهای فیزیولوژیک و احتمالاً پاتولوژیک در بدن شما است

In [3]:
print("Ensuring all necessary libraries are installed...")
!pip install -q -U transformers peft trl bitsandbytes scipy datasets
!pip install -q -U "huggingface_hub[cli]"
!git config --global user.name "lbehradl"
!git clone https://github.com/unslothai/unsloth.git
!pip install -q -U ./unsloth
!pip install -q -U unsloth_zoo
print("Libraries installation/update complete.")


Ensuring all necessary libraries are installed...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unsloth-zoo 2025.7.8 requires datasets<4.0.0,>=3.4.1, but you have datasets 4.0.0 which is incompatible.[0m[31m
[0mfatal: destination path 'unsloth' already exists and is not an empty directory.
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
Libraries installation/update complete.


In [4]:
# Sanity check: the bare expression on the last line makes the notebook display
# the module object, whose repr includes the path it was imported from.
import transformers 
transformers

<module 'transformers' from '/usr/local/lib/python3.11/dist-packages/transformers/__init__.py'>

In [5]:
# Unsloth is imported first: it asks to be imported before transformers/peft/trl
# (importing it afterwards emits "Please restructure your imports with
# 'import unsloth' at the top of your file").
# NOTE(review): an earlier cell that already imported transformers will still
# trigger that warning — confirm cell order.
#
# Both unsloth imports sit inside the try so the fallback branch is reachable;
# an unconditional `import unsloth` ahead of the try would raise before the
# `except ImportError` handler could ever run.
try:
    import unsloth
    from unsloth import FastLanguageModel
    print("Unsloth detected. Using FastLanguageModel for optimized loading.")
    USE_UNSLOTH = True
except ImportError:
    print("Unsloth not found. Falling back to standard Hugging Face loading.")
    USE_UNSLOTH = False

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

print('imported')




2025-07-21 19:14:26.148307: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753125266.457614     328 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753125266.538149     328 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth detected. Using FastLanguageModel for optimized loading.
imported


In [7]:
# Checkpoint to load (alternative: "google/medgemma-27b").
model_name = "google/medgemma-4b-it"

# --- Quantization settings ---
# The model is loaded in 4-bit (QLoRA) to reduce GPU memory use.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,         # double quantization for further memory savings
    bnb_4bit_quant_type="nf4",              # quantization type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # No pad token configured: reuse eos_token as pad_token
    # (important for decoder-only models).
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # pad on the right

# --- Quantized model ---
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,      # bfloat16 for computation
    device_map="auto",               # place the model on the available GPU(s)
    quantization_config=bnb_config,
)

# Prepare the 4-bit model for LoRA training.
model = prepare_model_for_kbit_training(model)

# Report what was loaded; the `model` entry prints the module's full repr.
for item in (
    f"\nModel '{model_name}' and Tokenizer loaded successfully.",
    "\nModel structure (first few layers):",
    model,
    "\nTokenizer padding side set to 'right'.",
    "Ready for LoRA configuration and training.",
):
    print(item)

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]


Model 'google/medgemma-4b-it' and Tokenizer loaded successfully.

Model structure (first few layers):
Gemma3ForConditionalGeneration(
  (model): Gemma3Model(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
          (position_embedding): Embedding(4096, 1152)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-26): 27 x SiglipEncoderLayer(
              (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
              (self_attn): SiglipAttention(
                (k_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
                (v_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
                (q_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
                (out_proj): Linear4bit(in_features=1