In [None]:
import sys
import subprocess

# Step 1: show which interpreter backs this kernel; the install below targets that same executable.
print(f"Python Kernel path: {sys.executable}")

# Step 2: install the dependency set. torch / torchaudio / torchvision are pinned to the
# torch 2.9.1 family (per the original note, these pins mirror the Docker image), which may
# downgrade torchaudio and torchvision if newer builds are already present.
print(">>> Cài đặt với phiên bản khớp với Docker (torch 2.9.1)...")
packages = [
    "torch==2.9.1",
    "torchaudio==2.9.1",
    "torchvision==0.24.1",  # release matching torch 2.9.1
    "sentence-transformers==5.2.0",
    "transformers==4.57.6",
    "datasets",
    "accelerate",
    "einops",
]
pip_command = [sys.executable, "-m", "pip", "install", "--upgrade", *packages]
subprocess.check_call(pip_command)  # raises CalledProcessError if pip exits non-zero

# Step 3: import the freshly installed libraries and report their versions.
# Failures are printed rather than re-raised, so this cell never aborts on an import problem.
try:
    from sentence_transformers import SentenceTransformer
    import sentence_transformers
    import torch
    import transformers

    print("\n✅ Cài đặt và Import THÀNH CÔNG!")
    print(f"torch version: {torch.__version__}")
    print(f"sentence-transformers version: {sentence_transformers.__version__}")
    print(f"transformers version: {transformers.__version__}")
except ImportError as e:
    print(f"\n❌ Vẫn lỗi: {e}")
except Exception as e:
    print(f"\n❌ Lỗi: {e}")

Python Kernel path: /venv/main/bin/python
Collecting sentence-transformers
  Using cached sentence_transformers-5.2.2-py3-none-any.whl.metadata (16 kB)
Collecting datasets
  Using cached datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Using cached accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting einops
  Using cached einops-0.8.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers<6.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-1.3.5-py3-none-any.whl.metadata (13 kB)
Collectin

[0m


>>> Cài đặt và Import THÀNH CÔNG!


In [1]:
import os
import json
import torch
import shutil

from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader

In [None]:
# --- 0. Configuration ---
# Tunable settings shared by the data-loading, model-loading, training and export steps below.
MODEL_ID: str = "Qwen/Qwen3-Embedding-0.6B"  # checkpoint identifier handed to models.Transformer
BATCH_SIZE: int = 16  # batch size of the training DataLoader
NUM_EPOCHS: int = 10  # number of epochs passed to model.fit
OUTPUT_PATH: str = "./output_qwen_embedding_finetuned"  # directory the model is saved to and zipped from

# --- 1. LOAD TRAINING DATA ---
# Reads `train_dataset.json` (expected: a list of objects with `query`, `positive` and an
# optional `negatives` / `hard_negatives` list) and turns every usable entry into an
# InputExample whose texts are [query, positive, negative_1, ..., negative_n].
print(">>> Đang đọc dữ liệu...")

data_path = 'train_dataset.json'
train_examples = []


def _to_text(value):
    """Return ``value`` as text, mapping missing values to the empty string.

    ``None`` and float NaN become ``""`` so the caller's emptiness check skips them.
    A plain ``str(value)`` would produce the literal texts ``"None"`` / ``"nan"``,
    which are non-empty and would end up as training data.
    """
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself (json.load accepts NaN literals).
    if isinstance(value, float) and value != value:
        return ""
    return value if isinstance(value, str) else str(value)


if not os.path.exists(data_path):
    # Keep going with no data; the emptiness check below raises a ValueError.
    print(f"Lỗi: Không tìm thấy file {data_path}.")
    data = []
else:
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # data = data[:300]  # uncomment for a quick smoke test

text_rows = []
for entry in data:
    # Skip malformed records instead of crashing on `.get`.
    if not isinstance(entry, dict):
        continue

    query = _to_text(entry.get('query')).strip()

    # "nan nan" is removed exactly as before; presumably it is an artefact of how the
    # positive text was assembled upstream — TODO confirm against the dataset builder.
    pos_text = _to_text(entry.get('positive')).replace("nan nan", "").strip()

    if not query or not pos_text:
        continue

    texts = [query, pos_text]

    # Hard negatives may live under either key; only non-empty strings are kept.
    neg_list = entry.get('negatives', entry.get('hard_negatives', []))
    if isinstance(neg_list, list):
        texts.extend(neg.strip() for neg in neg_list if isinstance(neg, str) and neg.strip())

    text_rows.append(texts)

if text_rows:
    # MultipleNegativesRankingLoss works on (anchor, positive, negative_1..n) tuples and
    # needs the same number of texts in every example. Trim explicitly and report it,
    # rather than leaving ragged examples for the library to handle.
    texts_per_example = min(len(row) for row in text_rows)
    if any(len(row) != texts_per_example for row in text_rows):
        print(
            "Cảnh báo: số lượng negative giữa các mẫu không đồng đều; "
            f"mỗi mẫu chỉ giữ lại {texts_per_example - 2} negative."
        )
    train_examples = [InputExample(texts=row[:texts_per_example]) for row in text_rows]

if not train_examples:
    raise ValueError("Không có dữ liệu training!")

# Print the first example as a sanity check.
print(f"--- Mẫu dữ liệu đầu tiên ---")
print(f"Query:    {train_examples[0].texts[0]}")
print(f"Positive: {train_examples[0].texts[1]}")
if len(train_examples[0].texts) > 2:
    print(f"Negative: {train_examples[0].texts[2]}")
print(f"--------------------------")
print(f"Tổng số mẫu training: {len(train_examples)}")

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# --- 2. LOAD THE QWEN EMBEDDING MODEL ---
# Builds the SentenceTransformer from an explicit Transformer + Pooling pipeline.
print(f">>> Đang tải model {MODEL_ID}...")

# Transformer backbone; inputs longer than 512 tokens are truncated.
word_embedding_model = models.Transformer(
    MODEL_ID,
    max_seq_length=512,
    model_args={
        "trust_remote_code": True,
    }
)

# Pool the LAST token instead of averaging all tokens. Per the Qwen3-Embedding model card
# the checkpoint was trained with last-token pooling, so mean pooling would start
# fine-tuning from an embedding space the pretrained weights were not optimised for.
# NOTE(review): the published pipeline reportedly also ends with a Normalize module; it is
# left out here to keep the saved model's output scale unchanged — confirm whether
# downstream consumers expect unit-length vectors.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode="lasttoken"
)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Same tokenizer object the model uses, so the settings below affect training batches.
tokenizer = word_embedding_model.tokenizer
tokenizer.padding_side = "right"

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# --- 3. TRAIN ---
# Contrastive fine-tuning: for each query, its positive is ranked against the other texts
# in the batch (including any hard negatives supplied with the examples).
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Warm up over 10% of the WHOLE run. `len(train_dataloader)` is the number of batches in
# one epoch, so it has to be scaled by the epoch count; otherwise warmup would only cover
# 10% of the first epoch (about 1% of training with 10 epochs).
total_train_steps = len(train_dataloader) * NUM_EPOCHS
warmup_steps = int(total_train_steps * 0.1)

print(">>> Bắt đầu training...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
    output_path=OUTPUT_PATH,
    show_progress_bar=True,
)

print(">>> Training hoàn tất!")

# --- 4. SAVE THE MODEL ---
# Write the fine-tuned model to OUTPUT_PATH.
print(f">>> Lưu model vào {OUTPUT_PATH}...")
model.save(OUTPUT_PATH)

# --- 5. PACK INTO A ZIP ARCHIVE ---
# The archive is created next to the notebook as "<zip_filename>.zip" and contains the
# contents of OUTPUT_PATH.
zip_filename = "qwen_embedding_finetuned"
print(f">>> Nén thành {zip_filename}.zip...")
shutil.make_archive(base_name=zip_filename, format="zip", root_dir=OUTPUT_PATH)

# Report the archive size in mebibytes.
bytes_per_mb = 1024 * 1024
file_size = os.path.getsize(f"{zip_filename}.zip") / bytes_per_mb
print(f"Kích thước file: {file_size:.2f} MB")


>>> Đang đọc dữ liệu...
--- Mẫu dữ liệu đầu tiên ---
Query:    Oversee the coordination of music production personnel, including delegating responsibilities for orchestration, music copying, and vocal coaching.
Positive: manage musical staff. Assign and manage staff tasks in areas such as scoring, arranging, copying music and vocal coaching.
Negative: Supervise the technical layout of musical notation on the staff, ensuring that all symbols and clefs are correctly positioned for score production.
--------------------------
Tổng số mẫu training: 300
>>> Đang tải model Qwen/Qwen3-Embedding-0.6B...


Loading weights:   0%|          | 0/310 [00:00<?, ?it/s]



>>> Bắt đầu training...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

>>> Training hoàn tất!
>>> Lưu model vào ./output_qwen_embedding_finetuned...


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

>>> Nén thành qwen_embedding_finetuned.zip...
