# Import

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torchmetrics.retrieval import RetrievalMAP
import os
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from tqdm import tqdm
import google.generativeai as genai
import os
import wandb
import ast
import subprocess
import pickle
import shutil
import datetime
import pytz
from huggingface_hub import HfApi, upload_folder
from datetime import datetime
import pytz

# PositiveExampleMining

In [2]:
class MiningPosExample:
    """Builds positive (job title, skill text) pairs from a source CSV.

    The mined frame is kept on ``self.data`` so the object can also be used
    like a small read-only sequence of rows.
    """

    def __init__(self):
        # Filled in by mining_pos_example(); stays None until then.
        self.data = None

    def __len__(self):
        """Return the number of mined rows, or 0 when nothing has been mined."""
        return 0 if self.data is None else len(self.data)

    def __getitem__(self, index):
        """Return the row at positional ``index``.

        Raises:
            IndexError: when no data has been mined yet.
        """
        if self.data is None:
            raise IndexError("Index out of range")
        return self.data.iloc[index]

    def mining_pos_example(self, data_file):
        """Read ``data_file`` and expand it into one labelled row per job title.

        The ``jobtitles`` column holds Python list literals; each list is parsed
        and exploded so every title is paired with the row's ``skills_gen`` text.
        All rows receive ``label = 1``. The exploded frame keeps the index of the
        source rows (so index values repeat).

        Returns:
            DataFrame with columns ``jobtitle``, ``skill``, ``label``; the same
            object is stored on ``self.data``.
        """
        raw = pd.read_csv(data_file)
        raw['jobtitles'] = raw['jobtitles'].map(ast.literal_eval)

        pairs = (
            raw[['jobtitles', 'skills_gen']]
            .explode('jobtitles')
            .rename(columns={'jobtitles': 'jobtitle', 'skills_gen': 'skill'})
            .assign(label=1)
        )

        self.data = pairs
        return pairs

    def get_data(self):
        """Return the most recently mined frame (None if nothing was mined)."""
        return self.data

# NegativeExampleMining

In [3]:
class MiningNegExample:
    """Builds (job title, skill text) pairs from a file of query/candidate id pairs.

    Ids in the pair file refer to row positions of the exploded source table
    produced by ``prepare_source_file``. Labels are taken from the pair file.
    """

    def __init__(self):
        # Filled in by mining_neg_example(); stays None until then.
        self.data = None

    def __len__(self):
        """Return the number of mined rows, or 0 when nothing has been mined."""
        return 0 if self.data is None else len(self.data)

    def __getitem__(self, index):
        """Return the row at positional ``index``.

        Raises:
            IndexError: when no data has been mined yet.
        """
        if self.data is None:
            raise IndexError("Index out of range")
        return self.data.iloc[index]

    def prepare_source_file(self, source_file):
        """Load the source CSV and explode it to one row per job title.

        An existing ``tid`` column is renamed to ``gid``; a fresh 0-based
        ``tid`` column is then created from the row position after exploding.
        """
        exploded = pd.read_csv(source_file)
        exploded['jobtitles'] = exploded['jobtitles'].map(ast.literal_eval)

        column_map = {'jobtitles': 'jobtitle', 'skills_gen': 'skill', 'tid': 'gid'}
        exploded = exploded.explode('jobtitles').rename(columns=column_map)

        return exploded.reset_index(drop=True).reset_index(names='tid')

    def mining_neg_example(self, pair_file, source_file):
        """Join the id pairs in ``pair_file`` with the exploded source rows.

        Each pair yields two rows: the query's job title with the candidate's
        skill text, and the candidate's job title with the query's skill text.

        Returns:
            DataFrame with columns ``jobtitle``, ``skill``, ``label``; the same
            object is stored on ``self.data``.
        """
        lookup = self.prepare_source_file(source_file)
        pairs = pd.read_csv(pair_file)

        # After the second merge the shared columns carry _x (query side)
        # and _y (candidate side) suffixes.
        joined = (
            pairs
            .merge(lookup, left_on='q_id', right_on='tid')
            .merge(lookup, left_on='c_id', right_on='tid')
            .rename(columns={
                'jobtitle_x': 'q_jobtitle',
                'skill_x': 'q_skill',
                'jobtitle_y': 'c_jobtitle',
                'skill_y': 'c_skill',
            })
        )

        from_query = joined[['q_jobtitle', 'c_skill', 'label']].rename(
            columns={'q_jobtitle': 'jobtitle', 'c_skill': 'skill'}
        )
        from_corpus = joined[['c_jobtitle', 'q_skill', 'label']].rename(
            columns={'c_jobtitle': 'jobtitle', 'q_skill': 'skill'}
        )
        combined = pd.concat([from_query, from_corpus], axis=0)

        self.data = combined
        return combined

    def get_data(self):
        """Return the most recently mined frame (None if nothing was mined)."""
        return self.data

# DataPrepare

In [4]:
class DataPreparer:
    """Materialises training and inference inputs as CSV files.

    Output files are written to ``output_dir``, which defaults to the Kaggle
    working directory used throughout the notebook.
    """

    def prepare_train_data(self, neg_pair_file, source_file, output_dir="/kaggle/working"):
        """Build the training table and write it to ``<output_dir>/train.csv``.

        Pairs mined by ``MiningPosExample`` (from ``source_file``) and by
        ``MiningNegExample`` (from ``neg_pair_file`` + ``source_file``) are
        concatenated and exact duplicate rows are dropped.

        Args:
            neg_pair_file: CSV of id pairs passed to ``MiningNegExample``.
            source_file: source CSV passed to both miners.
            output_dir: directory for ``train.csv``; created if missing.

        Returns:
            ``(train_df, train_file)``: the combined frame and the written path.
        """
        print("Đọc dữ liệu train_org:")

        pos_miner = MiningPosExample()
        pos_df = pos_miner.mining_pos_example(source_file)
        print(pos_df.head())

        neg_miner = MiningNegExample()
        neg_df = neg_miner.mining_neg_example(neg_pair_file, source_file)
        print(neg_df.head())

        train_df = pd.concat([pos_df, neg_df], axis=0).drop_duplicates()

        print(train_df.head())
        print(f"Cột dữ liệu: {train_df.columns}")
        print("Xuất dữ liệu train sau khi chuẩn bị:")
        os.makedirs(output_dir, exist_ok=True)
        train_file = os.path.join(output_dir, "train.csv")
        utils.write_csv(train_df, train_file)
        return train_df, train_file

    def prepare_inference_data(self, corpus_path, queries_path, lang, output_dir="/kaggle/working"):
        """Convert the TSV corpus/queries files for ``lang`` into CSV files.

        Args:
            corpus_path: TSV file with the corpus elements.
            queries_path: TSV file with the queries.
            lang: language tag used in the output file names.
            output_dir: directory for the CSV files; created if missing.

        Returns:
            ``(corpus_out_path, queries_out_path)``: the two written CSV paths.
        """
        print("Đọc dữ liệu inference:")
        corpus_df = utils.read_tsv(corpus_path)
        queries_df = utils.read_tsv(queries_path)

        print("Xuất dữ liệu inference:")
        os.makedirs(output_dir, exist_ok=True)
        corpus_out_path = os.path.join(output_dir, f"corpus_{lang}.csv")
        queries_out_path = os.path.join(output_dir, f"queries_{lang}.csv")
        utils.write_csv(corpus_df, corpus_out_path)
        utils.write_csv(queries_df, queries_out_path)
        return corpus_out_path, queries_out_path

# Dataset

In [5]:
class Dataset():
    """Wraps (jobtitle, skill, label) triples as sentence-transformers InputExamples."""

    def __init__(self, data):
        # Each triple becomes one InputExample with the two texts and its label.
        self.data = [
            InputExample(texts=[jobtitle, skill], label=label)
            for jobtitle, skill, label in data
        ]

    def __len__(self) -> int:
        """Return the number of examples in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        return self.data[idx]

    @staticmethod
    def load_train_data(train_path, train_data=None):
        """Return the training rows as a list of ``(jobtitle, skill, label)`` tuples.

        ``train_data`` is used when given; otherwise the table is read from
        ``train_path`` with ``utils.read_csv``.
        """
        frame = utils.read_csv(train_path) if train_data is None else train_data

        titles = frame['jobtitle'].tolist()
        skill_texts = frame['skill'].tolist()
        label_values = frame['label'].tolist()

        return list(zip(titles, skill_texts, label_values))

    @staticmethod
    def load_inference_data(corpus_path, queries_path):
        """Read the corpus and queries CSV files into plain dicts of lists.

        Returns:
            ``(corpus, queries)`` where ``corpus`` has keys ``cid`` / ``jobtitle``
            and ``queries`` has keys ``qid`` / ``jobtitle``.
        """
        corpus_frame = utils.read_csv(corpus_path)
        queries_frame = utils.read_csv(queries_path)

        corpus = {
            "cid": corpus_frame['c_id'].tolist(),
            "jobtitle": corpus_frame['jobtitle'].tolist(),
        }
        queries = {
            "qid": queries_frame['q_id'].tolist(),
            "jobtitle": queries_frame['jobtitle'].tolist(),
        }
        return corpus, queries

# BiEncoder

In [6]:
class BiEncoder:
    """Loads a SentenceTransformer and places it on the GPU when one is available.

    Attributes:
        model: the loaded SentenceTransformer, moved to ``device``.
        device: the torch device selected at construction time.
    """

    def __init__(self, model_name=None, model_path=None):
        """Load the model from ``model_path`` if given, else from ``model_name``.

        Args:
            model_name: model identifier passed to SentenceTransformer when
                ``model_path`` is None.
            model_path: local directory of a saved model; takes precedence.

        Raises:
            ValueError: when neither ``model_name`` nor ``model_path`` is given.
            Exception: any error raised while loading is printed and re-raised.
        """
        # Fail early with a clear message instead of handing None to the loader.
        if model_name is None and model_path is None:
            raise ValueError("BiEncoder requires either model_name or model_path.")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        print("Device sử dụng:", device)

        try:
            if model_path is None:
                print(f"Tải mô hình từ Hugging Face với tên: {model_name}")
                self.model = SentenceTransformer(model_name)
            else:
                print(f"Tải mô hình từ đường dẫn cục bộ: {model_path}")
                self.model = SentenceTransformer(model_path)

            # Move the model to the selected device.
            self.model = self.model.to(device)
            print("Mô hình đã được khởi tạo thành công!")

        except Exception as e:
            print(f"Lỗi khi khởi tạo mô hình: {e}")
            raise

# BiTrainer

In [7]:
class Trainer:
    """Fine-tunes the SentenceTransformer held by a BiEncoder."""

    def __init__(self, model_name, model_path=None):
        self.bi_encoder = BiEncoder(model_name, model_path)

    def train(self, dataset, loss, params):
        """Run ``model.fit`` on ``dataset`` and return the model and its output path.

        Args:
            dataset: sequence of training examples fed to a shuffling DataLoader.
            loss: callable that takes the model and returns the loss object.
            params: dict with keys ``output_path``, ``num_epochs``,
                ``warmup_steps``, ``scheduler``, ``optimizer_params`` and,
                optionally, ``batch_size`` (defaults to 32).

        Returns:
            ``(model, output_path)``.
        """
        print("Khởi tạo dataset:")
        batch_size = params.get('batch_size', 32)
        train_dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

        print("Bắt đầu train: ")

        # Build the loss object around the model being trained.
        train_loss = loss(self.bi_encoder.model)

        # Make sure the output directory exists before training starts.
        os.makedirs(params['output_path'], exist_ok=True)

        self.bi_encoder.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=params['num_epochs'],
            warmup_steps=params['warmup_steps'],
            output_path=params["output_path"],
            scheduler=params['scheduler'],
            optimizer_params=params['optimizer_params'],
            show_progress_bar=True
        )

        return self.bi_encoder.model, params["output_path"]

# Inference

In [8]:
class Inference:
    """Embeds texts with a sentence encoder and ranks corpus entries per query."""

    def __init__(self, model):
        self.model = model

    def embed(self, texts):
        """Encode ``texts`` with the model and return the result as a torch tensor."""
        print("Bắt đầu chạy embeddings...")
        texts_embedding = self.model.encode(texts)
        texts_embedding = torch.tensor(texts_embedding)

        return texts_embedding

    def infer(self, corpus, queries):
        """Rank every corpus entry for every query by cosine similarity.

        Args:
            corpus: dict with ``embedding`` (2-D tensor, one row per entry) and
                ``cid`` (ids aligned with the embedding rows).
            queries: dict with ``embedding`` (2-D tensor, one row per query) and
                ``qid`` (ids aligned with the embedding rows).

        Returns:
            A list with one entry per query; each entry is a list of TREC-style
            rows ``(qid, "Q0", cid, rank, score, "4Huiter")`` ordered by
            descending score, with ``rank`` starting at 1. The list is also
            stored on ``self.predictions``.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        corpus_embeddings = corpus["embedding"].to(device)
        cids = corpus['cid']

        query_embeddings = queries['embedding'].to(device)
        qids = queries['qid']

        # The similarity is computed directly on `device`. Wrapping it in
        # nn.DataParallel is not an option here: DataParallel splits inputs
        # along dim 0, which for a single 1-D query vector is the embedding
        # dimension itself.
        corpus_size = corpus_embeddings.size(0)

        self.predictions = []
        for qid, query_embedding in tqdm(zip(qids, query_embeddings), total=len(query_embeddings), desc="Processing queries"):
            expanded_query = query_embedding.unsqueeze(0).expand(corpus_size, -1)
            similarities = F.cosine_similarity(expanded_query, corpus_embeddings)

            # Entries with similarity exactly 1 are pushed to the bottom of the
            # ranking. NOTE(review): this is an exact float comparison, so
            # near-identical texts scoring 0.9999... are not affected — confirm
            # this is the intended filter.
            similarities[similarities == 1] = float('-inf')

            sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)

            # One bulk transfer per query instead of one .item() call per element.
            scores = sorted_similarities.tolist()
            positions = sorted_indices.tolist()

            # Report the real query / corpus ids rather than row positions so
            # the run file lines up with the qrels.
            results = [
                (qid, "Q0", cids[position], rank, score, "4Huiter")
                for rank, (position, score) in enumerate(zip(positions, scores), start=1)
            ]
            self.predictions.append(results)
        return self.predictions


# RetrievalApp

In [9]:
import os
import zipfile
from datetime import datetime

class RetrievalApp:
    """End-to-end retrieval run: prepare data, embed, rank, export and evaluate."""

    def __init__(self, model_name, model_path=None):
        encoder = BiEncoder(model_name, model_path)
        self.model = encoder.model
        print("Load mô hình.....")
        self.model_name = model_name
        self.model_path = model_path

    def prepare_data(self, data_file):
        """
        Convert the corpus and queries files of every language to CSV.

        Languages are taken from the keys of ``data_file['corpus']``.
        Returns ``(langs, corpus_file, queries_file)`` where the two dicts map
        each language to the prepared CSV path.
        """
        print("Chuẩn bị data: ......")
        preparer = DataPreparer()
        corpus_file, queries_file = {}, {}
        langs = list(data_file['corpus'].keys())

        for lang in langs:
            print(f"Chuẩn bị data {lang}:.....")
            corpus_src = data_file['corpus'][lang]
            queries_src = data_file['queries'][lang]
            prepared = preparer.prepare_inference_data(corpus_src, queries_src, lang)
            corpus_file[lang], queries_file[lang] = prepared

        return langs, corpus_file, queries_file

    def inference(self, langs, corpus_file, queries_file):
        """
        Load the prepared data of every language and embed the job titles.

        Returns ``(corpus, queries, inferencer)``; each per-language dict gains
        an ``embedding`` entry.
        """
        print("Bắt đầu inference.....")
        corpus, queries = {}, {}
        for lang in langs:
            print(f"Load data {lang}:.....")
            loaded = Dataset.load_inference_data(corpus_file[lang], queries_file[lang])
            corpus[lang], queries[lang] = loaded

        inferencer = Inference(self.model)
        for lang in langs:
            print(f"Inference {lang}:.....")
            lang_corpus, lang_queries = corpus[lang], queries[lang]
            # Embed both sides first, then attach the results.
            corpus_vectors = inferencer.embed(lang_corpus['jobtitle'])
            query_vectors = inferencer.embed(lang_queries['jobtitle'])
            lang_corpus['embedding'] = corpus_vectors
            lang_queries['embedding'] = query_vectors

        return corpus, queries, inferencer

    def predict(self, langs, corpus, queries, inferencer):
        """
        Rank the corpus for the queries of every language.
        """
        print("Bắt đầu dự đoán:.....")
        predictions = {}
        for lang in langs:
            print(f"Dự đoán {lang}:.....")
            predictions[lang] = inferencer.infer(corpus[lang], queries[lang])

        return predictions

    def zip_directory(self, zip_filename, dir_name):
        """
        Compress ``dir_name`` into ``zip_filename``, one file after another.
        """
        print(f"Đang nén thư mục {dir_name} thành {zip_filename}...")
        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
            for current_dir, _subdirs, filenames in os.walk(dir_name):
                for filename in filenames:
                    absolute = os.path.join(current_dir, filename)
                    # Store paths relative to dir_name to keep its layout.
                    archive.write(absolute, os.path.relpath(absolute, dir_name))

        print(f"File zip đã được tạo: {zip_filename}")

    def save_predictions(self, langs, predictions):
        """
        Write one run file per language into a timestamped folder and zip it.

        Returns ``(predictions_file, zip_filename)``.
        """
        print("Bắt đầu xuất file:....")
        folder_name = f"/kaggle/working/talent_clef/predict/{self.model_name}/{Timer.get()}"
        os.makedirs(folder_name, exist_ok=True)
        predictions_file = {
            lang: utils.write_predictions(predictions[lang], folder_name, lang)
            for lang in langs
        }

        # Archive the folder once all run files are written.
        zip_filename = folder_name + ".zip"
        self.zip_directory(zip_filename, folder_name)

        return predictions_file, zip_filename

    def evaluate(self, langs, predictions_file, data):
        """
        Score the run file of every language against ``data['qrels']``.
        """
        print("Bắt đầu đánh giá:.....")
        ratings = {}
        for lang in langs:
            print(f"Đánh giá {lang}:.....")
            ratings[lang] = Evaluate.evaluate(predictions_file[lang], data['qrels'][lang])
        return ratings

    def __call__(self, data_file):
        """
        Run the whole pipeline and return the per-language ratings.
        """
        langs, corpus_file, queries_file = self.prepare_data(data_file)
        corpus, queries, inferencer = self.inference(langs, corpus_file, queries_file)
        predictions = self.predict(langs, corpus, queries, inferencer)
        predictions_file, zip_filename = self.save_predictions(langs, predictions)
        return self.evaluate(langs, predictions_file, data_file)

# SetupEnvironment

In [10]:
from kaggle_secrets import UserSecretsClient

def load_git_workspace_wandb():
    """Clone the project repository with the stored GitHub token and log in to W&B.

    Secrets ``github`` and ``wandb`` are read from the Kaggle secrets store.
    A failed clone is reported on stdout and does not stop the function, so
    the W&B login still runs. The token is masked in everything that is
    printed, so it does not end up in the saved notebook output.
    """
    user_secrets = UserSecretsClient()
    git_token = user_secrets.get_secret("github")
    wandp_api = user_secrets.get_secret("wandb")

    def _redact(text):
        """Mask the GitHub token in captured git output before printing it."""
        if text and git_token:
            return text.replace(git_token, "***")
        return text

    # The token is embedded in the URL so the clone needs no interactive prompt.
    repo_url = f"https://hoivd:{git_token}@github.com/hoivd/talent_clef"

    command = ["git", "clone", repo_url]

    try:
        # Run the clone and wait for it to finish.
        result = subprocess.run(command, check=True, text=True, capture_output=True)
        print("Clone thành công!")
        print("Stdout:", _redact(result.stdout))
        print("Stderr:", _redact(result.stderr))  # git reports progress on stderr
    except subprocess.CalledProcessError as e:
        print("Lỗi khi clone repository:")
        print(_redact(e.stderr))

    # Log in to Weights & Biases.
    wandb.login(key=wandp_api)

# Utils

In [11]:
class utils:
    """File helpers shared by the notebook: CSV / TSV reading, CSV and TREC writing."""

    @staticmethod
    def read_csv(input_path, columns=None):
        """Read a UTF-8 CSV file into a DataFrame.

        Args:
            input_path: path of the CSV file; must not be None.
            columns: optional column names, passed to pandas as ``names``.

        Raises:
            ValueError: when ``input_path`` is None.
            Exception: any read error is printed and re-raised.
        """
        print("Đọc csv file:")
        if input_path is None:
            raise ValueError("input_path không được để trống (None). Vui lòng cung cấp đường dẫn file CSV.")

        reader_options = {"encoding": "utf-8"}
        if columns is not None:
            reader_options["names"] = columns

        try:
            frame = pd.read_csv(input_path, **reader_options)
            print(f"Đọc dữ liệu từ {input_path} thành công")
            return frame
        except Exception as err:
            print(f"Lỗi khi đọc dữ liệu từ {input_path}: {err}")
            raise

    @staticmethod
    def read_tsv(input_path, columns=None):
        """Read a UTF-8 tab-separated file into a DataFrame and print its head.

        Args:
            input_path: path of the TSV file; must not be None.
            columns: optional column names, passed to pandas as ``names``.

        Raises:
            ValueError: when ``input_path`` is None.
            Exception: any read error is printed and re-raised.
        """
        print("Đọc tsv file:")
        if input_path is None:
            raise ValueError("input_path không được để trống (None). Vui lòng cung cấp đường dẫn file TSV.")

        reader_options = {"sep": '\t', "encoding": "utf-8"}
        if columns is not None:
            reader_options["names"] = columns

        try:
            frame = pd.read_csv(input_path, **reader_options)
            print(f"Đọc dữ liệu từ {input_path} thành công")
            print(frame.head())
            return frame
        except Exception as err:
            print(f"Lỗi khi đọc dữ liệu từ {input_path}: {err}")
            raise

    @staticmethod
    def write_csv(df, output_path):
        """Write ``df`` to ``output_path`` as UTF-8 CSV without the index; return the path."""
        try:
            df.to_csv(output_path, sep=',', encoding='utf-8', index=False)
            print(f"Đã xuất dữ liệu ra {output_path}")
        except Exception as err:
            print(f"Lỗi khi xuất file CSV: {err}")
            raise
        else:
            return output_path

    @staticmethod
    def write_predictions(predictions, folder_name, lang):
        """Write ranked rows to ``<folder_name>/run_<lang>.trec`` and return that path.

        ``predictions`` is an iterable of per-query row lists; every row is
        written as its items joined by single spaces.
        """
        output_path = f"{folder_name}/run_{lang}.trec"
        try:
            with open(output_path, 'w', encoding='utf-8') as handle:
                for query_rows in predictions:
                    handle.writelines(
                        " ".join(map(str, row)) + "\n" for row in query_rows
                    )
            print(f"Đã xuất file TREC ra {output_path}")
        except Exception as err:
            print(f"Lỗi khi xuất file TREC: {err}")
            raise
        return output_path

# Evaluate

In [12]:
class Evaluate:
    """Runs the TalentCLEF evaluation script and parses the metrics it prints."""

    @staticmethod
    def evaluate(predictions_path, qrels_path):
        """Run the evaluation script on a run file and return its metrics.

        Args:
            predictions_path: path of the run file, passed as ``--run``.
            qrels_path: path of the relevance judgements, passed as ``--qrels``.

        Returns:
            The dict produced by ``extract_metrics``.

        Raises:
            RuntimeError: when the script exits with a non-zero status; the
                message carries the script's stderr.
        """
        command = ["python", "/kaggle/working/talentclef25_evaluation_script/talentclef_evaluate.py", "--qrels", qrels_path, "--run", predictions_path]
        result = subprocess.run(command, capture_output=True, text=True)
        print(result.stdout)

        # Surface the script's own error instead of failing later while parsing
        # an empty stdout.
        if result.returncode != 0:
            raise RuntimeError(
                f"Evaluation script failed with exit code {result.returncode}:\n{result.stderr}"
            )

        return Evaluate.extract_metrics(result)

    @staticmethod
    def extract_metrics(result, language="en-en"):
        """Parse ``<metric>: <value>`` lines from ``result.stdout`` into floats.

        Args:
            result: object with a ``stdout`` string attribute.
            language: accepted for compatibility; not used by the parser.

        Returns:
            Dict with keys ``map``, ``mrr``, ``ndcg``, ``precision@5``,
            ``precision@10`` and ``precision@100``.

        Raises:
            IndexError: when a metric label is missing from the output.
        """
        stdout = result.stdout
        metric_names = ("map", "mrr", "ndcg", "precision@5", "precision@10", "precision@100")
        # The value is whatever follows "<name>: " up to the end of that line.
        return {
            name: float(stdout.split(f"{name}: ")[1].split("\n")[0])
            for name in metric_names
        }

# Timer

In [13]:
class Timer:
    """Timestamp helper used to name run folders and log rows."""

    @staticmethod
    def get():
        """Return the current time in Asia/Ho_Chi_Minh formatted as ``MM-DD_HH-MM-SS``."""
        local_zone = pytz.timezone('Asia/Ho_Chi_Minh')

        # Take the current instant in UTC, then express it in the local zone.
        now_local = datetime.now(pytz.utc).astimezone(local_zone)

        return now_local.strftime("%m-%d_%H-%M-%S")

# Call the helper and print the result.
print(Timer.get())

04-25_00-38-02


# ModelLogger

In [14]:
class ModelLogger:
    """Appends one row per experiment (model, loss, metrics, timing) to a CSV log."""

    def __init__(self, model_name, loss_function, num_epochs, metrics, notes="", training_time=None,
                 folder="/kaggle/working/talent_clef/results", file_name="model_results.csv"):
        self.model_name = model_name
        self.loss = loss_function
        self.epochs = num_epochs
        # Mapping of language pair -> metrics dict.
        self.metrics = metrics
        self.notes = notes
        self.training_time = training_time
        self.folder = folder
        self.file_path = os.path.join(folder, file_name)

    def compute_average_map(self):
        """Return the mean of the ``map`` values across languages, or None if there are none."""
        map_scores = [scores['map'] for scores in self.metrics.values() if 'map' in scores]
        if not map_scores:
            return None
        return sum(map_scores) / len(map_scores)

    def to_dict(self):
        """Return the log row as a dict of single-element lists (one DataFrame row)."""
        row = {
            "model_name": [self.model_name],
            "Avg result": [self.compute_average_map()],
        }
        # One column per language pair; pairs without metrics are left empty.
        for pair in ("en-en", "es-es", "de-de", "zh-zh", "en-es", "en-de", "en-zh"):
            row[f"{pair} result"] = [self.metrics.get(pair, "")]
        row["loss"] = [self.loss]
        row["epochs"] = [self.epochs]
        row["training_time (s)"] = [self.training_time]
        row["date"] = [Timer.get()]
        row["notes"] = [self.notes]
        return row

    def save(self):
        """Append the row to the log file, creating the folder and file when missing."""
        os.makedirs(self.folder, exist_ok=True)

        df_new = pd.DataFrame(self.to_dict())

        if not os.path.exists(self.file_path):
            df_new.to_csv(self.file_path, index=False)
            print(f"✅ Đã tạo file mới: {self.file_path}")
            return

        history = pd.read_csv(self.file_path)
        pd.concat([history, df_new], ignore_index=True).to_csv(self.file_path, index=False)
        print(f"✅ Đã thêm dữ liệu vào file: {self.file_path}")

    def show_log(self):
        """Print the whole log file, or a notice when it does not exist yet."""
        if not os.path.exists(self.file_path):
            print("⚠️ Chưa có file log để hiển thị.")
            return

        print("\n📄 Nội dung file log:")
        log_df = utils.read_csv(self.file_path)
        print(log_df)

# Hàm thực thi

## 1. Clone data

In [15]:
# Environment setup: authenticate with Hugging Face, fetch the model
# repository, the project repository and the evaluation script.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
huggingface_api = user_secrets.get_secret("huggingface")

# Log in with the stored token, then clone the model repo into /kaggle/working/models.
!huggingface-cli login --token {huggingface_api}
!git clone https://huggingface.co/hoivinh20789/talent_clef /kaggle/working/models
# Clones the project repository and logs in to W&B.
load_git_workspace_wandb()

# Evaluation script and its requirements.
!git clone https://github.com/TalentCLEF/talentclef25_evaluation_script.git
!pip install -r "/kaggle/working/talentclef25_evaluation_script/requirements.txt"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `kaggle` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `kaggle`
Cloning into '/kaggle/working/models'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 93 (delta 29), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (93/93), 632.43 KiB | 2.69 MiB/s, done.
Filtering content: 100% (15/15), 2.33 GiB | 119.70 MiB/s, done.


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Clone thành công!
Stdout: 
Stderr: Cloning into 'talent_clef'...
Updating files:  70% (78/110)
Updating files:  71% (79/110)
Updating files:  72% (80/110)
Updating files:  73% (81/110)
Updating files:  74% (82/110)
Updating files:  75% (83/110)
Updating files:  76% (84/110)
Updating files:  77% (85/110)
Updating files:  78% (86/110)
Updating files:  79% (87/110)
Updating files:  80% (88/110)
Updating files:  81% (90/110)
Updating files:  82% (91/110)
Updating files:  83% (92/110)
Updating files:  84% (93/110)
Updating files:  85% (94/110)
Updating files:  86% (95/110)
Updating files:  87% (96/110)
Updating files:  88% (97/110)
Updating files:  89% (98/110)
Updating files:  90% (99/110)
Updating files:  91% (101/110)
Updating files:  92% (102/110)
Updating files:  93% (103/110)
Updating files:  94% (104/110)
Updating files:  95% (105/110)
Updating files:  96% (106/110)
Updating files:  97% (107/110)
Updating files:  98% (108/110)
Updating files:  99% (109/110)
Updating files: 100% (110/

[34m[1mwandb[0m: Currently logged in as: [33mhoivinh20789[0m ([33mhoivinh20789-uit[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Cloning into 'talentclef25_evaluation_script'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 27 (delta 10), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (27/27), 10.10 KiB | 10.10 MiB/s, done.
Resolving deltas: 100% (10/10), done.
Collecting ranx (from -r /kaggle/working/talentclef25_evaluation_script/requirements.txt (line 2))
  Downloading ranx-0.3.20-py3-none-any.whl.metadata (17 kB)
Collecting ir-datasets (from ranx->-r /kaggle/working/talentclef25_evaluation_script/requirements.txt (line 2))
  Downloading ir_datasets-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting lz4 (from ranx->-r /kaggle/working/talentclef25_evaluation_script/requirements.txt (line 2))
  Downloading lz4-4.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting cbor2 (from ranx->-r /kaggle/working/talentclef25_evaluation_scr

## 2. Training

In [16]:
# Build the training table from the generated source data and the pair file;
# the combined frame is also written to disk and its path kept in train_file.
source_file = "/kaggle/working/talent_clef/data/gen_data/train_data_gen.csv"
neg_pair_file = "/kaggle/working/talent_clef/data/random_neg_sample/random_neg_example.csv"
preparer = DataPreparer()
train_df, train_file = preparer.prepare_train_data(neg_pair_file, source_file)

Đọc dữ liệu train_org:
                       jobtitle  \
0    director of technical arts   
0          technical supervisor   
0             technical manager   
0  head of technical department   
0            technical director   

                                               skill  label  
0  Overview: The essential skills for performing ...      1  
0  Overview: The essential skills for performing ...      1  
0  Overview: The essential skills for performing ...      1  
0  Overview: The essential skills for performing ...      1  
0  Overview: The essential skills for performing ...      1  
                     jobtitle  \
0  director of technical arts   
1  director of technical arts   
2  director of technical arts   
3  director of technical arts   
4  director of technical arts   

                                               skill  label  
0  Overview: The skills landscape encompasses a r...      0  
1  Overview: The performing arts demand a diverse...      0  
2  Overvi

In [17]:
# train_df is passed directly, so train_file is not re-read here.
train_data = Dataset.load_train_data(train_file, train_df)
dataset = Dataset(train_data)

# model_path is not None, so the encoder is loaded from this local checkpoint;
# model_name is only used to build the output path below.
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model_path = '/kaggle/working/models/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/pos_epoch6_random_neg_pair_epoch8_symetric_loss'
# NOTE(review): MultipleNegativesSymmetricRankingLoss is documented for
# (anchor, positive) pairs and, as far as its documentation states, does not
# read InputExample.label. The training table also contains label == 0 rows,
# which would then be treated as positive pairs — confirm this is intended.
loss = losses.MultipleNegativesSymmetricRankingLoss
# NOTE(review): the checkpoint loaded above is tagged "epoch8" and this run adds
# 6 epochs, giving the "epoch14" tag — presumably cumulative; verify.
detail = "pos_epoch6_random_neg_pair_epoch14_symetric_loss"
params = {
    "num_epochs": 6,
    "output_path": f"/kaggle/working/models/{model_name}/{detail}",
    "warmup_steps": 100,
    "scheduler":'WarmupCosine',  # ← changed here
    "optimizer_params":{'lr': 5e-5},  # ← slightly increased learning rate
}

trainer = Trainer(model_name, model_path)
# dataset.data is the plain list of InputExample objects.
# model_path is rebound to the output path returned by train(); later cells
# rely on this new value.
model, model_path = trainer.train(dataset.data, loss, params)

Device sử dụng: cuda
Tải mô hình từ đường dẫn cục bộ: /kaggle/working/models/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/pos_epoch6_random_neg_pair_epoch8_symetric_loss
Mô hình đã được khởi tạo thành công!
Khởi tạo dataset:
Bắt đầu train: 


[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250424_174139-1anxgke0[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcheckpoints/model[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/hoivinh20789-uit/sentence-transformers[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/hoivinh20789-uit/sentence-transformers/runs/1anxgke0[0m


Step,Training Loss
500,0.3903
1000,0.4137
1500,0.4357
2000,0.4509
2500,0.4534
3000,0.4543
3500,0.4594
4000,0.4587
4500,0.4633
5000,0.4711


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

## 3. Push Model

In [18]:
# Show the directory the fine-tuned model was written to.
print(model_path)

/kaggle/working/models/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/pos_epoch6_random_neg_pair_epoch14_symetric_loss


In [19]:
repo_id = "hoivinh20789/talent_clef"  # Target repository on Hugging Face
folder_path = params['output_path']  # Local folder that holds the trained model
target_folder = f"{model_name}/{detail}"  # Sub-folder of the repo to upload into

api = HfApi()

# Upload the model folder into target_folder of the repository.
api.upload_folder(
    repo_id=repo_id,
    folder_path=folder_path,  # Folder with the model files
    path_in_repo=target_folder,  # Destination sub-folder inside the repo
    commit_message="Upload model version 1",
)


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hoivinh20789/talent_clef/commit/2e2a555070049d9c10ae99b0dde11263f296a097', commit_message='Upload model version 1', commit_description='', oid='2e2a555070049d9c10ae99b0dde11263f296a097', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hoivinh20789/talent_clef', endpoint='https://huggingface.co', repo_type='model', repo_id='hoivinh20789/talent_clef'), pr_revision=None, pr_num=None)

## 4. RetrievalApp

In [20]:
# Confirm which local model directory is passed to RetrievalApp in the next cell.
print(model_path)

/kaggle/working/models/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/pos_epoch6_random_neg_pair_epoch14_symetric_loss


In [21]:
# Evaluation inputs for Task A (validation split), keyed by language pair.
# Only "en-en" is active. To evaluate another language, uncomment its entry in
# all three sections (corpus, queries, qrels).
data = dict(
    corpus={
        "en-en": "/kaggle/working/talent_clef/data/TaskA/validation/english/corpus_elements",
        # "de-de": "/kaggle/working/talent_clef/data/TaskA/validation/german/corpus_elements",
        # "es-es": "/kaggle/working/talent_clef/data/TaskA/validation/spanish/corpus_elements",
        # "zh-zh": "/kaggle/working/talent_clef/data/TaskA/validation/chinese/corpus_elements",
    },
    queries={
        "en-en": "/kaggle/working/talent_clef/data/TaskA/validation/english/queries",
        # "de-de": "/kaggle/working/talent_clef/data/TaskA/validation/german/queries",
        # "es-es": "/kaggle/working/talent_clef/data/TaskA/validation/spanish/queries",
        # "zh-zh": "/kaggle/working/talent_clef/data/TaskA/validation/chinese/queries",
    },
    qrels={
        "en-en": "/kaggle/working/talent_clef/data/TaskA/validation/english/qrels.tsv",
        # "de-de": "/kaggle/working/talent_clef/data/TaskA/validation/german/qrels.tsv",
        # "es-es": "/kaggle/working/talent_clef/data/TaskA/validation/spanish/qrels.tsv",
        # "zh-zh": "/kaggle/working/talent_clef/data/TaskA/validation/chinese/qrels.tsv",
    },
)

model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Build the app from the base model name and the local model path, then call it
# on `data`; the returned per-language metrics are kept in `ratings`.
app = RetrievalApp(model_name, model_path)
ratings = app(data)

Device sử dụng: cuda
Tải mô hình từ đường dẫn cục bộ: /kaggle/working/models/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/pos_epoch6_random_neg_pair_epoch14_symetric_loss
Mô hình đã được khởi tạo thành công!
Load mô hình.....
Chuẩn bị data: ......
Chuẩn bị data en-en:.....
Đọc dữ liệu inference:
Đọc tsv file:
Đọc dữ liệu từ /kaggle/working/talent_clef/data/TaskA/validation/english/corpus_elements thành công
   c_id                          jobtitle
0     1                recording engineer
1     2              director of taxation
2     3  technical support representative
3     4                        hr manager
4     5           computer graphic artist
Đọc tsv file:
Đọc dữ liệu từ /kaggle/working/talent_clef/data/TaskA/validation/english/queries thành công
   q_id             jobtitle
0     1                nanny
1     2    food technologist
2     3   broadcast engineer
3     4  automation engineer
4     5         veterinarian
Xuất dữ liệu inference:
Đã xuất dữ liệu ra

Batches:   0%|          | 0/82 [00:00<?, ?it/s]

Bắt đầu chạy embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Bắt đầu dự đoán:.....
Dự đoán en-en:.....


Processing queries: 100%|██████████| 105/105 [00:07<00:00, 14.01it/s]


Bắt đầu xuất file:....
Đã xuất file TREC ra /kaggle/working/talent_clef/predict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21/run_en-en.trec
Đang nén thư mục /kaggle/working/talent_clef/predict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21 thành /kaggle/working/talent_clef/predict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21.zip...
File zip đã được tạo: /kaggle/working/talent_clef/predict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21.zip
Bắt đầu đánh giá:.....
Đánh giá en-en:.....
Received parameters:
  qrels: /kaggle/working/talent_clef/data/TaskA/validation/english/qrels.tsv
  run: /kaggle/working/talent_clef/predict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21/run_en-en.trec
Loading qrels...
Loading run...
Running evaluation...

=== Evaluation Results ===
map: 0.5290
mrr: 0.8842
ndcg: 0.7978
precision@5: 0.7067
precision@10: 0.6019
precis

In [22]:
# Display the metrics dict returned by the RetrievalApp call (keyed by language pair).
ratings

{'en-en': {'map': 0.529,
  'mrr': 0.8842,
  'ndcg': 0.7978,
  'precision@5': 0.7067,
  'precision@10': 0.6019,
  'precision@100': 0.1748}}

## 5. Log Model

In [23]:
# Record this run's evaluation metrics through ModelLogger.
# loss_function and num_epochs are passed as None for this entry.
logger = ModelLogger(
    model_name=model_name,
    metrics=ratings,
    loss_function=None,
    num_epochs=None,
    notes="thêm negative pair và finetune tiếp từ epoch 5",
)

# Write the entry, then print the log contents.
logger.save()
logger.show_log()

✅ Đã tạo file mới: /kaggle/working/talent_clef/results/model_results.csv

📄 Nội dung file log:
Đọc csv file:
Đọc dữ liệu từ /kaggle/working/talent_clef/results/model_results.csv thành công
                                          model_name  Avg result  \
0  sentence-transformers/paraphrase-multilingual-...       0.529   

                                        en-en result  es-es result  \
0  {'map': 0.529, 'mrr': 0.8842, 'ndcg': 0.7978, ...           NaN   

   de-de result  zh-zh result  en-es result  en-de result  en-zh result  loss  \
0           NaN           NaN           NaN           NaN           NaN   NaN   

   epochs  training_time (s)            date  \
0     NaN                NaN  04-25_04-37-14   

                                            notes  
0  thêm negative pair và finetune tiếp từ epoch 5  


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


# Git Push

In [24]:
cd talent_clef

/kaggle/working/talent_clef


In [25]:
# Name of the git branch for this run: "<model_name>/<value of Timer.get()>".
brand_name = f"{model_name}/{Timer.get()}"

In [26]:
# Inspect the branch name that the git cell below will create and push.
brand_name

'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-37-14'

In [27]:
# Commit everything new in the working tree to a fresh branch named
# `brand_name` and push that branch to origin.
!git config --global user.email "hoivd79@gmail.com"
!git config --global user.name "Dang Vinh Hoi"
!git checkout -b {brand_name}      # Create and switch to the new branch named by brand_name
!git status
!git add .
!git commit -m "updated"
!git push -u origin {brand_name}    # First push; also sets up upstream tracking

Switched to a new branch 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-37-14'
On branch sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-37-14
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mpredict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21.zip[m
	[31mpredict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21/[m
	[31mresults/[m

nothing added to commit but untracked files present (use "git add" to track)
[sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-37-14 249bccf] updated
 3 files changed, 274997 insertions(+)
 create mode 100644 predict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21.zip
 create mode 100644 predict/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/04-25_04-36-21/run_en-en.trec
 create mode 100644 results/model_results.csv
Enumerating objects: 14, done

In [28]:
cd ..


/kaggle/working
