In [1]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu123

Looking in indexes: https://download.pytorch.org/whl/cu123Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [2]:
import torch
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
# Load the passage corpus first, then the training questions, from the working directory.
corpus_df, train_df = (pd.read_csv(csv_name) for csv_name in ('corpus.csv', 'train.csv'))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def _parse_cid_list(raw):
    """Convert one raw ``cid`` cell into a list of ints.

    Accepts the bracketed string form read from the CSV (for example
    ``"[12 345]"``), with ids separated by whitespace and/or commas.
    A value that is already a sequence of ids is passed through as ints,
    so re-running this cell on an already-parsed column does not fail.
    """
    if isinstance(raw, str):
        # Drop surrounding whitespace and brackets, then treat commas like spaces
        inner = raw.strip().strip('[]')
        return [int(token) for token in inner.replace(',', ' ').split()]
    return [int(cid) for cid in raw]


train_df['cid'] = train_df['cid'].apply(_parse_cid_list)

In [None]:
import torch
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers.util import cos_sim

class CustomDataset(Dataset):
    """Map-style dataset pairing each training question with embedded positives and hard negatives.

    Each item is built from one row of ``train_df`` (columns ``question``,
    ``qid`` and ``cid``, where ``cid`` is a list of positive passage ids) and
    from ``corpus_df`` (columns ``cid`` and ``text``).

    NOTE(review): every embedding is produced with ``model.encode``, which is
    an inference helper; the returned tensors presumably carry no autograd
    graph, so a loss computed directly on them cannot update the model.
    Confirm before training on these items.
    NOTE(review): items hold a variable number of positives and negatives, so
    the default DataLoader collation may fail for batch sizes above one.
    """

    def __init__(self, train_df, corpus_df, model, max_length=128):
        """Store the frames and the encoder.

        Args:
            train_df: frame with one question per row.
            corpus_df: frame with one passage per row.
            model: object exposing ``encode(texts, convert_to_tensor=True)``.
            max_length: stored on the instance; not used by this class.
        """
        self.train_df = train_df
        self.corpus_df = corpus_df
        self.model = model
        self.max_length = max_length

    def __len__(self):
        """Return the number of training questions."""
        return len(self.train_df)

    def __getitem__(self, idx):
        """Return the embedded query, positives and hard negatives for row ``idx``.

        Returns:
            dict with ``'query'`` (one embedding), ``'positives'`` (list with
            one embedding per positive passage) and ``'negatives'`` (embeddings
            of the mined hard negatives).

        Raises:
            ValueError: if none of the row's positive ids exists in the corpus.
        """
        row = self.train_df.iloc[idx]
        question = row['question']
        qid = row['qid']
        positive_cids = row['cid']  # list of positive passage ids

        # Texts of the positive passages
        is_positive = self.corpus_df['cid'].isin(positive_cids)
        positive_corpus = self.corpus_df.loc[is_positive, 'text'].tolist()
        if not positive_corpus:
            # Fail with a readable message instead of an obscure shape error downstream
            raise ValueError(f"qid {qid}: none of the positive cids {positive_cids} is in the corpus")

        # Mine hard negatives for this question
        hard_negatives = self.get_hard_negatives(positive_cids, positive_corpus)

        # Encode the question and the passages; positives are encoded in one batch
        query_enc = self.model.encode(question, convert_to_tensor=True)
        positive_matrix = self.model.encode(positive_corpus, convert_to_tensor=True)
        positive_encs = list(positive_matrix.unbind(0))
        negative_encs = self.model.encode(hard_negatives, convert_to_tensor=True)

        return {
            'query': query_enc,
            'positives': positive_encs,  # one embedding per positive passage
            'negatives': negative_encs,
        }

    def get_hard_negatives(self, positive_cids, positive_corpus, top_k=10):
        """Return the texts of the non-positive passages most similar to each positive.

        Args:
            positive_cids: ids of the positive passages (excluded from the candidates).
            positive_corpus: texts of the positive passages.
            top_k: number of negatives to keep per positive passage.

        Returns:
            A flat list of passage texts: for each positive, its ``top_k``
            highest-cosine-similarity non-positive passages, in ascending
            similarity. Duplicates are possible when positives share
            neighbours. Empty when there are no positives, no candidate
            negatives, or ``top_k`` is not positive.

        NOTE(review): every call encodes all candidate negatives again, which
        is expensive for a large corpus.
        """
        # Every corpus row whose cid is not a positive is a candidate negative
        is_negative = ~self.corpus_df['cid'].isin(positive_cids)
        negative_corpus = self.corpus_df.loc[is_negative, 'text'].tolist()

        # Nothing to rank; also avoids `[-0:]`, which would select the whole corpus
        if not positive_corpus or not negative_corpus or top_k <= 0:
            return []

        positive_embeddings = self.model.encode(positive_corpus, convert_to_tensor=True)
        negative_embeddings = self.model.encode(negative_corpus, convert_to_tensor=True)

        # One row of similarities per positive passage
        similarities = cos_sim(positive_embeddings, negative_embeddings)

        top_hard_negatives = []
        for sim in similarities:
            # Indices of the top_k most similar negatives (ascending similarity)
            top_k_idx = np.argsort(sim.cpu().numpy())[-top_k:]
            top_hard_negatives.extend(negative_corpus[i] for i in top_k_idx)

        return top_hard_negatives

# Loss function (revised)
def contrastive_loss(query_embeddings, positive_embeddings_list, negative_embeddings, margin=1.0):
    """Margin loss that pulls queries toward positives and pushes them from negatives.

    Computes ``relu(mean_positive_distance - negative_distance + margin)`` with
    Euclidean distances and averages it over every query/negative pair.

    Args:
        query_embeddings: tensor of shape ``(D,)`` or ``(B, D)``.
        positive_embeddings_list: non-empty sequence of tensors broadcastable
            against ``query_embeddings``; their distances are averaged.
        negative_embeddings: tensor with the same number of dimensions as the
            query, or with one extra "negatives per query" axis before the
            embedding axis (``(K, D)`` for a ``(D,)`` query, ``(B, K, D)`` for
            a ``(B, D)`` query).
        margin: minimum gap required between positive and negative distance.

    Returns:
        Scalar tensor holding the mean loss.

    Raises:
        ValueError: if ``positive_embeddings_list`` is empty.
    """
    if len(positive_embeddings_list) == 0:
        raise ValueError("contrastive_loss needs at least one positive embedding")

    # Mean distance from each query to its positives
    positive_distances = [
        F.pairwise_distance(query_embeddings, positive_embeddings, p=2)
        for positive_embeddings in positive_embeddings_list
    ]
    avg_pos_loss = sum(positive_distances) / len(positive_distances)

    anchor = query_embeddings
    if negative_embeddings.dim() == query_embeddings.dim() + 1:
        # Several negatives per query: add a broadcast axis so every query is
        # compared with its own negatives rather than with another row's.
        anchor = query_embeddings.unsqueeze(-2)
        avg_pos_loss = avg_pos_loss.unsqueeze(-1)
    neg_loss = F.pairwise_distance(anchor, negative_embeddings, p=2)

    # Hinge on the gap between positive and negative distance
    loss = F.relu(avg_pos_loss - neg_loss + margin).mean()
    return loss

from sentence_transformers import SentenceTransformer, models

# Build the sentence encoder: a PhoBERT backbone followed by mean pooling over tokens.
word_embedding_model = models.Transformer("vinai/phobert-base", max_seq_length=128)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model.to(device)



# Dataset and DataLoader
dataset = CustomDataset(train_df, corpus_df, model)
# The loader batches row indices only. Items returned by the dataset hold a
# variable number of embeddings, which default collation cannot stack, and
# those embeddings come from model.encode, so they carry no gradient.
dataloader = DataLoader(range(len(dataset)), batch_size=2)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)


def _embed_with_grad(texts):
    """Embed a list of texts through the model's forward pass so gradients reach its weights."""
    features = model.tokenize(texts)
    features = {
        name: value.to(device) if torch.is_tensor(value) else value
        for name, value in features.items()
    }
    return model(features)['sentence_embedding']


# Training loop
for epoch in range(5):
    epoch_loss = 0
    for row_indices in dataloader:
        example_losses = []
        for row_idx in row_indices.tolist():
            row = train_df.iloc[row_idx]
            positive_cids = row['cid']
            positive_texts = corpus_df.loc[corpus_df['cid'].isin(positive_cids), 'text'].tolist()
            if not positive_texts:
                raise ValueError(f"qid {row['qid']} has no positive passage in corpus_df")

            # Mining runs in inference mode through model.encode.
            # NOTE(review): it encodes all candidate negatives on every call,
            # which is expensive for a large corpus.
            negative_texts = dataset.get_hard_negatives(positive_cids, positive_texts)

            # encode() leaves the model in eval mode, so switch back before the forward pass
            model.train()
            query_embeddings = _embed_with_grad([row['question']])          # (1, D)
            positive_embeddings_list = list(_embed_with_grad(positive_texts).split(1))  # each (1, D)
            negative_embeddings = _embed_with_grad(negative_texts)           # (K, D)

            example_losses.append(
                contrastive_loss(query_embeddings, positive_embeddings_list, negative_embeddings)
            )

        # Average the per-question losses of this batch
        loss = torch.stack(example_losses).mean()
        epoch_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {epoch_loss / len(dataloader):.4f}")


