In [1]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu123

Looking in indexes: https://download.pytorch.org/whl/cu123
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import torch
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
corpus_df = pd.read_csv('corpus.csv')
train_df = pd.read_csv('train.csv')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Đầu tiên, loại bỏ dấu ngoặc vuông và chuyển từng phần tử thành int
train_df['cid'] = train_df['cid'].apply(lambda x: list(map(int, x.strip('[]').split())))

In [4]:
import torch
import numpy as np
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

class CustomDataset(Dataset):
    def __init__(self, train_df, corpus_df, tokenizer, model, max_length=128):
        self.train_df = train_df
        self.corpus_df = corpus_df
        self.tokenizer = tokenizer
        self.model = model
        self.max_length = max_length
        self.device = model.device  # Giả sử model đã được chuyển sang GPU

    def __len__(self):
        return len(self.train_df)

    def __getitem__(self, idx):
        row = self.train_df.iloc[idx]
        question = row['question']
        qid = row['qid']
        positive_cids = row['cid']

        # Lấy corpus tích cực
        positive_corpus = self.corpus_df[self.corpus_df['cid'].isin(positive_cids)]['text'].tolist()
        
        # Chọn hard negatives
        hard_negatives = self.get_hard_negatives(positive_cids, positive_corpus)

        # Tokenize câu hỏi và các corpus
        query_enc = self.tokenizer(question, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt").to(self.device)
        positive_enc = self.tokenizer(positive_corpus, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt").to(self.device)
        negative_enc = self.tokenizer(hard_negatives, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt").to(self.device)

        return {
            'query': query_enc['input_ids'].squeeze(0),
            'positive': positive_enc['input_ids'].squeeze(0),
            'negative': negative_enc['input_ids'].squeeze(0),
            'query_attention': query_enc['attention_mask'].squeeze(0),
            'positive_attention': positive_enc['attention_mask'].squeeze(0),
            'negative_attention': negative_enc['attention_mask'].squeeze(0),
        }

    def get_hard_negatives(self, positive_cids, positive_corpus, top_k=10):
        # Lấy tất cả các CID không phải là positive
        all_cids = set(self.corpus_df['cid'].tolist())
        negative_cids = list(all_cids - set(positive_cids))

        # Encode các corpus tích cực
        positive_embeddings = self.encode_corpus(positive_corpus)

        # Lấy corpus của các negative cids
        negative_corpus = self.corpus_df[self.corpus_df['cid'].isin(negative_cids)]['text'].tolist()

        # Encode các corpus negative
        negative_embeddings = self.encode_corpus(negative_corpus)

        # Tính cosine similarity giữa positive embeddings và negative embeddings
        similarities = self.compute_cosine_similarity(positive_embeddings, negative_embeddings)

        # Chọn top k hard negatives (có độ tương đồng cao nhất)
        top_hard_negatives = []
        for sim in similarities:
            # Lấy các chỉ số của top k negative corpus có độ tương đồng cao nhất
            top_k_idx = torch.argsort(sim, descending=True)[:top_k]  # Sắp xếp và lấy top k
            top_hard_negatives.extend([negative_corpus[idx] for idx in top_k_idx])

        return top_hard_negatives

    def encode_corpus(self, corpus):
        # Encode một list các corpus sử dụng model
        inputs = self.tokenizer(corpus, padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            # Lấy embedding của [CLS] token
            embeddings = self.model(**inputs).last_hidden_state[:, 0, :].to(self.device)
        return embeddings

    def compute_cosine_similarity(self, positive_embeddings, negative_embeddings):
        # Tính cosine similarity trên GPU giữa positive và negative embeddings
        # Đảm bảo rằng embeddings đều trên cùng một thiết bị (GPU)
        positive_embeddings = positive_embeddings.to(self.device)
        negative_embeddings = negative_embeddings.to(self.device)

        # Tính cosine similarity: F.cosine_similarity giữa các embeddings
        similarities = F.cosine_similarity(positive_embeddings.unsqueeze(1), negative_embeddings.unsqueeze(0), dim=-1)
        return similarities

# Load tokenizer và model
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModel.from_pretrained("vinai/phobert-base")  # Hoặc sử dụng model khác nếu cần
# Training loop
device = "cuda"
model.to(device)

# Tạo Dataset và DataLoader
dataset = CustomDataset(train_df, corpus_df, tokenizer, model)
dataloader = DataLoader(dataset, batch_size=2)

# Loss function cho contrastive learning
def contrastive_loss(query_embeddings, positive_embeddings, negative_embeddings, margin=1.0):
    pos_loss = F.pairwise_distance(query_embeddings, positive_embeddings, p=2)
    neg_loss = F.pairwise_distance(query_embeddings, negative_embeddings, p=2)
    loss = F.relu(pos_loss - neg_loss + margin).mean()
    return loss

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)



for epoch in range(5):
    model.train()
    for batch in dataloader:
        query_ids = batch['query'].to(device)  # Ensure on the correct device
        query_attention = batch['query_attention'].to(device)  # Ensure on the correct device
        positive_ids = batch['positive'].to(device)  # Ensure on the correct device
        positive_attention = batch['positive_attention'].to(device)  # Ensure on the correct device
        negative_ids = batch['negative'].to(device)  # Ensure on the correct device
        negative_attention = batch['negative_attention'].to(device)  # Ensure on the correct device

        query_embeddings = model(query_ids, query_attention).last_hidden_state[:, 0, :].to(device)
        positive_embeddings = model(positive_ids, positive_attention).last_hidden_state[:, 0, :].to(device)
        negative_embeddings = model(negative_ids, negative_attention).last_hidden_state[:, 0, :].to(device)

        # Tính loss
        loss = contrastive_loss(query_embeddings, positive_embeddings, negative_embeddings)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
