In [60]:
import json
from collections import OrderedDict
import logging
import os
import pickle
import pathlib
# Keep a handle on the original PosixPath class in `temp`, then alias
# PosixPath to WindowsPath for the rest of the session.
# Presumably this lets objects pickled on a POSIX machine (which embed
# PosixPath instances) be unpickled on Windows -- TODO confirm.
# NOTE(review): `temp` is never used to restore the original class in the
# cells visible here, so the patch stays active for the whole kernel.
temp = pathlib.PosixPath
pathlib.PosixPath = pathlib.WindowsPath

import torch
from torch import nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from datasets import Dataset, load_dataset

from model.model import SentenceTransformersWrapperForLM

# model

In [9]:
# Extract the encoder from SentenceTransformersWrapperForLM.
# The wrapper is rebuilt with the listed hyper-parameters and restores its
# weights from `load_path`; its `.model` attribute is then kept as the encoder
# (presumably the underlying SentenceTransformer -- TODO confirm).
# NOTE(review): absolute, machine-specific path; consider a configurable base dir.
model_load_path = 'E:\\OneDrive - Hanoi University of Science and Technology\\Chuyen nganh\\Deep Learning and Its Applications\\BTL\\save\\models\\1211_113344\\checkpoint-epoch1.pth'
stf_wrapper = SentenceTransformersWrapperForLM(
    model_name='all-MiniLM-L6-v2',
    model_path=None,
    hidden_size=512,
    dropout=0.1,
    vocab_size=32000,
    load_path=model_load_path
)
sentence_transformers_model = stf_wrapper.model



Loaded model from E:\OneDrive - Hanoi University of Science and Technology\Chuyen nganh\Deep Learning and Its Applications\BTL\save\models\1211_113344\checkpoint-epoch1.pth


In [13]:
# Persist only the extracted encoder's weights.
# Note: this writes the bare state_dict returned by `.state_dict()`, not a
# checkpoint dict that wraps it under a 'state_dict' key.
# NOTE(review): absolute, machine-specific path.
model_save_path = 'E:\\OneDrive - Hanoi University of Science and Technology\\Chuyen nganh\\Deep Learning and Its Applications\\BTL\\save\\models\\1211_113344\\sentence_transformers_model.pth'
torch.save(sentence_transformers_model.state_dict(), model_save_path)

In [2]:
# Location of the passage collection TSV (presumably the MS MARCO passage
# ranking collection, judging by the path -- TODO confirm).
# NOTE(review): absolute, machine-specific path.
COLLECTION_PATH = 'E:\\OneDrive - Hanoi University of Science and Technology\\Chuyen nganh\\Deep Learning and Its Applications\\BTL\\data\\MSMACRO\\passage_ranking_dataset\\collection.tsv'

In [32]:
class BERTBiEncoder(nn.Module):
    '''
    Bi-encoder with separate query and passage SentenceTransformer encoders.

    Each encoder is created from a model name or, if no name is given, from a
    model path. Its word-embedding matrix is then grown to `vocab_size` rows
    and, optionally, weights are restored from checkpoints.

    Args:
        query_encoder_name: model name for the query encoder (takes precedence
            over `query_encoder_path`).
        query_encoder_path: model path used when no query encoder name is given.
        passage_encoder_name: model name for the passage encoder (takes
            precedence over `passage_encoder_path`).
        passage_encoder_path: model path used when no passage encoder name is given.
        vocab_size: number of rows the word-embedding matrices are grown to.
        load_path: optional checkpoint for the whole bi-encoder.
        query_encoder_load_path: optional checkpoint for the query encoder only.
        passage_encoder_load_path: optional checkpoint for the passage encoder only.

    Checkpoints may be either a dict holding the weights under the
    'state_dict' key or a bare state_dict as written by
    `torch.save(module.state_dict(), path)`.

    Raises:
        ValueError: if an encoder has neither a name nor a path, or if
            `vocab_size` is smaller than an encoder's current vocabulary.
    '''

    def __init__(
        self,
        query_encoder_name,
        query_encoder_path,
        passage_encoder_name,
        passage_encoder_path,
        vocab_size,
        load_path=None,
        query_encoder_load_path=None,
        passage_encoder_load_path=None,
    ):
        super(BERTBiEncoder, self).__init__()

        self.query_encoder_name = query_encoder_name
        self.query_encoder_path = query_encoder_path
        self.passage_encoder_name = passage_encoder_name
        self.passage_encoder_path = passage_encoder_path
        self.vocab_size = vocab_size

        # Validate both sources up front so a misconfiguration fails before any
        # model is loaded, instead of as an AttributeError on a missing encoder.
        query_source = self._resolve_encoder_source(query_encoder_name, query_encoder_path, 'query')
        passage_source = self._resolve_encoder_source(passage_encoder_name, passage_encoder_path, 'passage')

        self.query_encoder = SentenceTransformer(query_source)
        self._extend_encoder_vocab_size(encoder=self.query_encoder, new_vocab_size=vocab_size)

        self.passage_encoder = SentenceTransformer(passage_source)
        self._extend_encoder_vocab_size(encoder=self.passage_encoder, new_vocab_size=vocab_size)

        if load_path is not None:
            self.load_state_dict(self._read_state_dict(load_path))
            print('Loaded model from', load_path)
        if query_encoder_load_path is not None:
            self.query_encoder.load_state_dict(self._read_state_dict(query_encoder_load_path))
            print('Loaded query encoder from', query_encoder_load_path)
        if passage_encoder_load_path is not None:
            self.passage_encoder.load_state_dict(self._read_state_dict(passage_encoder_load_path))
            print('Loaded passage encoder from', passage_encoder_load_path)

    @staticmethod
    def _resolve_encoder_source(encoder_name, encoder_path, role):
        '''Return the model name if given, else the path; raise ValueError if both are empty.'''
        source = encoder_name or encoder_path
        if not source:
            raise ValueError(
                f'Either {role}_encoder_name or {role}_encoder_path must be provided'
            )
        return source

    @staticmethod
    def _read_state_dict(checkpoint_path):
        '''Load a checkpoint on CPU and return its state_dict (wrapped or bare).'''
        checkpoint = torch.load(checkpoint_path, map_location='cpu')
        if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
            return checkpoint['state_dict']
        return checkpoint

    def forward(self, query_features, passage_features):
        '''
        Embed queries and passages and score every (query, passage) pair.

        Args:
            query_features: feature dict passed to the query encoder.
            passage_features: feature dict passed to the passage encoder.

        Returns:
            dict with
              'scores': cosine similarities, shape (num_queries, num_passages),
              'query_embeddings': the query encoder's 'sentence_embedding' output,
              'passage_embeddings': the passage encoder's 'sentence_embedding' output.
        '''
        # Call the modules rather than .forward so registered hooks run.
        query_embeddings = self.query_encoder(query_features)['sentence_embedding']
        passage_embeddings = self.passage_encoder(passage_features)['sentence_embedding']
        # Broadcast (Q, 1, D) against (1, P, D) to get all pairs in one call;
        # unlike stacking per-query results this also handles an empty query batch.
        scores = F.cosine_similarity(
            query_embeddings.unsqueeze(1),
            passage_embeddings.unsqueeze(0),
            dim=-1,
        )
        outputs = {
            'scores': scores,
            'query_embeddings': query_embeddings,
            'passage_embeddings': passage_embeddings
        }
        return outputs

    def _extend_encoder_vocab_size(self, encoder: 'SentenceTransformer', new_vocab_size=None):
        '''
        Update model embedding layer for larger vocabulary, keeps all the trained embeddings

        Args:
            encoder: encoder whose first module exposes
                `auto_model.embeddings.word_embeddings`.
            new_vocab_size: target number of embedding rows; falls back to
                `self.vocab_size`. If neither is set, or the size already
                matches, the encoder is left untouched.

        Raises:
            ValueError: if the target size is smaller than the current vocabulary.
        '''
        if not new_vocab_size:
            new_vocab_size = self.vocab_size
        if not new_vocab_size:
            return

        embeddings = encoder._first_module().auto_model.embeddings
        old_embedding_layer = embeddings.word_embeddings
        old_vocab_size, embedding_size = old_embedding_layer.weight.shape

        if new_vocab_size == old_vocab_size:
            return
        if new_vocab_size < old_vocab_size:
            raise ValueError(
                f'new_vocab_size ({new_vocab_size}) must not be smaller than the '
                f'current vocabulary size ({old_vocab_size})'
            )

        # Keep padding index, device and dtype so the replacement behaves like
        # the layer it replaces; only the extra rows are freshly initialised.
        new_embedding_layer = nn.Embedding(
            num_embeddings=new_vocab_size,
            embedding_dim=embedding_size,
            padding_idx=old_embedding_layer.padding_idx,
        ).to(
            device=old_embedding_layer.weight.device,
            dtype=old_embedding_layer.weight.dtype,
        )
        with torch.no_grad():
            new_embedding_layer.weight[:old_vocab_size].copy_(old_embedding_layer.weight)
        embeddings.word_embeddings = new_embedding_layer

In [15]:
def read_specific_line_of_file(file_path, line_number, encoding='utf-8'):
    '''
    Return one line of a text file with surrounding whitespace stripped.

    Args:
        file_path: path of the file to read.
        line_number: zero-based index of the wanted line.
        encoding: text encoding used to decode the file. Set explicitly so
            the result does not depend on the platform's locale encoding.

    Returns:
        The stripped line, or None if the file has fewer than
        `line_number + 1` lines or `line_number` is negative.
    '''
    # A negative index can never match; skip scanning the whole file.
    if line_number < 0:
        return None
    with open(file_path, 'r', encoding=encoding) as f:
        for i, line in enumerate(f):
            if i == line_number:
                return line.strip()
    return None

In [None]:
def constrastive_loss(scores, tau=1):
    '''
    Loss for contrastive learning.
    The score of the positive sample must always come first.

    Computes -log(exp(scores[0] / tau) / sum(exp(scores / tau))), where the
    sum runs over every element of `scores`.

    Args:
        scores: tensor of similarity scores with the positive at index 0.
        tau: temperature the scores are divided by.

    Returns:
        The loss as a tensor with the shape of `scores[0]`.
    '''
    scaled_scores = scores / tau
    # log(sum(exp(x))) - x[0] is algebraically the same quantity, but
    # logsumexp does not overflow to inf/inf = nan for large scores or small tau.
    log_normaliser = torch.logsumexp(scaled_scores.reshape(-1), dim=0)
    return log_normaliser - scaled_scores[0]

In [24]:
# Toy batch of two 4-token sequences to probe the encoder's raw outputs.
input_features = dict(
    input_ids=torch.tensor([[1,2,3,4], [2,3,4,5]]),
    attention_mask=torch.tensor([[1,1,1,1], [1,1,1,1]]),
    token_type_ids=torch.tensor([[0,0,0,0], [0,0,0,0]]),
)
out_features = sentence_transformers_model.forward(input_features)

# Regroup the batched outputs into one dict per sentence.
num_sentences = len(out_features['sentence_embedding'])
embeddings = [
    {name: values[sent_idx] for name, values in out_features.items()}
    for sent_idx in range(num_sentences)
]

In [58]:
# Smoke test: build a bi-encoder with both towers starting from the same
# pretrained model and no checkpoint restored (the three *load_path
# arguments are left at their None defaults).
biencoder = BERTBiEncoder(
    query_encoder_name='all-MiniLM-L6-v2',
    query_encoder_path=None,
    passage_encoder_name='all-MiniLM-L6-v2',
    passage_encoder_path=None,
    vocab_size=32000,
)

# Two toy queries and two toy passages, 4 token ids each; every position is
# attended to and belongs to segment 0.
q_token_ids = torch.tensor([[1,3200,3,4], [2,3,11,5]])
p_token_ids = torch.tensor([[1,0,3,4], [2,1,4,5]])
q_input_features = {
    'input_ids': q_token_ids,
    'attention_mask': torch.ones_like(q_token_ids),
    'token_type_ids': torch.zeros_like(q_token_ids),
}
p_input_features = {
    'input_ids': p_token_ids,
    'attention_mask': torch.ones_like(p_token_ids),
    'token_type_ids': torch.zeros_like(p_token_ids),
}
output = biencoder.forward(q_input_features, p_input_features)
output['scores']

tensor([[0.7553, 0.7285],
        [0.8283, 0.9175]], grad_fn=<StackBackward0>)

In [46]:
# Shape of the query embeddings returned by the bi-encoder forward pass.
output['query_embeddings'].shape

torch.Size([2, 384])

In [47]:
# Show only the shape of the passage embeddings (matching the recorded
# output) instead of dumping the full tensor.
output['passage_embeddings'].shape

torch.Size([2, 384])

In [59]:
# Shape experiment: cosine similarity needs queries and passages to share the
# embedding dimension. Here q is 2x5 and p is 2x4, so the call is expected to
# fail. The error is caught and printed so that Restart & Run All is not
# interrupted by this cell.
q = torch.tensor([[1,2,3, 3,4], [2,3, 3,11,5]], dtype=torch.float32)
p = torch.tensor([[1,0,3,4], [2,1,4,5]], dtype=torch.float32)
try:
    print(torch.stack([F.cosine_similarity(q_emb, p) for q_emb in q]))
except RuntimeError as err:
    print(f'RuntimeError: {err}')

RuntimeError: The size of tensor a (5) must match the size of tensor b (4) at non-singleton dimension 1

# dataloader

In [7]:
# Load the pickled training qrels into memory.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load files from a trusted source. The path is absolute and machine-specific.
with open('E:\\OneDrive - Hanoi University of Science and Technology\\Chuyen nganh\\Deep Learning and Its Applications\\BTL\\data\\MSMACRO\\passage_ranking_dataset\\train_qrels_first_100k_with_negatives.pkl', 'rb') as f:
    train_qrels = pickle.load(f)

In [None]:
class MSMarcoConstrastiveLearningDataset(torch.utils.data.Dataset):
    '''
    MS MARCO dataset for contrastive learning.

    Map-style torch dataset backed by a pickled qrels object. The base class
    is spelled out as `torch.utils.data.Dataset` because the bare name
    `Dataset` in this notebook refers to the HuggingFace `datasets.Dataset`.

    Args:
        qrels_path: path to the pickle file holding the qrels.
    '''

    def __init__(
        self,
        qrels_path,
    ):
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only load files from a trusted source.
        with open(qrels_path, 'rb') as f:
            self.qrels = pickle.load(f)

    def __len__(self) -> int:
        return len(self.qrels)

    def __getitem__(self, index: int):
        # TODO: Tokenize the data and perform other preprocessing
        # Assumes the unpickled qrels support integer indexing (e.g. a list
        # of records) -- TODO confirm against the pickle's actual structure.
        return self.qrels[index]