In [1]:
import torch
import torch.nn as nn
from transformers import XLMRobertaModel, XLMRobertaTokenizer

class BGE_M3_Model(nn.Module):
    """XLM-RoBERTa encoder followed by a Linear + Tanh pooler on the first token.

    The pooler is constructed fresh in ``__init__``; no weights are loaded into
    it here, so unless it is trained or loaded afterwards its output is a
    randomly initialised projection of the encoder's first-token hidden state.

    Args:
        model_path: Location passed to ``XLMRobertaModel.from_pretrained``.
        tokenizer_path: Location passed to ``XLMRobertaTokenizer.from_pretrained``.
        verbose: When True (the default, matching the previous behaviour),
            ``forward`` prints the raw model outputs, the first-token vector
            and the pooled vector. Set to False to silence these dumps.
    """

    def __init__(
        self,
        model_path="D:/LLMs/xlm_roberta_large",
        tokenizer_path="xlm-roberta-large",
        verbose=True,
    ):
        super().__init__()

        # Pretrained XLM-RoBERTa Model
        self.model = XLMRobertaModel.from_pretrained(model_path)

        # Tokenizer
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_path)

        # Controls the debug printing done in forward().
        self.verbose = verbose

        # Custom Pooler Layer. Its width follows the loaded checkpoint instead
        # of a hard-coded 1024, so other model sizes work too.
        hidden_size = self.model.config.hidden_size
        self.pooler = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh()
        )

    def _debug(self, label, value):
        """Print ``value`` under ``label`` followed by a blank line, if verbose."""
        if self.verbose:
            print(f">>> {label}: {value}")
            print("")

    def forward(self, input_ids, attention_mask):
        """Run the encoder and pool the hidden state at sequence position 0.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The pooler output for ``last_hidden_state[:, 0, :]``.
        """
        # Get hidden states from XLM-RoBERTa
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        self._debug("outputs", outputs)

        # Extract the representation of the first token of every sequence
        cls_output = outputs.last_hidden_state[:, 0, :]
        self._debug("cls_output", cls_output)

        # Apply Pooler
        pooled_output = self.pooler(cls_output)
        self._debug("pooled_output", pooled_output)

        return pooled_output

    def tokenize(self, text):
        """Tokenize ``text`` into PyTorch tensors.

        Padding and truncation are enabled with ``max_length=512``.

        Returns:
            A ``(input_ids, attention_mask)`` tuple.
        """
        encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        return encoding.input_ids, encoding.attention_mask

    def encode(self, text):
        """Tokenize ``text`` and return its pooled embedding without tracking gradients."""
        input_ids, attention_mask = self.tokenize(text)

        # The tokenizer output is not placed on the module's device for us;
        # move it there so encode() keeps working after e.g. model.to("cuda").
        first_param = next(self.parameters(), None)
        if first_param is not None:
            input_ids = input_ids.to(first_param.device)
            attention_mask = attention_mask.to(first_param.device)

        with torch.no_grad():
            return self.forward(input_ids, attention_mask)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Example usage: build the wrapper (its __init__ loads the XLM-RoBERTa weights
# and the tokenizer) and show the module tree as the cell output.
model = BGE_M3_Model()
model


BGE_M3_Model(
  (model): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [3]:
# Encode one English sentence and check the shape and type of the embedding.
text1 = "The capital of japan is Tokyo"
embedding1 = model.encode(text1)
print(embedding1.shape, type(embedding1))  # Should print torch.Size([1, 1024]) <class 'torch.Tensor'>

>>> outputs: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0364, -0.0348,  0.1285,  ..., -0.0790, -0.0497, -0.0362],
         [-0.2560, -0.0290,  0.1498,  ...,  0.1685, -0.3806, -0.4544],
         [ 0.0447, -0.0274,  0.1715,  ..., -0.0855, -0.3198,  0.2792],
         ...,
         [-0.0324, -0.1836,  0.2288,  ..., -0.2997,  0.1543, -0.1152],
         [ 0.3107,  0.0774,  0.0876,  ..., -0.1311, -0.1208,  0.0194],
         [ 0.3206, -0.0727,  0.1938,  ..., -0.0127,  0.0127,  0.0777]]]), pooler_output=tensor([[-0.7686, -0.5393,  0.5043,  ..., -0.6207,  0.3355,  0.1786]]), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

>>> cls_output: tensor([[-0.0364, -0.0348,  0.1285,  ..., -0.0790, -0.0497, -0.0362]])

>>> pooled_output: tensor([[-0.4496,  0.7625, -0.4266,  ...,  0.2662,  0.0914,  0.4274]])

torch.Size([1, 1024]) <class 'torch.Tensor'>


In [4]:
# Display the embedding tensor computed for text1.
embedding1

tensor([[-0.4496,  0.7625, -0.4266,  ...,  0.2662,  0.0914,  0.4274]])

In [5]:
# Encode the Korean sentence meaning "The capital of Japan is Tokyo." and
# check the shape and type of the embedding.
text2 = "일본의 수도는 도쿄입니다."
embedding2 = model.encode(text2)
print(embedding2.shape, type(embedding2))

>>> outputs: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0352, -0.0268,  0.0817,  ..., -0.0737, -0.0758, -0.0591],
         [ 0.2248, -0.2111, -0.0426,  ..., -0.0781,  0.0366,  0.0225],
         [ 0.0754,  0.0504,  0.1091,  ..., -0.1356,  0.1786, -0.1975],
         ...,
         [ 0.0359, -0.1343, -0.0195,  ..., -0.0056, -0.0579,  0.0591],
         [ 0.0249, -0.0988, -0.0211,  ..., -0.0679,  0.0269, -0.0064],
         [ 0.2619, -0.1069,  0.1486,  ..., -0.0265,  0.0730,  0.1024]]]), pooler_output=tensor([[-0.7595, -0.5494,  0.5127,  ..., -0.6208,  0.3494,  0.1805]]), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

>>> cls_output: tensor([[-0.0352, -0.0268,  0.0817,  ..., -0.0737, -0.0758, -0.0591]])

>>> pooled_output: tensor([[-0.4414,  0.7664, -0.4240,  ...,  0.2640,  0.0903,  0.4524]])

torch.Size([1, 1024]) <class 'torch.Tensor'>


In [6]:
def normalized_dot_product(tensor1, tensor2, eps=1e-12):
    """Cosine similarity between the vectors of two floating-point tensors.

    Vectors lie along the last dimension. For 2-D inputs of shape ``(N, D)``
    and ``(M, D)`` the result is the ``(N, M)`` matrix of pairwise cosine
    similarities; 1-D inputs are treated as single vectors.

    Args:
        tensor1: Tensor whose last dimension holds the vector components.
        tensor2: Tensor whose last dimension holds the vector components.
        eps: Lower bound applied to each norm so that an all-zero vector
            yields a similarity of 0 instead of NaN.

    Returns:
        Tensor of cosine similarities.
    """
    # Magnitude of every vector. The norm is taken per vector (dim=-1);
    # a single norm over the whole tensor would mix the rows of a batch.
    norm1 = torch.linalg.vector_norm(tensor1, dim=-1, keepdim=True)
    norm2 = torch.linalg.vector_norm(tensor2, dim=-1, keepdim=True)

    # Scale each vector to unit length, guarding against division by zero.
    unit1 = tensor1 / norm1.clamp_min(eps)
    unit2 = tensor2 / norm2.clamp_min(eps)

    # Dot products of the unit vectors. Only matrices need transposing;
    # ``.T`` on a 1-D tensor is deprecated.
    if unit2.dim() >= 2:
        unit2 = unit2.transpose(-2, -1)
    return torch.matmul(unit1, unit2)

In [7]:
# Similarity score between embedding1 and embedding2 (the English and Korean
# sentences encoded in the cells above); shown as the cell output.
result = normalized_dot_product(embedding1, embedding2)
result


tensor([[0.9992]])

In [8]:
# Repeat the comparison with two different English sentences.
# This rebinds text1, text2, embedding1, embedding2 and result from the
# earlier cells, so those earlier values are no longer available afterwards.
# NOTE(review): the recorded score for this pair (0.9995) is about the same as
# for the translation pair (0.9992), so the score does not appear to separate
# them -- presumably because the pooler is untrained; confirm.
text1 = "I like him"
text2 = "we love you"

embedding1 = model.encode(text1)
embedding2 = model.encode(text2)

result = normalized_dot_product(embedding1, embedding2)
result

>>> outputs: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0351,  0.0265,  0.0827,  ..., -0.0370, -0.0397, -0.0061],
         [-0.0692, -0.3028,  0.0670,  ..., -0.3442,  0.0717,  0.0212],
         [ 0.0102, -0.2053,  0.0080,  ...,  0.0345,  0.2458, -0.0304],
         [ 0.0167, -0.1853, -0.0502,  ...,  0.0148,  0.1555,  0.0926],
         [ 0.0952, -0.0982,  0.1613,  ...,  0.0381,  0.0117,  0.1077]]]), pooler_output=tensor([[-0.7648, -0.5135,  0.4977,  ..., -0.6362,  0.3577,  0.1871]]), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

>>> cls_output: tensor([[-0.0351,  0.0265,  0.0827,  ..., -0.0370, -0.0397, -0.0061]])

>>> pooled_output: tensor([[-0.4699,  0.7829, -0.4168,  ...,  0.2519,  0.0724,  0.4455]])

>>> outputs: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0529, -0.0350,  0.1240,  ..., -0.0478, -0.0526,  0.0187],
         [-0.0719, -0.1806,  0.0343,  ..., -0.0123,  0.1048, -0.1020],


tensor([[0.9995]])