In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Sentences we want sentence embeddings for
sentences = ["样例数据-1", "样例数据-2"]

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5')
model.eval()

# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
# for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    # Perform pooling. In this case, cls pooling.
    sentence_embeddings = model_output[0][:, 0]
# normalize embeddings
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)


  from .autonotebook import tqdm as notebook_tqdm
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence embeddings: tensor([[ 0.0015,  0.0165, -0.0281,  ..., -0.0309,  0.0297, -0.0327],
        [ 0.0151,  0.0041, -0.0157,  ..., -0.0281,  0.0408, -0.0251]])


In [5]:
sentence_embeddings[0]


tensor([ 0.0015,  0.0165, -0.0281,  ..., -0.0309,  0.0297, -0.0327])

In [6]:
sentence_embeddings[1]

tensor([ 0.0151,  0.0041, -0.0157,  ..., -0.0281,  0.0408, -0.0251])

In [7]:
sentence_embeddings[0] @ sentence_embeddings[1]

tensor(0.8792)

In [None]:
sentences = [
    """
    \\begin{algorithm}[h]
        \\caption{$(3+\\epsilon)$-Approximation for Minimum-Norm Capacitated $k$-Clustering}
        \\label{algo:MNCkC}
        \\begin{algorithmic}[1]
            \\State define set $\\bfT$ of non-negative reals with $|\\bfT| \\leq O\\left(\\frac{\\log n}{\\epsilon}\\right)$, as described in text \\label{step:MNCkC-T}
            \\label{step:MNCkC-bfT}
            \\For{every $t \\in \\bfT$} %\\Comment{for each $t$, construct a top-$t$ norm capacitated $k$-clustering instance}
                \\State with objective $h(v) := \\sum_{j \\in C}(v_j - t)^+$ for every $v \\in \\R_{\\geq 0}^C$, use Theorem~\\ref{thm:pseudo-approx} to obtain a set $\\calS^t$ of $O\\left(\\frac{k \\ln n}{\\epsilon}\\right)$ valid stars \\label{step:MNCkC-calS}
                \\State $S^t \\gets \\{i \\in F: \\exists J, (i, J) \\in \\calS^t\\}$ \\label{step:MNCkC-S}
            \\EndFor
            %\\State $S \\leftarrow \\cup_{t \\in \\bfT} S^t, R \\leftarrow \\cup_{t \\in \\bfT} R^t$
            \\State randomly choose a color function $\\mathtt{color}: F \\to [k]$ \\label{step:MNCkC-color}
            \\State guess the types of each color $c \\in [k]$ \\Comment{each color is of type-1, 2 or 3} \\label{step:MNCkC-type}
            \\For{every $t \\in \\bfT$}
            $R^t \\leftarrow \\texttt{MNCkC-choose-R}(t)$ \\label{step:MNCkC-R} \\Comment{clients in $R^t$ are called \\emph{representatives}}
            \\EndFor
            \\State $S \\leftarrow \\union_{t \\in \\bfT} S^t, R \\leftarrow \\union_{t \\in \\bfT} R^t$ \\label{step:MNCkC-merge}
            \\State guess a pivot $p_c$ for each type-1 or 2 color $c \\in [k]$, and $i^*_c$ for each type-3 color $c \\in [k]$ such that \\label{step:MNCkC-pivots}
            \\Statex \\Comment{$p_c$'s for type-1 and 2 colors $c$, $i^*_c$'s for all colors $c \\in [k]$ are defined in text}
            \\begin{itemize}
                \\item $p_c \\in R$ if $c$ is of type-1, $p_c \\in S$ if $c$ is of type-2, and
                \\item $i^*_c \\in S$ has color $c$ for a type-3 color $c$.
            \\end{itemize}
            \\State guess a $(1+\\epsilon)$-approximate overestimation $r_c$ for $d(i^*_c, p_c)$ for every type-1 or 2 color $c$ \\label{step:MNCkC-radius}
            \\State \\Return $\\texttt{MNCkC-clustering-with-pivots}()$ \\Comment{See Algorithm \\ref{algo:MNCkC-clustering-with-pivots} for its definition} \\label{step:MNCkC-return}
        \\end{algorithmic}
    \\end{algorithm}
    """,
    """
    \\begin{algorithm}[H]
    \\caption{In-Place N-Dimensional Sub-Tensor Reversal}
    \\label{alg:reverse}
    \\begin{algorithmic}[1]
    \\State \\textbf{function} ReverseND(Tensor $T$, StartIndices $\\mathbf{s}$, EndIndices $\\mathbf{e}$)
    \\State \\quad $n \\gets \\text{dimensionality of } T$
    \\State \\quad \\text{Initialize current index } $\\mathbf{i} \\gets \\mathbf{s}$
    \\State \\quad \\textbf{loop}
    \\State \\quad \\quad \\text{Calculate mirror index } $\\mathbf{j} \\gets \\mathbf{s} + \\mathbf{e} - \\mathbf{i}$ \\Comment{Component-wise operation}
    \\State \\quad \\quad \\textbf{if} $\\mathbf{i}$ is lexicographically $\\ge$ $\\mathbf{j}$ \\textbf{then}
    \\State \\quad \\quad \\quad \\textbf{break} \\Comment{All pairs have been swapped}
    \\State \\quad \\quad \\textbf{end if}
    \\State \\quad \\quad swap($T[\\mathbf{i}], T[\\mathbf{j}]$)
    \\State \\quad \\quad \\Comment{Increment index $\\mathbf{i}$ to the next position}
    \\State \\quad \\quad $d \\gets n - 1$
    \\State \\quad \\quad \\textbf{while} $d \\ge 0$ \\textbf{do}
    \\State \\quad \\quad \\quad \\textbf{if} $i_d < e_d$ \\textbf{then}
    \\State \\quad \\quad \\quad \\quad $i_d \\gets i_d + 1$
    \\State \\quad \\quad \\quad \\quad \\textbf{break}
    \\State \\quad \\quad \\quad \\textbf{else}
    \\State \\quad \\quad \\quad \\quad $i_d \\gets s_d$
    \\State \\quad \\quad \\quad \\quad $d \\gets d - 1$
    \\State \\quad \\quad \\quad \\textbf{end if}
    \\State \\quad \\quad \\textbf{end while}
    \\State \\quad \\quad \\textbf{if} $d < 0$ \\textbf{then break} \\Comment{All indices exhausted}
    \\State \\quad \\textbf{end loop}
    \\State \\textbf{end function}
    \\end{algorithmic}
    \\end{algorithm}
    """
 ]

# Process each sentence and collect embeddings
all_embeddings = []
for sentence in sentences:
    embedding = get_embeddings_for_long_text(sentence, tokenizer, model)
    all_embeddings.append(embedding)

# Stack and normalize
sentence_embeddings = torch.stack(all_embeddings)
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print("Sentence embeddings:", sentence_embeddings)

Sentence embeddings: tensor([[[ 1.,  1.,  1.,  ...,  1.,  1., -1.]],

        [[ 1.,  1.,  1.,  ...,  1.,  1., -1.]]])


In [None]:
sentence_embeddings[0] @ sentence_embeddings[1]

tensor([[522.]])