In [1]:
# Load the `ai-forever/sbert_large_nlu_ru` checkpoint and its tokenizer.
from transformers import AutoTokenizer, AutoModel

# Single source of truth for the checkpoint id, so the tokenizer and the
# model weights cannot accidentally point at different repositories.
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)



tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

In [2]:
import pandas as pd
import torch
from datasets import Dataset
import datasets

In [None]:
# Read ../data/anekdots.csv; its first column is used as the DataFrame index.
df_articles = pd.read_csv(
    filepath_or_buffer='../data/anekdots.csv',
    index_col=0,
)

# Preview the first five rows.
df_articles.head(n=5)

In [None]:
# Pick one `text_clean` entry (key 300) to use as the worked example below.
# NOTE(review): whether 300 is treated as an index label or as a position
# depends on the dtype of the index read from the CSV — confirm, or make the
# intent explicit with .loc / .iloc.
text_example = df_articles['text_clean'][300]
text_example

In [None]:
# Compare the two tokenizer entry points on the example text:
# `encode` returns vocabulary ids, `tokenize` returns the token strings.
input_ids = tokenizer.encode(text=text_example)
tokens = tokenizer.tokenize(text=text_example)

print(len(input_ids), len(tokens))

# The id list is offset by one relative to `tokens` (position 0 of
# `input_ids` has no counterpart in `tokens`), so the first 10 tokens pair
# with ids 1..10 inclusive. The previous slice `[1:10]` held only 9 ids, so
# zip() silently dropped the 10th token.
# `tok_id` rather than `id`, to avoid shadowing the builtin in the kernel.
for tok, tok_id in zip(tokens[:10], input_ids[1:11]):
    print(tok_id, tok)

# Round-trip the ids back to text.
print(tokenizer.decode(token_ids=input_ids))

In [None]:
# Tokenize the example into PyTorch tensors, truncating to max_length=10.
encoded_input = tokenizer(
    text_example,
    padding=True,
    truncation=True,
    max_length=10,
    return_tensors='pt',
)

In [None]:
# Display the tokenizer output for the example text.
encoded_input

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, weighted by the attention mask.

    Args:
        model_output: indexable whose element 0 holds the token embeddings.
            Assumed to be (batch, tokens, hidden) — TODO confirm against the
            model used.
        attention_mask: mask that, after gaining a trailing dimension, is
            expanded to the embeddings' shape; positions with 0 are ignored.

    Returns:
        Tensor of masked sums divided by the per-row mask totals, with the
        divisor clamped to at least 1e-9 so an all-zero mask does not divide
        by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the hidden dimension so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / weight_total

In [None]:
# Print the loaded model's textual representation.
print(model)

In [None]:
# Run the model on the encoded example; no_grad disables gradient tracking
# for this forward pass.
with torch.no_grad():
    out = model(**encoded_input)

In [None]:
# Shape of the hidden state at sequence position 0 (the slice cls_pooling returns).
out.last_hidden_state[:,0,:].shape

In [None]:
# Shape of the mask-weighted mean over tokens, for comparison with the position-0 slice.
mean_pooling(out, encoded_input['attention_mask']).shape

In [None]:
# Convert the pandas DataFrame into a `datasets.Dataset` and display its summary.
anekdot_dataset = Dataset.from_pandas(df_articles)
anekdot_dataset

In [9]:
# Device that get_embeddings moves the tokenized inputs to.
# NOTE(review): no cell visible here moves `model` itself, so changing this
# value alone would put inputs and weights on different devices — confirm
# before switching away from 'cpu'.
device = 'cpu'

In [8]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: object exposing a ``last_hidden_state`` attribute that
            supports three-axis indexing.

    Returns:
        The ``[:, 0, :]`` slice of ``model_output.last_hidden_state``.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0, :]
    return first_position

In [7]:
def get_embeddings(text_list, max_length=128):
    """Embed text with the notebook's global `tokenizer` and `model`.

    Args:
        text_list: a string or list of strings, passed straight to the tokenizer.
        max_length: truncation limit handed to the tokenizer (default 128,
            the value that was previously hard-coded).

    Returns:
        The position-0 hidden state produced by `cls_pooling`, one row per
        input text, as a tensor on `device` that does not track gradients.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
    )
    # Move every tokenizer tensor to the configured device before the forward pass.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call records an autograd graph,
    # which callers here immediately throw away with .detach().
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Smoke-test the embedding helper on the first row of the dataset.
# Index the row first and then the column: `anekdot_dataset["text_clean"][0]`
# reads the entire column just to take its first element.
with torch.no_grad():
    embedding = get_embeddings(anekdot_dataset[0]["text_clean"])
embedding.shape

In [None]:
def _embed_batch(batch):
    """Embed one batch of rows for `Dataset.map(batched=True)`.

    `batch["text_clean"]` is a list of strings; the returned array has one
    embedding row per input text, in the same order.
    """
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        vectors = get_embeddings(batch["text_clean"])
    return {"embeddings": vectors.detach().cpu().numpy()}


# Embed in batches instead of one row (and one forward pass) at a time.
# NOTE: texts are padded to the longest item in their batch, so values may
# differ from single-row encoding at floating-point noise level.
embeddings_dataset = anekdot_dataset.map(_embed_batch, batched=True, batch_size=32)

In [3]:
# Load a previously saved dataset from disk and rebind `embeddings_dataset` to it.
# NOTE(review): no cell visible here writes '../data/embeddings_dataset' —
# presumably it was saved by an earlier run; confirm how it was produced.
embeddings_dataset = datasets.load_from_disk('../data/embeddings_dataset')

In [4]:
# Build a FAISS index over the `embeddings` column so nearest-neighbour
# lookups can be run against it.
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/498 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'text_clean', 'text_len', '__index_level_0__', 'embeddings'],
    num_rows: 497589
})

In [5]:
import html
import re

# define clean function
# add / remove any line if necessary
def clean(text):
    """Strip markup and symbol noise from ``text`` and normalise whitespace.

    Steps, in order: decode HTML entities, blank out ``<...>`` tags, reduce
    markdown links to their label, blank out remaining ``[...]`` spans, blank
    out standalone runs of special characters, collapse whitespace.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string, with runs of whitespace collapsed to one space
        and no leading or trailing whitespace.
    """
    # convert html escapes like &amp; to their plain-text representation
    text = html.unescape(text)

    # substitute tags like <tab> by spaces
    text = re.sub(r'<[^<>]*>', ' ', text)

    # substitute markdown URLs like [Some text](https://....) by their text
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)

    # substitute text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)

    # substitute standalone sequences of specials, matches &# but NOT #hashtag.
    # Lookarounds test the neighbouring whitespace without consuming it, so
    # adjacent runs such as "# #" are both removed; the consuming form
    # (?:^|\s)...(?:\s|$) used up the shared space and skipped every second run.
    text = re.sub(r'(?<!\S)[&#<>{}\[\]+|\\:-]+(?!\S)', ' ', text)

    # substitute standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?<!\S)[\-=\+]{2,}(?!\S)', ' ', text)

    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [26]:
question = """
Генерал видит часового на посту с открытым над головой зонтом.
— За всю свою долгую службу ни разу не видел солдата, который бы боялся дождя.
— А я и не боюсь, у меня зонт.
"""
# Embed the cleaned query text. no_grad: this is inference only.
# `.cpu()` keeps the NumPy conversion working if `device` is ever not the CPU,
# and matches how the dataset embeddings are converted.
with torch.no_grad():
    question_embedding = get_embeddings([clean(question)]).detach().cpu().numpy()
question_embedding.shape

(1, 1024)

In [27]:
# Look up the single closest stored example (k=1) to the query embedding.
scores, samples = embeddings_dataset.get_nearest_examples(
    index_name="embeddings",
    query=question_embedding,
    k=1,
)

In [28]:
import pandas as pd

# Tabulate the retrieved rows together with their scores, lowest score first.
samples_df = pd.DataFrame.from_dict(samples)
samples_df = samples_df.assign(scores=scores).sort_values("scores", ascending=True)

In [29]:
# Show the text of the top-ranked row. `.iloc[0]` is positional, so it always
# picks the first row after sorting; the label lookup `[0]` returns whichever
# row carried label 0 before the sort.
print(samples_df["text"].iloc[0])

Идут по пляжу сержант и рядовой. Рядовой:
– Товарищ сержант, не правда ли, вон у той девушки очень красивые ноги?
– Не знаю, я еще не видел, как она марширует.


In [35]:
# Score of the first neighbour returned by the index lookup.
scores[0]

151.87032