In [2]:
import os

import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import chebyshev, correlation
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
from sklearn.metrics.pairwise import rbf_kernel
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel


In [3]:
# Read the parsed movie table (including its stored embedding columns) from CSV.
combined = pd.read_csv(filepath_or_buffer='data/combined_parsing_with_embeddings.csv')
# Preview the first ten rows as the cell's rich output.
combined.head(n=10)

(5516, 6)

In [6]:
# Smoke test: load rubert-tiny2 through sentence-transformers and embed the
# description of the row labelled 0.
model = SentenceTransformer('cointegrated/rubert-tiny2')
# encode() is given a one-element list, so the printed result is a 2-D array
# holding a single embedding row.
sentences = [combined.at[0, 'description']]
embeddings = model.encode(sentences)
print(embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[ 1.44859236e-02  6.38538450e-02  4.40493114e-02 -7.01796860e-02
  -4.24850620e-02  2.11618766e-02  6.36292249e-02 -3.58819515e-02
   3.98085639e-02  1.13250501e-01  7.47837424e-02  2.86929291e-02
   6.36755303e-02  4.07855324e-02  3.91280204e-02 -2.62853969e-03
  -8.83563142e-03 -1.54918721e-02 -4.73152138e-02 -3.93175222e-02
   7.96613935e-03  1.29092447e-02  5.54236397e-02 -2.26539504e-02
   1.02837533e-01 -2.31180452e-02  6.28630072e-02  1.49335964e-02
   3.96468351e-03  1.00738697e-01 -4.27448750e-02  8.24392885e-02
  -4.20459509e-02 -1.13332696e-01 -7.92105682e-03  1.94631275e-02
   2.79171858e-03  7.91716762e-03 -3.34213376e-02 -3.20910066e-02
  -2.92678899e-03  2.42600650e-01  6.39756620e-02  2.20747255e-02
  -4.62388471e-02  2.90931147e-02 -4.15609926e-02 -9.08209104e-03
  -1.21900998e-02 -2.83281691e-03 -6.35129213e-02  7.30746007e-03
   3.56563069e-02 -2.25935597e-02  6.41763657e-02 -3.64960684e-03
  -3.34842838e-02 -1.59060173e-02  2.08748262e-02 -1.68171152e-03
   2.43257

In [8]:
# Convert the comma-separated embedding strings back into float32 torch tensors.
# Cells that already hold a tensor are passed through unchanged, so re-running
# this cell on a live kernel is a no-op instead of failing when `.split(',')`
# is applied to a tensor.
combined[['emb_msdisbert','emb_rubert']] = combined[['emb_msdisbert','emb_rubert']].map(
    lambda x: x if isinstance(x, torch.Tensor)
    else torch.tensor([float(v) for v in x.split(',')], dtype=torch.float32)
)

In [10]:
# rubert-tiny2: raw Hugging Face tokenizer + encoder, used by embed_bert_cls.
rb_tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
rb_model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
# rb_model.cuda()  # uncomment it if you have a GPU

# MS MARCO DistilBERT sentence encoder.
# SECURITY: the Hugging Face access token must never be committed with the
# notebook. It is read from the HF_TOKEN environment variable; when the
# variable is unset, `token` is None and the download is anonymous.
# NOTE(review): the token that was previously hard-coded here should be
# treated as leaked and revoked.
distmodel = SentenceTransformer(
    'sentence-transformers/msmarco-distilbert-base-v3',
    token=os.environ.get('HF_TOKEN'),
)

def embed_bert_cls(text, model, tokenizer):
    """Embed ``text`` as the L2-normalised first-token vector of ``model``.

    Args:
        text: Input passed straight to ``tokenizer`` (with padding and
            truncation enabled, PyTorch tensors requested).
        model: Callable encoder exposing ``.device`` and returning an object
            with a ``last_hidden_state`` tensor indexed as
            ``[batch, position, hidden]``.
        tokenizer: Callable returning a mapping of input name -> tensor.

    Returns:
        A NumPy array holding the normalised position-0 vector of the FIRST
        item in the batch only; any further batch items are discarded.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Inputs must live on the same device as the model weights.
    model_inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**model_inputs)
    # Position 0 of the last hidden layer for every batch item.
    cls_vectors = output.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(cls_vectors)
    first_vector = unit_vectors[0]
    return first_vector.cpu().numpy()

In [49]:
# Free-text search query (Russian: "A film about the theft of paintings and an artist").
query = 'Фильм о краже картин и художнике'

# Encode the same query once per model so each vector can be compared with the
# matching stored embedding column.
# NOTE(review): the msmarco-distilbert checkpoint is presumably English-centric;
# confirm it is meaningful for a Russian query.
msdist_q = distmodel.encode(query)
rubert_q = embed_bert_cls(text=query, model=rb_model, tokenizer=rb_tokenizer)

In [50]:
# Score every movie against the query with ONE vectorised cosine_similarity
# call per model, rather than one scikit-learn call (plus input validation)
# per row. Stacking the per-row vectors gives an (n_rows, dim) matrix; comparing
# it with the (1, dim) query yields an (n_rows, 1) array whose only column is
# the score, in the same row order as `combined`.
# Assumes every stored embedding in a column is a 1-D vector of equal length.
rubert_matrix = np.stack([np.asarray(emb) for emb in combined['emb_rubert']])
combined['rubert_sim'] = cosine_similarity(rubert_matrix, rubert_q.reshape(1, -1))[:, 0]

msdist_matrix = np.stack([np.asarray(emb) for emb in combined['emb_msdisbert']])
combined['dist_sim'] = cosine_similarity(msdist_matrix, msdist_q.reshape(1, -1))[:, 0]

In [51]:
# Print the ten best-scoring titles for each similarity column, highest first.
for model_type in ('rubert_sim', 'dist_sim'):
    print(f"Top 10 Similar Movies by {model_type}:")
    ranked = combined.sort_values(by=[model_type], ascending=False)
    best_ten = ranked.head(10)
    for _idx, row in best_ten.iterrows():
        title = row['movie_title']
        score = row[model_type]
        print(f"Title: {title} - Similarity: {score:.4f}")
    # Blank line between the two rankings.
    print()

Top 10 Similar Movies by rubert_sim:
Title: Вечный свет - Similarity: 0.6936
Title: Искусство по понятиям - Similarity: 0.6918
Title: Рай для дурака - Similarity: 0.6874
Title: Пять процентов - Similarity: 0.6562
Title: Главная роль - Similarity: 0.6550
Title: Ван Гог. С любовью, Винсент - Similarity: 0.6546
Title: Фото на память - Similarity: 0.6451
Title: Грех - Similarity: 0.6450
Title: Искусный вор (2023) - Similarity: 0.6440
Title: Даааааали! - Similarity: 0.6428

Top 10 Similar Movies by dist_sim:
Title: Чувства Анны - Similarity: 0.9406
Title: Вычитание - Similarity: 0.9405
Title: Вихрь - Similarity: 0.9387
Title: Керосин - Similarity: 0.9383
Title: Ужасающий - Similarity: 0.9315
Title: Игра на выживание - Similarity: 0.9307
Title: У нас привидение! - Similarity: 0.9301
Title: Первый снег - Similarity: 0.9283
Title: Родители строгого режима - Similarity: 0.9277
Title: Солдат - Similarity: 0.9269

