In [9]:
import os

import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import chebyshev, correlation
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
from sklearn.metrics.pairwise import rbf_kernel
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

In [7]:
# Read the scraped movie table; the two emb_* columns hold embeddings
# serialised as comma-separated strings (see the preview below).
EMBEDDINGS_CSV = 'data/combined_parsing_with_embeddings.csv'
combined = pd.read_csv(EMBEDDINGS_CSV)
combined.head(n=10)

Unnamed: 0,page_url,image_url,movie_title,description,emb_msdisbert,emb_rubert
0,https://kinogo.online/filmy/102112-hanna-v-igr...,kinogo.online/uploads/mini/fullstory/75/d88deb...,Ханна. В игре (2024),"Ханна – молодая девушка, страдающая от агорафо...","-0.46084476,-0.16368349,0.07980391,-0.6657749,...","0.014485924,0.063853845,0.04404931,-0.07017968..."
1,https://kinogo.online/filmy/102111-bajkery.html,kinogo.online/uploads/mini/fullstory/f5/fdcada...,Байкеры (2023),"Остросюжетная картина, разворачивающаяся на Ср...","-0.5326852,-0.23060842,0.16051687,-0.56171346,...","0.04075556,-0.047768865,-0.023799775,-0.038850..."
2,https://kinogo.online/filmy/102021-jekzorcizm....,kinogo.online/uploads/mini/fullstory/f3/f419ff...,Экзорцизм (2024),"История об актере по имени Энтони, согласившем...","-0.61657906,-0.15813282,0.29164374,-0.5973491,...","-0.0682673,0.042816307,-0.06475405,-0.08741514..."
3,https://kinogo.online/filmy/102142-bolero-dush...,kinogo.online/uploads/mini/fullstory/30/43d33a...,Болеро. Душа Парижа (2024),События картины разворачиваются в Париже и зна...,"-0.5000128,-0.20215347,0.24276419,-0.63409877,...","0.02861084,0.010292943,-0.008946151,-0.0205987..."
4,https://kinogo.online/filmy/102271-zombi-v-seu...,kinogo.online/uploads/mini/fullstory/1c/c07ec9...,Зомби в Сеуле (2024),Фильм ужасов о молодой танцовщице по имени Си ...,"-0.5201405,-0.1288207,0.3313515,-0.6584862,0.6...","-0.030309511,0.03474685,-0.0334166,-0.07098175..."
5,https://kinogo.online/filmy/54528-kto-ugodno-k...,kinogo.online/uploads/mini/fullstory/4e/894333...,"Кто угодно, кроме тебя (2023)","Би – молодая девушка, всегда мечтавшая обзавес...","-0.41063425,-0.0830526,0.049538035,-0.74925107...","0.0847645,0.0651581,0.032940105,-0.034403432,-..."
6,https://kinogo.online/filmy/51097-kniga-reshen...,kinogo.online/uploads/mini/fullstory/8d/36364f...,Книга решений (2023),Фильм рассказывает о жизни необычного парня по...,"-0.5318866,-0.0229408,0.14009441,-0.60780185,0...","-0.018684108,0.0017345467,-0.025695119,-0.0441..."
7,https://kinogo.online/filmy/1970-vmesto-nas-dv...,kinogo.online/uploads/mini/fullstory/a5/fd307d...,Вместо нас двоих (2016),В небольшой деревушке молодая девушка по имени...,"-0.46826285,-0.054471053,-0.013140891,-0.59218...","0.00017356129,0.056439854,-0.008188479,-0.0607..."
8,https://kinogo.online/filmy/100588-nuzhdy-pute...,kinogo.online/uploads/mini/fullstory/a0/e268da...,Нужды путешественника (2024),"Ирис – молодая девушка, о которой никто ничего...","-0.581347,-0.14413577,0.09750812,-0.6048381,0....","0.09578059,0.051227942,0.0054312362,-0.0047080..."
9,https://kinogo.online/filmy/102188-svezhie-ubi...,kinogo.online/uploads/mini/fullstory/d8/37d0ae...,Свежие убийства (2023),Мы переносимся в Нью-Йорк 80-х годов. Франсин ...,"-0.5330533,-0.21125765,0.37169516,-0.6022329,0...","0.0633663,-0.03365965,0.008783014,-0.021746349..."


In [8]:
# Convert the comma-separated embedding strings back into float tensors.
# The cell overwrites the columns it reads, so values that are already tensors
# are passed through unchanged; this keeps the cell safe to re-run.
def _parse_embedding(value):
    """Turn an 'f1,f2,...' string into a 1-D FloatTensor; return tensors as-is.

    Anything that is neither a tensor nor a string (e.g. NaN for a missing
    cell) still raises on `.split`, exactly as before.
    """
    if isinstance(value, torch.Tensor):
        return value
    return torch.FloatTensor([float(part) for part in value.split(',')])

embedding_cols = ['emb_msdisbert', 'emb_rubert']
combined[embedding_cols] = combined[embedding_cols].map(_parse_embedding)

In [10]:
# Load the two encoders used to embed the search query.

# RuBERT-tiny2: tokenizer + bare transformer, consumed by embed_bert_cls.
rb_tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
rb_model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
# rb_model.cuda()  # uncomment it if you have a GPU

# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being embedded in the notebook. When the variable is unset,
# `token=None` is passed and the model is requested anonymously.
# NOTE(review): a literal token was previously hardcoded on this line; it
# should be revoked because it is part of the notebook's history.
distmodel = SentenceTransformer(
    'sentence-transformers/msmarco-distilbert-base-v3',
    token=os.environ.get('HF_TOKEN'),
)

def embed_bert_cls(text, model, tokenizer):
    """Embed ``text`` with ``model`` and return one unit-length vector.

    The text is tokenized (padded and truncated, PyTorch tensors), run through
    the model without gradient tracking, and the hidden state at token
    position 0 is taken as the sentence representation. That representation is
    normalised and the first row of the batch is returned.

    Args:
        text: input passed straight to ``tokenizer``.
        model: callable exposing ``.device`` whose output has ``last_hidden_state``.
        tokenizer: callable returning a mapping of tensors.

    Returns:
        A NumPy array holding the normalised embedding of the first batch item.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Place every tokenizer output on the same device as the model.
    on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**on_device)
    first_token_states = output.last_hidden_state[:, 0, :]
    unit_vectors = torch.nn.functional.normalize(first_token_states)
    # Only row 0 is returned, even if the batch holds more than one item.
    return unit_vectors[0].cpu().numpy()

In [49]:
# Free-text search request that the movie descriptions are ranked against.
query = 'Фильм о краже картин и художнике'

# Embed the same query once per encoder so it can be compared with the
# matching stored embedding column.
rubert_q = embed_bert_cls(text=query, model=rb_model, tokenizer=rb_tokenizer)
msdist_q = distmodel.encode(query)

In [50]:
# Score every movie against the query with one vectorised call per encoder,
# rather than one scikit-learn call (and its input validation) per row.
def _similarity_to_query(embeddings, query_vec):
    """Cosine similarity between each stored embedding and a single query vector.

    Args:
        embeddings: iterable of 1-D vectors of equal length (np.stack requires this).
        query_vec: 1-D array-like of the same length.

    Returns:
        1-D array with one similarity per embedding, in input order.
    """
    matrix = np.stack([np.asarray(vec) for vec in embeddings])
    return cosine_similarity(matrix, np.asarray(query_vec).reshape(1, -1))[:, 0]

combined['rubert_sim'] = _similarity_to_query(combined['emb_rubert'], rubert_q)
combined['dist_sim'] = _similarity_to_query(combined['emb_msdisbert'], msdist_q)

In [51]:
# Print the ten highest-scoring titles for each similarity column.
for model_type in ('rubert_sim', 'dist_sim'):
    print(f"Top 10 Similar Movies by {model_type}:")
    best_rows = combined.sort_values(by=[model_type], ascending=False).head(10)
    for title, score in zip(best_rows['movie_title'], best_rows[model_type]):
        print(f"Title: {title} - Similarity: {score:.4f}")
    print()

Top 10 Similar Movies by rubert_sim:
Title: Вечный свет - Similarity: 0.6936
Title: Искусство по понятиям - Similarity: 0.6918
Title: Рай для дурака - Similarity: 0.6874
Title: Пять процентов - Similarity: 0.6562
Title: Главная роль - Similarity: 0.6550
Title: Ван Гог. С любовью, Винсент - Similarity: 0.6546
Title: Фото на память - Similarity: 0.6451
Title: Грех - Similarity: 0.6450
Title: Искусный вор (2023) - Similarity: 0.6440
Title: Даааааали! - Similarity: 0.6428

Top 10 Similar Movies by dist_sim:
Title: Чувства Анны - Similarity: 0.9406
Title: Вычитание - Similarity: 0.9405
Title: Вихрь - Similarity: 0.9387
Title: Керосин - Similarity: 0.9383
Title: Ужасающий - Similarity: 0.9315
Title: Игра на выживание - Similarity: 0.9307
Title: У нас привидение! - Similarity: 0.9301
Title: Первый снег - Similarity: 0.9283
Title: Родители строгого режима - Similarity: 0.9277
Title: Солдат - Similarity: 0.9269

