In [1]:
import faiss
import numpy as np
import pandas as pd
import ruclip
import torch
from tqdm import tqdm

In [2]:
# Load the book table and map every "missing value" marker to an empty string

items = (
    pd.read_csv('ds/items.csv', sep=';', index_col='sys_numb')
    .replace({'отсутствует': '', 'none': '', '[б. и.]': '', np.nan: ''})
)

In [3]:
# Build a dictionary {book id: text description} that is later fed to the
# embedding model. The description is "title author year bbk".

items_data = {}
for sys_numb, item in tqdm(items.iterrows(), total=len(items)):
    # Books without a title are skipped
    if item['title'] == '':
        continue
    item_data = ' '.join([
        item['title'],
        item['author'].replace(', ', ' '),
        item['year_izd'],
        item['bbk'].replace('\n', ' '),
    ])
    # Drop '..' sequences, then collapse every whitespace run to one space.
    # A single replace('  ', ' ') pass is not enough: empty author and year
    # fields produce three consecutive spaces after the join.
    item_data = ' '.join(item_data.replace('..', '').split())
    items_data[sys_numb] = item_data

# Parallel lists: item_titles[i] is the description of item_ids[i]
item_ids = list(items_data.keys())
item_titles = list(items_data.values())

354355it [00:12, 29064.77it/s]


In [4]:
# Compute book embeddings with the RuCLIP model

# Sized for 16 GB of VRAM; lower it on GPUs with less video memory
batch_size = 4096
device = "cuda"
clip, processor = ruclip.load('ruclip-vit-base-patch32-224', device=device)
predictor = ruclip.Predictor(clip, processor, device, bs=batch_size)

def tokenize_titles_batch(titles_batch):
    """Return the text latents of ``titles_batch`` as a NumPy array.

    The batch is passed to the module-level ``predictor`` with gradient
    tracking disabled; the result is moved to the CPU and converted with
    ``.numpy()``.
    """
    with torch.no_grad():
        latents = predictor.get_text_latents(titles_batch)
    host_latents = latents.cpu()
    return host_latents.numpy()


# Embed the descriptions batch by batch. Stepping over start offsets never
# produces an empty trailing batch (the previous `len // batch_size + 1`
# did when the number of titles was an exact multiple of batch_size).
embedding_batches = []
for batch_start in tqdm(range(0, len(item_titles), batch_size)):
    titles_batch = item_titles[batch_start:batch_start + batch_size]
    embedding_batches.append(tokenize_titles_batch(titles_batch))

# Concatenate once at the end instead of re-copying the growing array
# on every iteration.
item_embeddings = np.concatenate(embedding_batches, axis=0)

# Row i of item_embeddings belongs to item_ids[i]
embeddings_dict = dict(zip(item_ids, item_embeddings))
# Position -> book id lookup; the order matches the insertion order of embeddings_dict
items_ids = list(embeddings_dict.keys())

100% 80/80 [14:46<00:00, 11.09s/it]


In [5]:
# Persist the embeddings dictionary to disk
import pickle

embeddings_dump_path = 'item_embeddings_dump11.pkl'
with open(embeddings_dump_path, mode='wb') as dump_file:
    pickle.dump(embeddings_dict, dump_file)

In [6]:
# Build a FAISS index over the book embeddings.
# Row order follows embeddings_dict insertion order.
items_embeddings = np.asarray(list(embeddings_dict.values()), dtype=np.float32)

# Take the dimensionality from the data instead of hard-coding 512, so the
# cell keeps working if a model with a different latent size is loaded.
embedding_dim = items_embeddings.shape[1]

# Exact (Flat) inner-product search.
# NOTE(review): inner product equals cosine similarity only for L2-normalised
# vectors — confirm that predictor.get_text_latents returns normalised latents.
faiss_index = faiss.index_factory(embedding_dim, 'Flat', faiss.METRIC_INNER_PRODUCT)
res = faiss.StandardGpuResources()
gpu_faiss_index = faiss.index_cpu_to_gpu(res, 0, faiss_index)
gpu_faiss_index.add(items_embeddings)

In [7]:
# Find the 20 nearest neighbours of every book that occurs in the transactions.
# Result format: recommendations[book_id] = [{neighbour_id: similarity}, ...],
# ordered as returned by the index search.
transactions = pd.read_csv('ds/train_transactions_extended.csv', sep=';', index_col=None, usecols=['sys_numb'])
transactions = transactions['sys_numb'].tolist()

neighbours_count = 20
# Local name on purpose: the embedding cell's `batch_size` and `item_ids`
# must not be overwritten, otherwise re-running that cell gives wrong results.
search_batch_size = 512

# Each book is searched once (order of first appearance is kept), and books
# without an embedding are dropped up front so no batch is ever empty.
query_ids = [book_id for book_id in dict.fromkeys(transactions) if book_id in embeddings_dict]

recommendations = {}
for batch_start in tqdm(range(0, len(query_ids), search_batch_size)):
    batch_ids = query_ids[batch_start:batch_start + search_batch_size]
    batch_embeddings = np.array([embeddings_dict[book_id] for book_id in batch_ids], dtype=np.float32)
    # Ask for one extra result because the query book itself is in the index
    distances, indexes = gpu_faiss_index.search(batch_embeddings, neighbours_count + 1)
    for row, item_id in enumerate(batch_ids):
        neighbours = [
            {items_ids[index]: distance}
            for index, distance in zip(indexes[row], distances[row])
            # -1 is the padding FAISS returns when fewer than k results exist.
            # The query book is removed by id, not by position: with tied
            # scores (identical descriptions) it is not guaranteed to be first.
            if index != -1 and items_ids[index] != item_id
        ]
        recommendations[item_id] = neighbours[:neighbours_count]

100% 507/507 [00:08<00:00, 61.16it/s]


In [8]:
# Build per-user recommendations from the top-20 nearest neighbours of the
# books each user has read. A candidate's score is its highest similarity to
# any book the user has read; at most 20 candidates are kept per user.

transactions = pd.read_csv(
    'ds/train_transactions_extended.csv',
    sep=';',
    index_col=None,
    usecols=['chb', 'sys_numb', 'date_1']
)

recommendations_per_user = 20
final_recommendations = []

# groupby splits the table once instead of filtering the whole frame per user;
# sort=False keeps users in order of first appearance (deterministic).
for user, user_transactions in tqdm(transactions.groupby('chb', sort=False)):
    # NOTE(review): date_1 is compared as read by pandas (no date parsing here);
    # confirm its format sorts chronologically. The order only affects tie-breaks
    # between candidates with equal similarity.
    recent_first = user_transactions.sort_values('date_1', ascending=False)

    # Books the user already read. Must come from 'sys_numb' (book ids):
    # using 'chb' here put user ids into the set and nothing was filtered out.
    user_readed_books = set(recent_first['sys_numb'])

    user_recommendations = {}
    for read_book in recent_first['sys_numb'].drop_duplicates():
        for neighbour in recommendations.get(read_book, []):
            # Each neighbour is a single-entry dict {book_id: similarity}
            candidate, similarity = next(iter(neighbour.items()))
            best_so_far = user_recommendations.get(candidate)
            if best_so_far is None or best_so_far < similarity:
                user_recommendations[candidate] = similarity

    ranked_candidates = [
        candidate
        for candidate, _ in sorted(
            user_recommendations.items(),
            key=lambda pair: pair[1],
            reverse=True
        )
        if candidate not in user_readed_books
    ]

    final_recommendations.extend(
        (user, candidate) for candidate in ranked_candidates[:recommendations_per_user]
    )

100% 16753/16753 [00:17<00:00, 945.27it/s] 


In [9]:
# Write the solution to a file
# Passing the column names to the constructor also works for an empty result
result = pd.DataFrame(final_recommendations, columns=['chb', 'sys_numb'])
# A stable sort keeps each user's recommendations in ranking order;
# the default quicksort gives no such guarantee.
result = result.sort_values(by='chb', kind='mergesort')
# `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the
# old spelling was removed in pandas 2.0.
result.to_csv('solutions/solution11.csv', index=False, sep=';', lineterminator='\n')