In [1]:
import numpy as np
import pandas as pd

import torch

import ruclip
import faiss

from tqdm import tqdm

In [2]:
# Load the book catalogue (indexed by 'sys_numb') and normalise every
# "missing value" marker, including NaN, to an empty string.
items = (
    pd.read_csv('ds/items.csv', sep=';', index_col='sys_numb')
    .replace(dict.fromkeys(['отсутствует', 'none', '[б. и.]', np.nan], ''))
)

In [3]:
# Build a dictionary {book id: text description}; the texts are embedded later.
# NOTE: the publisher column ('izd') was tried here at some point and is
# currently not part of the text.

items_data = {}
for sys_numb, item in tqdm(items.iterrows(), total=len(items)):
    if item['title'] == '':
        # Books without a title get no description and therefore no embedding.
        continue
    item_data = ' '.join([
        item['title'],
        item['author'].replace(', ', ' '),
        item['year_izd'],
        ' '.join(item['bbk'].split('\n')),
    ])
    # Remove '..' sequences, then collapse every run of whitespace to a single
    # space. A single `.replace('  ', ' ')` pass leaves double spaces behind
    # whenever the author/year fields are empty or '..' was removed between
    # two spaces; split/join also strips leading and trailing whitespace.
    item_data = ' '.join(item_data.replace('..', '').split())
    items_data[sys_numb] = item_data

# Parallel lists: item_ids[i] is the id of the book described by item_titles[i].
item_ids = list(items_data.keys())
item_titles = list(items_data.values())

354355it [00:12, 28158.51it/s]


In [4]:
# Load the RuCLIP model and wrap it in a predictor used to embed the book texts.

batch_size = 4096
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing at load time on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip, processor = ruclip.load('ruclip-vit-base-patch32-224', device=device)
predictor = ruclip.Predictor(clip, processor, device, bs=batch_size)

def tokenize_titles_batch(titles_batch):
    """Embed a batch of book texts with the module-level ``predictor``.

    Despite the name, the function returns the text latents produced by
    ``predictor.get_text_latents`` rather than tokens.

    Args:
        titles_batch: texts to embed; passed to the predictor unchanged.

    Returns:
        The latents moved to the CPU and converted to a NumPy array.
    """
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        return predictor.get_text_latents(titles_batch).cpu().numpy()

# Embed all book texts batch by batch and map every book id to its embedding.

# item_ids and item_titles must be the parallel lists built from items_data;
# fail loudly if one of them was rebound in between (e.g. by re-running cells
# out of order) instead of silently pairing ids with the wrong embeddings.
assert len(item_ids) == len(item_titles), 'item_ids and item_titles are out of sync'

# Stepping by batch_size never yields an empty trailing batch, unlike
# `len // batch_size + 1`, which does so when the length is an exact multiple.
# The batches are collected and concatenated once; concatenating inside the
# loop re-copies everything accumulated so far on every iteration.
batches = []
for batch_start in tqdm(range(0, len(item_titles), batch_size)):
    batches.append(tokenize_titles_batch(item_titles[batch_start:batch_start + batch_size]))
item_embeddings = np.concatenate(batches, axis=0)

embeddings_dict = dict(zip(item_ids, item_embeddings))
# Position -> book id lookup, in the insertion order of embeddings_dict.
items_ids = list(embeddings_dict.keys())

100% 80/80 [14:43<00:00, 11.05s/it]


In [5]:
# Save the {book id: embedding} mapping to disk with pickle.
import pickle

with open('item_embeddings_dump1p.pkl', mode='wb') as dump_file:
    pickle.dump(embeddings_dict, dump_file)

In [6]:
# Build a FAISS 'Flat' inner-product index over the book embeddings and save it.
# Rows are added in the insertion order of embeddings_dict, so a row position
# returned by a search corresponds to the key at the same position in that dict.
# NOTE(review): inner product equals cosine similarity only for L2-normalised
# vectors - presumably the predictor returns normalised latents; confirm.
items_embeddings = np.array(list(embeddings_dict.values())).astype(np.float32)
# Take the vector size from the data instead of hard-coding 512, so the cell
# keeps working if a model with a different embedding size is loaded.
faiss_index = faiss.index_factory(items_embeddings.shape[1], 'Flat', faiss.METRIC_INNER_PRODUCT)
faiss_index.add(items_embeddings)
faiss.write_index(faiss_index, 'data/items.index1p')

In [7]:
# Find the 20 nearest neighbours of every book that occurs in the transactions.
# Result: recomendations_by_title[book id] is a list of single-entry dicts
# {neighbour id: inner-product score}, best match first.
transactions = pd.read_csv('ds/train_transactions_extended.csv', sep=';', index_col=None, usecols=['sys_numb'])
transactions = transactions['sys_numb'].tolist()

neighbours_per_book = 20
# The local names below deliberately differ from the notebook-level `item_ids`
# and `batch_size`, so running this cell does not rebind the values that the
# embedding cell relies on.
search_batch_size = 512

# Every book is searched once, however often it was ordered, and only books
# that have an embedding are queried. No batch can therefore be empty (an
# empty query array makes the FAISS search fail).
query_ids = [book_id for book_id in dict.fromkeys(transactions) if book_id in embeddings_dict]

recomendations_by_title = {}
for batch_start in tqdm(range(0, len(query_ids), search_batch_size)):
    batch_ids = query_ids[batch_start:batch_start + search_batch_size]
    queries = np.ascontiguousarray(
        np.stack([embeddings_dict[book_id] for book_id in batch_ids]),
        dtype=np.float32,
    )
    # One extra result is requested because the query book itself is expected
    # among its own nearest neighbours.
    scores, positions = faiss_index.search(queries, neighbours_per_book + 1)
    for book_id, row_scores, row_positions in zip(batch_ids, scores, positions):
        # Drop the query book by id rather than by position: with tied scores
        # (e.g. books with identical text) it is not guaranteed to come first.
        # Negative positions mark missing results and are skipped as well.
        neighbours = [
            {items_ids[position]: score}
            for position, score in zip(row_positions, row_scores)
            if position >= 0 and items_ids[position] != book_id
        ]
        recomendations_by_title[book_id] = neighbours[:neighbours_per_book]

100% 507/507 [02:40<00:00,  3.17it/s]


In [10]:
# Build per-user recommendations from the nearest neighbours of the books
# each user ordered. Result: `recommendations` is a list of (user, book id)
# tuples, at most `max_recommendations` per user, best match first.

transactions = pd.read_csv(
    'ds/train_transactions_extended.csv',
    sep=';',
    index_col=None,
    usecols=('chb', 'sys_numb', 'date_1')
)

recommendations = []
max_recommendations = 20

# groupby yields every user's rows in one pass instead of re-scanning the
# whole table with a boolean mask for each user.
for user, user_transactions in tqdm(transactions.groupby('chb', sort=False)):
    # Newest transactions first; the order only decides ties between equally
    # similar candidates.
    recent_first = user_transactions.sort_values('date_1', ascending=False)

    # Stage 1 - collect the neighbours of every book the user ordered and keep
    # the best (highest) inner-product score seen for each candidate.
    # NOTE(review): books the user has already ordered are not filtered out of
    # the candidates - confirm whether that is intended.
    best_similarity = {}
    for book in recent_first['sys_numb']:
        for nearest_book in recomendations_by_title.get(book, []):
            item, similarity = next(iter(nearest_book.items()))
            if item not in best_similarity or similarity > best_similarity[item]:
                best_similarity[item] = similarity

    # Stage 2 - rank the candidates by similarity and keep the top
    # `max_recommendations`. Sorting once replaces the hand-maintained
    # "worst item" bookkeeping, which could evict an entry while there was
    # still room and so return fewer items than were available.
    ranked_items = sorted(best_similarity, key=best_similarity.get, reverse=True)
    recommendations += [(user, r_item) for r_item in ranked_items[:max_recommendations]]

100% 16753/16753 [00:25<00:00, 661.42it/s]


In [11]:
# Write the solution to a file.
# Passing the column names to the constructor also works when there are no
# recommendations at all (assigning `.columns` to an empty frame raises).
result = pd.DataFrame(recommendations, columns=['chb', 'sys_numb'])
# A stable sort keeps each user's rows in the order they were produced;
# the default sort algorithm does not guarantee that.
result = result.sort_values(by=['chb'], kind='stable')
# `lineterminator` is the current name of the keyword; the old spelling
# `line_terminator` is no longer accepted by pandas 2.x.
result.to_csv('solutions/solution1w.csv', index=False, sep=';', lineterminator='\n')