In [14]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

import nmslib

import pickle

import warnings
warnings.filterwarnings('ignore')

In [15]:
ratings = pd.read_csv('./data/ratings.csv')
books = pd.read_csv('./data/books.csv')
tags = pd.read_csv('./data/tags.csv')
book_tags = pd.read_csv('./data/book_tags.csv')

In [16]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [17]:
tags = pd.read_csv('./data/tags_cleaned.csv')
tags.head()

Unnamed: 0,tag_id,tag_name
0,509,19th-century
1,923,20th-century
2,941,21st-century
3,1499,abuse
4,1540,action


In [18]:
# Создадим словарь сопоставлений названий колонок
mapper = dict(zip(books['goodreads_book_id'], books['book_id']))
mapper

{2767052: 1,
 3: 2,
 41865: 3,
 2657: 4,
 4671: 5,
 11870085: 6,
 5907: 7,
 5107: 8,
 960: 9,
 1885: 10,
 77203: 11,
 13335037: 12,
 5470: 13,
 7613: 14,
 48855: 15,
 2429135: 16,
 6148028: 17,
 5: 18,
 34: 19,
 7260188: 20,
 2: 21,
 12232938: 22,
 15881: 23,
 6: 24,
 136251: 25,
 968: 26,
 1: 27,
 7624: 28,
 18135: 29,
 8442457: 30,
 4667024: 31,
 890: 32,
 930: 33,
 10818853: 34,
 865: 35,
 3636: 36,
 100915: 37,
 14050: 38,
 13496: 39,
 19501: 40,
 28187: 41,
 1934: 42,
 10210: 43,
 15931: 44,
 4214: 45,
 43641: 46,
 19063: 47,
 4381: 48,
 49041: 49,
 30119: 50,
 256683: 51,
 428263: 52,
 113436: 53,
 11: 54,
 5129: 55,
 1162543: 56,
 37435: 57,
 2956: 58,
 24178: 59,
 1618: 60,
 22557272: 61,
 119322: 62,
 6185: 63,
 10917: 64,
 4981: 65,
 18405: 66,
 128029: 67,
 22628: 68,
 11735983: 69,
 375802: 70,
 18490: 71,
 11588: 72,
 1656001: 73,
 99561: 74,
 227443: 75,
 14935: 76,
 38709: 77,
 5139: 78,
 1381: 79,
 157993: 80,
 7445: 81,
 1845: 82,
 1953: 83,
 7677: 84,
 370493: 85,
 32

In [19]:
book_tags = book_tags[book_tags['tag_id'].isin(tags['tag_id'])]
book_tags['id'] = book_tags['goodreads_book_id'].apply(lambda x: mapper[x])
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27


In [20]:
# Создадим разряженную матрицу взаимодействия пользователей с книгами
ratings_coo = sparse.coo_matrix((ratings.rating,(ratings.user_id,ratings.book_id)))
feature_ratings  = sparse.coo_matrix(([1]*len(book_tags),(book_tags.id,book_tags.tag_id)))

In [21]:
# число потоков процессора (зависит от того, на какой машине запускаете)
NUM_THREADS = 1

#число параметров вектора 
NUM_COMPONENTS = 60 

#число эпох обучения
NUM_EPOCHS = 10

#зерно датчика случайных чисел
RANDOM_STATE = 42

In [22]:
#Разбиваем датасет на обучающую и тестовую выборки
train, test = random_train_test_split(ratings_coo, test_percentage=0.2, random_state=RANDOM_STATE)

#Создаём модель
model = LightFM(
    learning_rate=0.05, #темп (скорость) обучения
    loss='warp', #loss-функция
    no_components=NUM_COMPONENTS,#размерность вектора признаков
    random_state=RANDOM_STATE #генератор случайных чисел
)

#Обучаем модель
model = model.fit(
    train, #обучающая выборка
    epochs=NUM_EPOCHS, #количество эпох обучения
    num_threads=NUM_THREADS, #количество потоков процессора
    item_features=feature_ratings #признаки товаров (рейтинги книг)
)


In [23]:
# Тестируем нашу модель
precision_score = precision_at_k(
    model,  # модель
    test,  # тестовая выборка
    num_threads=NUM_THREADS,  # количество потоков процессора
    k=10,  # количество предложений
    item_features=feature_ratings  # признаки товаров
).mean()  # усредняем результаты

recall_score = recall_at_k(
    model,  # модель
    test,  # тестовая выборка
    num_threads=NUM_THREADS,  # количество потоков процессора
    k=10,  # количество предложений
    item_features=feature_ratings  # признаки товаров
).mean()  # усредняем результаты

print(recall_score, precision_score)

0.04008034798209189 0.08673393


In [24]:
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)#, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
with open('./model.pkl', 'rb') as file:
    model = pickle.load(file)

Эмбединг

In [32]:
# Извлекаем эмбеддинги
item_biases, item_embeddings = model.get_item_representations(features=feature_ratings)

print(item_biases.shape, item_embeddings.shape)


(10001,) (10001, 60)


In [30]:
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
nms_idx.addDataPointBatch(item_embedings)
nms_idx.createIndex(print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************


In [31]:
def nearest_books_nms(book_id, index, n=10):
    nn = index.knnQuery(item_embedings[book_id],k =n)
    return nn

In [33]:
#Отфильтруем только те книги, в которых названии встречается подстрока "1984"
books[books['title'].apply(lambda x: x.lower().find('1984')) >= 0]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
12,13,5470,5470,153313,995,451524934,9780452000000.0,"George Orwell, Erich Fromm, Celâl Üster",1949.0,Nineteen Eighty-Four,...,1956832,2053394,45518,41845,86425,324874,692021,908229,https://images.gr-assets.com/books/1348990566m...,https://images.gr-assets.com/books/1348990566s...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...
9795,9796,201145,201145,2563528,25,64440508,9780064000000.0,"Else Holmelund Minarik, Maurice Sendak",1968.0,A Kiss for Little Bear,...,11063,11604,126,87,284,1898,3053,6282,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


In [34]:
#Вызываем функцию для поиска ближайших соседей
nearest_books_nms(846, nms_idx)

(array([846,  14,  55, 809,  13,  48, 289, 271, 375, 173], dtype=int32),
 array([0.        , 0.03544855, 0.04098177, 0.05688703, 0.06425363,
        0.0703209 , 0.08217251, 0.08694875, 0.08821321, 0.08975214],
       dtype=float32))

In [35]:
#Выделяем идентификаторы рекомендованных книг
nbm = nearest_books_nms(846, nms_idx)[0]
nbm

array([846,  14,  55, 809,  13,  48, 289, 271, 375, 173], dtype=int32)

In [36]:
#Посмотрим на авторов и названия рекомендованных книг
books[books.book_id.isin(nbm)][['authors', 'title']]

Unnamed: 0,authors,title
12,"George Orwell, Erich Fromm, Celâl Üster",1984
13,George Orwell,Animal Farm
47,Ray Bradbury,Fahrenheit 451
54,Aldous Huxley,Brave New World
172,Anthony Burgess,A Clockwork Orange
270,Daniel Keyes,Flowers for Algernon
288,Richard Adams,"Watership Down (Watership Down, #1)"
374,Jack London,The Call of the Wild
808,"Aldous Huxley, Christopher Hitchens",Brave New World / Brave New World Revisited
845,"George Orwell, Christopher Hitchens",Animal Farm / 1984


In [37]:
with open('item_embeddings.pkl', 'wb') as file:
    pickle.dump(item_embeddings, file)#, protocol=pickle.HIGHEST_PROTOCOL)