In [4]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
import dill

from nltk.corpus import names
import nltk; nltk.download('stopwords')
# NLTK Stop words
from nltk.corpus import stopwords

import re

from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

n_samples = 2000
n_features = 1000

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kirillvolkov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data = pd.read_csv('./goodbooks-10k/books.csv')
data.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [7]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.key].fillna(self.value)

Создадим сразу глобальную переменную titles_train, которая будет содержать названия всех оригинальных книг из тренировочного набора, чтобы мы потом могли их возвращать

In [8]:
titles_train = data['original_title']

Создадим уже пайплайн наш

In [9]:
pipeline = Pipeline([
    ('original_title', ColumnSelector(key='original_title')),
    ('fill_na_title', TextImputer(key='original_title', value='')),
    ('tfidf_vectorizer', TfidfVectorizer(max_df=0.95, min_df=2, analyzer=lambda x: x, 
                                   max_features=n_features,
                                   stop_words='english'))])

model = pipeline.fit(data)



In [10]:
#создадим матрицу для всех заголовков нашего тренировочного набора данных
tfidf_matrix = model.transform(data)
tfidf_matrix

<10000x368 sparse matrix of type '<class 'numpy.float64'>'
	with 128138 stored elements in Compressed Sparse Row format>

10000 книг у нас и 368 признаков получилось

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

Пусть у нас есть "новая" книга с заголовком каким-то

In [12]:
test_title = data.iloc[101:102]
test_title

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
101,102,19543,19543,3020535,110,99408392,9780099000000.0,Maurice Sendak,1963.0,Where the Wild Things Are,...,620618,636061,9102,15392,27532,93700,167043,332394,https://images.gr-assets.com/books/1384434560m...,https://images.gr-assets.com/books/1384434560s...


Заголовок у нас такой

In [13]:
test_title.original_title.iloc[0]

'Where the Wild Things Are'

С помощью нашего пайплайна получим для нее векторное представление

In [14]:
test_tfidf = pipeline.transform(test_title)
test_tfidf

<1x368 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

Посчитаем расстояния до векторного представления title новой книги

In [15]:
cosine_sim_titles = cosine_similarity(test_tfidf, tfidf_matrix)
cosine_sim_titles

array([[0.52818882, 0.56243929, 0.42228686, ..., 0.41837151, 0.66408753,
        0.79119869]])

Воспользуемся argpartition чтобы отсортировать наш массив по убыванию расстояния и чтобы получить индексы ближайших элементов

In [16]:
ind = np.argpartition(cosine_sim_titles, -5)
ind

array([[7411, 4999,    2, ...,  101, 6587, 3975]])

In [17]:
ind[0][:5]

array([7411, 4999,    2,    3,    4])

Вытащим заголовки этих книг (топ 5 для примера)

In [19]:
titles_train.iloc[ind[0][:5]].values

array(['Binge', 'Passion Unleashed', 'Twilight', 'To Kill a Mockingbird',
       'The Great Gatsby'], dtype=object)

Введем спомогательную функцию, которая будет нам возвращать топ

In [23]:
def book_recommendations(title, n=5):
    test_tfidf = pipeline.transform(title)
    cosine_sim_titles = cosine_similarity(test_tfidf, tfidf_matrix)
    ind = np.argpartition(cosine_sim_titles, -n)
    candidates = titles_train.iloc[ind[0][:n]].values
    return candidates

In [33]:
book_recommendations(data.iloc[200:201])

array(['84, Charing Cross Road', 'Passion Unleashed', 'Twilight',
       'Means of Ascent ', 'Billy Budd, Sailor'], dtype=object)

In [26]:
#сохраняем пайплайн
with open("book_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)