In [1]:
import pandas as pd
import numpy as np

import pickle
from pathlib import Path
from tqdm import tqdm

In [3]:
full_dataset = pd.concat([pd.read_csv(str(x.resolve())) for x in Path("data/").glob("*k.csv")])

columns = ['Id', 'Name', 'RatingDist1', 'RatingDist2', 'RatingDist3',
           'RatingDist4', 'RatingDist5', 'Rating', 'RatingDistTotal', 'pagesNumber', 'Publisher',
           'Authors', 'Language', 'Description']
full_dataset = full_dataset[columns]

full_dataset = full_dataset[~full_dataset['Description'].isna()]
full_dataset['RatingDistTotal'] = full_dataset['RatingDistTotal'].apply(lambda x: float(x.replace('total:', '')))

In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

full_dataset = reduce_mem_usage(full_dataset)

Mem. usage decreased to 113.93 Mb (15.0% reduction)


In [6]:
import re

def text_preprocessing_step(text):
    text = text.lower()

    # Удаляем html-теги
    re_html = re.compile(r'<.*?>')
    text = re_html.sub(' ', text)

    return text.strip()

# Т.к. книг много, ограничим их кол-во, для экономии времени
full_dataset = full_dataset[full_dataset['Description'].apply(len) >= 50]
full_dataset = full_dataset[full_dataset['RatingDistTotal'] >= 2000]

descriptions = full_dataset['Description'].apply(text_preprocessing_step)

In [5]:
# Можно загрузить уже посчитанный и сохранённый набор векторов.

with open('result/embeddings_v1.pkl', 'rb') as file:
    embeddings = pickle.load(file)

In [7]:
filters = (full_dataset['Language'] == 'eng') | (full_dataset['Language'].isna())

embeddings = embeddings[filters]
descriptions = descriptions[filters]
full_dataset = full_dataset[filters]

In [8]:
with open('result/authors_dict_v1.pkl', 'rb') as file:
    authors_dict = pickle.load(file)

with open('result/books_dict_v1.pkl', 'rb') as file:
    books_dict = pickle.load(file)

In [9]:
user_rating = pd.concat([pd.read_csv(str(x.resolve())) for x in Path("data/").glob("user_*.csv")])

In [11]:
full_dataset

Unnamed: 0,Id,Name,RatingDist1,RatingDist2,RatingDist3,RatingDist4,RatingDist5,Rating,RatingDistTotal,pagesNumber,Publisher,Authors,Language,Description
9,1000014,Haroun and the Sea of Stories,1:458,2:1512,3:5842,4:10325,5:10441,4.011719,28578.0,219.0,Turtleback Books,Salman Rushdie,eng,The author of The Satanic Verses returns with ...
10,1000020,There and Back Again: An Actor's Tale,1:129,2:289,3:660,4:601,5:1266,3.880859,2945.0,0.0,Macmillan Audio,Sean Astin,,<b>The fascinating memoir of a Hollywood life ...
14,1000030,Anne of Green Gables,1:12153,2:24252,3:97335,4:201304,5:363709,4.261719,698753.0,0.0,Tantor Media,L.M. Montgomery,,When Marilla and Matthew Cuthbert of Green Gab...
44,1000089,"Trojan Odyssey (Dirk Pitt, #17)",1:119,2:611,3:4346,4:6893,5:4881,3.939453,16850.0,485.0,G. P. Putnam's Sons,Clive Cussler,eng,Long hailed as the grand master of adventure f...
75,1000153,"Jackie & Me (A Baseball Card Adventure, #2)",1:36,2:92,3:481,4:995,5:1437,4.218750,3041.0,160.0,HarperCollins,Dan Gutman,eng,"Like every other kid in his class, Joe Stoscac..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40783,999701,The Hound of the Baskervilles: Easy Readers,1:1602,2:6788,3:45575,4:97279,5:92546,4.121094,243790.0,95.0,Klett Schulbuchverlag,Arthur Conan Doyle,eng,This popular series of readers has now been co...
40806,999794,I Wish I Had a Red Dress,1:30,2:122,3:619,4:1096,5:1259,4.101562,3126.0,336.0,William Morrow,Pearl Cleage,,Joyce Mitchell was widowed far too young when ...
40819,999823,El Arte de Ensonar,1:79,2:215,3:766,4:1375,5:1842,4.101562,4277.0,304.0,Rayo,Carlos Castañeda,,"El Arte de Ensoñar, el noveno y mas reciente l..."
40820,999824,The Power of Silence: Further Lessons of Don Juan,1:31,2:84,3:416,4:841,5:1260,4.218750,2632.0,286.0,Simon & Schuster,Carlos Castañeda,,The Power of Silence is Castaneda's most memor...


In [12]:

user_rating = pd.merge(user_rating, full_dataset[['Id', 'Name']], on='Name', suffixes=('', '_book'))

In [14]:
user_rating.head()

Unnamed: 0,ID,Name,Rating,Id
0,1,The Restaurant at the End of the Universe (Hit...,it was amazing,862825
1,73,The Restaurant at the End of the Universe (Hit...,really liked it,862825
2,116,The Restaurant at the End of the Universe (Hit...,it was amazing,862825
3,171,The Restaurant at the End of the Universe (Hit...,really liked it,862825
4,338,The Restaurant at the End of the Universe (Hit...,liked it,862825


In [18]:
user_rating['Rating'].unique()

array(['it was amazing', 'really liked it', 'liked it', 'it was ok',
       'did not like it'], dtype=object)

In [16]:
user_rating['ID'].nunique(), user_rating.shape

(3757, (450652, 4))

In [19]:
user_rating_pos = user_rating[user_rating['Rating'].isin(['it was amazing', 'really liked it', 'liked it'])]

In [25]:
book_pairs = {}

for user_id in tqdm(user_rating_pos['ID'].unique()):
    user_books = user_rating_pos[user_rating_pos['ID'] == user_id]['Name'].values

    for i, name1 in enumerate(user_books):
        for j, name2 in enumerate(user_books, i+1):
            # pair_names = ' | '.join(sorted((name1, name2)))
            
            if name1 > name2:
                pair_names = (name1, name2)
            else:
                pair_names = (name2, name1)

            if book_pairs.get(pair_names) is None:
                book_pairs[pair_names] = 1
            else:
                book_pairs[pair_names] += 1

100%|██████████| 3703/3703 [01:52<00:00, 33.06it/s] 


In [27]:
for name in user_rating_pos['Name'].unique():
    del book_pairs[(name, name)]

Будем считать, что если книги хотя бы раз встречались вместе, то это 1, иначе 0.

In [30]:
pairs_total = 0
pairs_matched = 0

for name, sim_books in tqdm(books_dict.items()):
    
    for matched_name in sim_books:
        if name > matched_name:
            pair_names = (name, matched_name)
        else:
            pair_names = (matched_name, name)

        if book_pairs.get(pair_names) is not None:
            pairs_matched += 1
        pairs_total += 1

100%|██████████| 36519/36519 [00:00<00:00, 76836.07it/s]


In [56]:
print(f'Попадание пары книг у пользователя в оценках {round(pairs_matched / pairs_total, 4) * 100.} %')

Попадание пары книг у пользователя в оценках 3.9 %


Среднее значение Precision@10

In [36]:
single_book_dict = {}
for name1, name2 in book_pairs.keys():

    if single_book_dict.get(name1) is None:
        single_book_dict[name1] = [name2]
    else:
        single_book_dict[name1].append(name2)

    if single_book_dict.get(name2) is None:
        single_book_dict[name2] = [name1]
    else:
        single_book_dict[name2].append(name1)

In [64]:
single_book_dict['Freakonomics: A Rogue Economist Explores the Hidden Side of Everything'][:10]

["The Restaurant at the End of the Universe (Hitchhiker's Guide to the Galaxy, #2)",
 'Siddhartha',
 'The Hunger Games (The Hunger Games, #1)',
 'The Authoritative Calvin and Hobbes: A Calvin and Hobbes Treasury',
 'The Return of the Indian (The Indian in the Cupboard, #2)',
 'The Name of the Rose',
 'Dark Apprentice (Star Wars: The Jedi Academy Trilogy, #2)',
 'A Short History of Nearly Everything',
 'Angels & Demons (Robert Langdon, #1)',
 'The Return of the King (The Lord of the Rings, #3)']

In [48]:
# Посчитаем средний Precision@10
precision_sum = 0
precision_cnt = 0

for name, sim_books in tqdm(books_dict.items()):
    if single_book_dict.get(name) is None:
        continue
    
    K = min(10, len(single_book_dict[name]))
    intersetction = len(set(single_book_dict[name]).intersection(set(sim_books[:K])))

    precision_sum += float(intersetction) / K
    precision_cnt += 1

100%|██████████| 36519/36519 [00:00<00:00, 50440.03it/s]


In [59]:
print(f'Среднее значение Precision@10 = {round(precision_sum / precision_cnt, 3)}')

Среднее значение Precision@10 = 0.128
