In [1]:
import pandas as pd
import numpy as np

import pickle

In [2]:
# Load the precomputed embeddings from disk.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by a trusted source.
# NOTE(review): presumably one embedding row per book, in the same row order as
# the filtered `full_dataset` built in later cells — confirm.
with open('result/embeddings_v1.pkl', 'rb') as file:
    embeddings = pickle.load(file)

In [3]:
from pathlib import Path

# Columns kept from the raw CSV exports; everything else is dropped.
columns = ['Id', 'Name', 'RatingDist1', 'RatingDist2', 'RatingDist3',
           'RatingDist4', 'RatingDist5', 'Rating', 'RatingDistTotal', 'pagesNumber', 'Publisher',
           'Authors', 'Language', 'Description']

# NOTE(review): Path.glob yields files in filesystem order, which is not
# guaranteed to be stable across machines. Anything that is matched to this
# frame by row position depends on that order — confirm before sorting.
csv_paths = list(Path("data/").glob("*k.csv"))
if not csv_paths:
    # pd.concat([]) would fail with a much less helpful "No objects to concatenate".
    raise FileNotFoundError("No files matching 'data/*k.csv' were found")

full_dataset = pd.concat([pd.read_csv(str(x.resolve())) for x in csv_paths])

# Build the cleaned frame in one chain so that no column is assigned on a
# filtered slice (which triggers SettingWithCopyWarning):
#   1. keep only the columns of interest,
#   2. drop books without a description,
#   3. turn 'total:<n>' strings into floats with a vectorised string op.
full_dataset = (
    full_dataset[columns]
    .dropna(subset=['Description'])
    .assign(
        RatingDistTotal=lambda df: (
            df['RatingDistTotal']
            .str.replace('total:', '', regex=False)
            .astype(float)
        )
    )
)

In [None]:
def _smallest_safe_float_dtype(series, c_min, c_max, rtol=1e-6):
    """Pick the narrowest float dtype that can hold `series` without visible loss.

    A candidate dtype is accepted only if the column's range fits AND casting
    to it and back reproduces every value within relative tolerance `rtol`.
    With the default tolerance float16 is chosen only for values it represents
    (almost) exactly; typical decimals and large counts end up as float32.

    Returns np.float16, np.float32 or np.float64.
    """
    values = series.to_numpy(dtype=np.float64)
    for candidate in (np.float16, np.float32):
        limits = np.finfo(candidate)
        # NaN bounds (empty / all-NaN column) and infinities fail this check
        # and therefore fall through to float64.
        if not (limits.min <= c_min and c_max <= limits.max):
            continue
        round_trip = values.astype(candidate).astype(np.float64)
        if np.allclose(round_trip, values, rtol=rtol, atol=0.0, equal_nan=True):
            return candidate
    return np.float64


def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of `df` to smaller dtypes to save memory.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched. Integer columns get the narrowest signed integer type whose
    range (bounds inclusive) contains the column's min and max. Float columns
    get the narrowest float type that keeps the values within a small relative
    tolerance, so float16 is no longer chosen on range alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the resulting memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            df[col] = df[col].astype(_smallest_safe_float_dtype(df[col], c_min, c_max))
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized frame so the percentage stays finite.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Downcast the numeric columns of the dataset in place to save memory.
# NOTE(review): this cell has no execution count (`In [None]`), so the outputs
# shown further down were apparently produced without this step — confirm.
full_dataset = reduce_mem_usage(full_dataset)

In [4]:
import re

def text_preprocessing_step(text):
    """Lower-case `text` and blank out HTML tags.

    Every `<...>` span (shortest match, not crossing a newline) is replaced by
    a single space, then leading and trailing whitespace is trimmed. Runs of
    spaces inside the text are left as they are.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', ' ', lowered)
    return without_tags.strip()

# Keep only books whose raw description is at least 50 characters long ...
has_long_description = full_dataset['Description'].map(len) >= 50
full_dataset = full_dataset.loc[has_long_description]

# ... and whose RatingDistTotal is at least 2000.
has_enough_ratings = full_dataset['RatingDistTotal'] >= 2000
full_dataset = full_dataset.loc[has_enough_ratings]

# Cleaned (lower-cased, tag-free) description text, one entry per remaining book.
descriptions = full_dataset['Description'].map(text_preprocessing_step)

In [79]:
# Preview title and author of the first 30 books with a fresh 0-based index.
full_dataset.loc[:, ['Name', 'Authors']].reset_index(drop=True).head(30)

Unnamed: 0,Name,Authors
0,Haroun and the Sea of Stories,Salman Rushdie
1,There and Back Again: An Actor's Tale,Sean Astin
2,Feu Pâle,Vladimir Nabokov
3,Anne of Green Gables,L.M. Montgomery
4,"Trojan Odyssey (Dirk Pitt, #17)",Clive Cussler
5,"Jackie & Me (A Baseball Card Adventure, #2)",Dan Gutman
6,Shenzhen: A Travelogue from China,Guy Delisle
7,"Honus & Me (A Baseball Card Adventure, #1)",Dan Gutman
8,Lighthouse at the End of the World: The First ...,Jules Verne
9,Lighthouse at the End of the World,Jules Verne


In [10]:
# Number of rows in `embeddings`.
# NOTE(review): presumably this should equal len(full_dataset) after filtering,
# since positions taken from `embeddings` are used with full_dataset.iloc — confirm.
embeddings.shape[0]

66520

In [12]:
# Zero-filled buffer with one row per embedding and 10 columns.
# NOTE(review): not referenced by any other visible cell — possibly a leftover;
# confirm before removing.
pair_distances = np.zeros((embeddings.shape[0], 10))

In [58]:
from sklearn.metrics.pairwise import cosine_distances

In [80]:
%%time

# Row of `embeddings` that every other row is scored against.
# idx = 66515
idx = 16

# Alternatives that were tried (Euclidean norm and cosine distance):
# distances = np.linalg.norm(embeddings - embeddings[idx], axis=1)
# distances = cosine_distances(embeddings, embeddings[idx].reshape(1, -1))
# distances = distances.reshape(1, -1)[0]

# A sensible result is obtained only with the dot product.
# NOTE: despite its name, `distances` therefore holds dot-product scores, not distances.
distances = np.dot(embeddings, embeddings[idx])

Wall time: 12.1 ms


In [81]:
# Positions of the 50 largest scores, highest first: walk the ascending
# argsort result backwards from its end and stop after 50 entries.
indexes = np.argsort(distances)[:-51:-1]

In [74]:
# NOTE(review): this cell's execution count (74) is lower than that of the cell
# that computes `indexes` (81), so the array displayed here may be stale.
indexes

array([  958, 55309,  9236, 18970,  6267, 58902, 16963, 41825, 62336,
       39348, 33432, 40396, 40328, 12791, 48138,  9338, 15723, 35759,
       43358, 34489, 31978, 64650, 13686, 29836, 41100, 15795, 21377,
       35802, 41543, 62867, 34067, 62837, 56814, 56813, 34453, 60261,
        2457,  2458, 44161, 34365, 15486, 17045, 44131, 32449, 60199,
        8879, 14667, 44267,  3189, 50051], dtype=int64)

In [82]:
# Fetch the ranked books by row position, then show only the English-language ones.
tmp = full_dataset.iloc[indexes]
tmp.loc[tmp['Language'].eq('eng')]

Unnamed: 0,Id,Name,RatingDist1,RatingDist2,RatingDist3,RatingDist4,RatingDist5,Rating,RatingDistTotal,pagesNumber,Publisher,Authors,Language,Description
9342,921651,Just Go To Bed (A Golden Look-Look Book),1:395,2:914,3:4717,4:6926,5:13939,4.23,26891.0,24.0,Random House Books for Young Readers,Mercer Mayer,eng,Mercer Mayer's Little Critter is not looking f...
310236,2764081,There's A Nightmare In My Cupboard,1:143,2:429,3:2633,4:4454,5:7453,4.23,15112.0,,Puffin Books,Mercer Mayer,eng,The pesky nightmare in the closet won't stay i...
29774,1074339,The Napping House,1:690,2:1527,3:6282,4:11335,5:23099,4.27,42933.0,32.0,Harcourt Children's Books,Audrey Wood,eng,"A cozy bed, a snoring granny, a dreaming child..."
74375,2167752,"Beggars in Spain (Sleepless, #1)",1:144,2:390,3:1529,4:2899,5:2298,3.94,7260.0,,William Morrow,Nancy Kress,eng,"In the year 2008, thanks to a stunning scienti..."
282734,2688334,The Happiest Toddler on the Block: How to Elim...,1:183,2:778,3:2049,4:2054,5:1203,3.53,6267.0,,Bantam,Harvey Karp,eng,<b>Perfect for expecting parents who want to p...
208475,3588707,Junie B. Jones Has a Monster Under Her Bed (Ju...,1:236,2:537,3:1809,4:2001,5:3555,4.0,8138.0,,Turtleback Books,Barbara Park,eng,After hearing from a classmate at kindergarten...
254738,3719743,What to Expect the First Year,1:411,2:897,3:3838,4:5626,5:5131,3.89,15903.0,,Workman Publishing Company,Heidi Murkoff,eng,"Some things about babies, happily, will never ..."
21422,1949561,Have You Filled a Bucket Today?: A Guide to Da...,1:80,2:137,3:537,4:1128,5:2988,4.4,4870.0,,Ferne Press,Carol McCloud,eng,"Through sweet, simple prose and vivid illustra..."
18133,833046,I Am Not Going to Get up Today!,1:48,2:242,3:1120,4:1177,5:1753,4.0,4340.0,48.0,Random House Books for Young Readers,Dr. Seuss,eng,"""A rhyming story that is full of laughs. 'The ..."
364436,2911868,Parts,1:194,2:469,3:1903,4:3138,5:6716,4.27,12420.0,,Dial Books,Tedd Arnold,eng,I just don't know what's going onOr why it has...
