In [None]:
%%capture
!pip install torch
!pip install numpy --upgrade
!pip install pandas
!pip install scikit-learn

In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the serialized model object and remap its tensors onto the CPU.
# NOTE(review): torch.load unpickles the file, so only point this at a
# checkpoint from a trusted source.
model = torch.load('../binaries/5k_steps.pt', map_location=torch.device('cpu'))

In [None]:
# Load the precomputed per-book feature vectors.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# the provenance of this file is not visible here -- confirm it is trusted.
with open("../data/book_vectors.pickle", "rb") as handle:
    book_vectors = pickle.load(handle)

In [None]:
# Training rows; the cells below read its "book_id" and "title" columns.
train_df = pd.read_csv("../data/train_data.csv")

In [None]:
# Map every distinct book_id to its rank in order of first appearance in
# train_df (0 for the first id encountered, 1 for the next, and so on).
book_to_index = {
    book_id: position
    for position, book_id in enumerate(train_df["book_id"].unique())
}

In [None]:
# List the distinct (title, book_id) pairs whose title mentions
# "lebron james", ignoring case.
# na=False makes rows with a missing title count as non-matches; without it
# the mask contains NaN and boolean indexing raises. Rows and columns are
# selected in a single .loc call instead of two chained indexers.
train_df.loc[
    train_df["title"].str.contains("lebron james", case=False, na=False),
    ["title", "book_id"],
].drop_duplicates()

In [None]:
def compute_embedding(book_id):
    """Return the output of ``model.fc1`` for a single book.

    The book id is translated to a position through ``book_to_index``; the
    entry of ``book_vectors`` stored under that position is cast to float32,
    reshaped to one row, and fed through ``model.fc1`` with gradient
    tracking switched off.

    Args:
        book_id: An id present as a key of ``book_to_index``.

    Returns:
        Whatever ``model.fc1`` returns for the one-row float32 input tensor.

    Raises:
        KeyError: If ``book_id`` is not a key of ``book_to_index``.
    """
    vector_position = book_to_index[book_id]
    features = book_vectors[vector_position].astype('float32')
    single_row_batch = torch.tensor(features.reshape(1, -1))
    with torch.no_grad():
        return model.fc1(single_row_batch)

In [None]:
# Cosine similarity between the fc1 embeddings of book_id 5 and book_id 3
# (author's labels: a Harry Potter book vs. another Harry Potter book).
cosine_similarity(compute_embedding(5), compute_embedding(3))

In [None]:
# Cosine similarity between the fc1 embeddings of book_id 5 and book_id 255444
# (author's labels: a Harry Potter book vs. Catch-22).
cosine_similarity(compute_embedding(5), compute_embedding(255444))

In [None]:
# Cosine similarity between the fc1 embeddings of book_id 5 and book_id 100915
# (author's labels: a Harry Potter book vs. The Lion, the Witch and the Wardrobe).
cosine_similarity(compute_embedding(5), compute_embedding(100915))

In [None]:
# Cosine similarity between the fc1 embeddings of book_id 5 and book_id 13554713
# (author's labels: a Harry Potter book vs. a Brothers Grimm book).
cosine_similarity(compute_embedding(5), compute_embedding(13554713))

In [None]:
# Cosine similarity between the fc1 embeddings of book_id 5 and book_id 882588
# (author's labels: a Harry Potter book vs. a LeBron James book).
cosine_similarity(compute_embedding(5), compute_embedding(882588))

In [None]:
%%time
# Timing check: run the first 10,000 stored book vectors through model.fc1
# one at a time. %%time (which must stay on the first line) reports the
# elapsed time for the whole cell.
# NOTE(review): list(book_vectors.values()) builds a list of every stored
# vector before slicing, and that work is part of the measured time.
for vector in list(book_vectors.values())[:10000]:
    with torch.no_grad():
        # Same preparation as compute_embedding: float32, reshaped to one row.
        book_tensor = torch.tensor(vector.astype('float32').reshape(1, -1))
        # Overwritten on every pass; the embeddings are not collected.
        book_embedding = model.fc1(book_tensor)

In [1]:
import pandas as pd

In [3]:
# Load the training data; only its book_id and title columns are kept below.
df = pd.read_csv('../data/train_data.csv')

In [4]:
# Narrow the frame to the two columns the lookup table is built from.
df = df.loc[:, ['book_id', 'title']]

In [5]:
# Preview the first rows; the same book_id still appears on several rows here.
df.head()

Unnamed: 0,book_id,title
0,9712492,"Providence (Providence, #1)"
1,9712492,"Providence (Providence, #1)"
2,9712492,"Providence (Providence, #1)"
3,9712492,"Providence (Providence, #1)"
4,9712492,"Providence (Providence, #1)"


In [7]:
# Collapse to one row per book_id, then renumber the rows from 0.
df = (
    df
    .drop_duplicates(subset='book_id')
    .reset_index(drop=True)
)

In [8]:
# Preview after de-duplication: each book_id now appears once.
df.head()

Unnamed: 0,book_id,title
0,9712492,"Providence (Providence, #1)"
1,11505797,"Beautiful Disaster (Beautiful, #1)"
2,11532160,"Requiem (Providence, #2)"
3,13417946,"Eden (Providence, #3)"
4,10140661,"Gabriel's Inferno (Gabriel's Inferno, #1)"


In [9]:
# Number the de-duplicated books 0..len(df)-1 in their current row order.
# NOTE(review): presumably meant to line up with the integer keys of
# book_vectors used further down -- confirm both share the same ordering.
df['embedding_id'] = range(len(df))

In [10]:
# Preview with the new embedding_id column appended.
df.head()

Unnamed: 0,book_id,title,embedding_id
0,9712492,"Providence (Providence, #1)",0
1,11505797,"Beautiful Disaster (Beautiful, #1)",1
2,11532160,"Requiem (Providence, #2)",2
3,13417946,"Eden (Providence, #3)",3
4,10140661,"Gabriel's Inferno (Gabriel's Inferno, #1)",4


In [11]:
# Reorder the columns so both identifiers come before the title.
df = df.loc[:, ['book_id', 'embedding_id', 'title']]

In [12]:
# Preview with the columns in their new order.
df.head()

Unnamed: 0,book_id,embedding_id,title
0,9712492,0,"Providence (Providence, #1)"
1,11505797,1,"Beautiful Disaster (Beautiful, #1)"
2,11532160,2,"Requiem (Providence, #2)"
3,13417946,3,"Eden (Providence, #3)"
4,10140661,4,"Gabriel's Inferno (Gabriel's Inferno, #1)"


In [43]:
# Write the table to books_df.csv in the current working directory, leaving
# out the row index.
# NOTE(review): this cell's execution count (In [43]) is higher than that of
# the cell further down that adds the `book_vector` column (In [40]), so it
# was last run after that column existed. Run top-to-bottom, the file is
# written without `book_vector` -- confirm which output is intended.
df.to_csv('books_df.csv', index=False)

In [19]:
!pip install pickle5

Collecting pickle5
  Downloading https://files.pythonhosted.org/packages/91/2b/2dca5d1719d020cd055eb637cbd813e79171dc80210de8778502195bbc62/pickle5-0.0.12-cp37-cp37m-macosx_10_9_x86_64.whl (124kB)
    100% |████████████████████████████████| 133kB 4.6MB/s eta 0:00:01
Installing collected packages: pickle5
Successfully installed pickle5-0.0.12


In [32]:
import json
import pickle5 as pickle

In [21]:
# Load the precomputed per-book feature vectors; `pickle` here is the
# pickle5 module imported under that name in the cell above.
# NOTE(review): unpickling can execute arbitrary code; the provenance of this
# file is not visible here -- confirm it is trusted.
with open("../data/book_vectors.pickle", "rb") as handle:
    book_vectors = pickle.load(handle)

In [24]:
# Smallest key; together with the next two checks this shows whether the
# keys are exactly 0..len-1.
min(book_vectors.keys())

0

In [25]:
# Largest key; should equal len(book_vectors) - 1 if the keys are contiguous.
max(book_vectors.keys())

519731

In [26]:
# Number of stored vectors, for comparison with the min/max keys.
len(book_vectors)

519732

In [34]:
# Serialize each stored vector to a JSON array string, visiting the keys
# 0..len(book_vectors)-1 in ascending order so list position equals key.
book_vec_list = [
    json.dumps(list(book_vectors[position]))
    for position in range(len(book_vectors))
]

In [40]:
# Attach the JSON strings as a new column; values are assigned by position,
# so the list must have exactly len(df) entries.
# NOTE(review): presumably row i of df corresponds to book_vectors[i] --
# confirm the two were built in the same order.
df['book_vector'] = book_vec_list

In [41]:
# Preview with the serialized book_vector column attached.
df.head()

Unnamed: 0,book_id,embedding_id,title,book_vector
0,9712492,0,"Providence (Providence, #1)","[0, 0.0, 0, 0.011195625503873251, 0, 0.8, 0.00..."
1,11505797,1,"Beautiful Disaster (Beautiful, #1)","[0, 0.0, 0, 0.15401871779592696, 1, 0.828, 0.0..."
2,11532160,2,"Requiem (Providence, #2)","[0, 0.0, 0, 0.004732027060184374, 0, 0.8300000..."
3,13417946,3,"Eden (Providence, #3)","[0, 0.0, 0, 0.005412036874759017, 0, 0.8340000..."
4,10140661,4,"Gabriel's Inferno (Gabriel's Inferno, #1)","[1, 1.0, 0, 0.04355568018507484, 0, 0.80399999..."


In [42]:
# Check the stored cell type: each vector is held as a JSON string, not a list.
type(df['book_vector'][0])

str

In [39]:
# Show the first serialized vector in full.
book_vec_list[0]

'[0, 0.0, 0, 0.011195625503873251, 0, 0.8, 0.00038092134291703217, 0.004835544743686944, false, false, false, false, false, false, false, false, false, false, true, false, false, false, false, true, false, -1.3562464714050293, 1.1212412118911743, -2.3047139644622803, -1.0970467329025269, 3.304687738418579, 0.2997288703918457, 0.570357084274292, 5.383020401000977, 0.16782329976558685, -0.05038875713944435, 5.724740982055664, 1.1003310680389404, -2.285029649734497, 1.3047746419906616, 1.629544973373413, 1.3629673719406128, -0.5762142539024353, -1.3210035562515259, -0.9516798257827759, -1.4543696641921997, 0.9715679287910461, -0.5614798665046692, -0.550462543964386, -0.9130169153213501, -0.8245174884796143, -1.8662865161895752, -3.7481517791748047, 0.24189385771751404, 0.4625185430049896, 0.6747053861618042, 0.8302078247070312, -0.7133716344833374, -0.49939993023872375, -0.3534839451313019, -2.9900429248809814, -0.6217560172080994, -0.9875502586364746, 1.3178107738494873, -0.1016469225287

In [1]:
import torch

In [5]:
# Load the serialized model object and remap its tensors onto the CPU.
# NOTE(review): torch.load unpickles the file, so only point this at a
# checkpoint from a trusted source.
model = torch.load("../binaries/5k_steps.pt", map_location=torch.device("cpu"))

In [6]:
# Re-save only the model's state_dict (rather than the whole model object)
# next to the original checkpoint.
torch.save(model.state_dict(), "../binaries/5k_steps_state_dict.pt")