# Phase 0.5: look up ASINs from Phase 0 title strings

In [1]:
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


### Load and process data

In [2]:
# Read the tidy book table and keep only rows without missing values.
df = (
    pd.read_csv('../data/books_emb_8_tidy.csv')
    .dropna()
)
df

Unnamed: 0,asin,title,0,1,2,3,4,5,6,7
0,080102563x,choosing the good: christian ethics in a compl...,-2.060097,0.704931,-0.306748,-0.152604,-0.561663,-0.601592,0.164375,0.883133
1,0801048869,beginning biblical hebrew: a grammar and illus...,-2.105482,0.698176,-0.207842,-0.029184,-1.306253,-0.800488,0.203011,1.014983
2,0310331366,five views on biblical inerrancy (counterpoint...,-1.680703,0.842620,-0.148897,-0.007306,-1.089476,-1.006622,0.263047,1.188486
3,0199751811,good god: the theistic foundations of morality,-1.165233,0.721346,-0.341000,-0.014560,-0.540818,-1.027589,0.261891,1.175774
4,0801020751,evangelical dictionary of theology (baker refe...,-1.529123,0.907699,0.054865,-0.365376,-0.530953,-0.455765,0.208765,1.073318
...,...,...,...,...,...,...,...,...,...,...
1948349,155143184x,hoop crazy (orca young readers),0.051233,0.002376,0.001270,0.004447,-0.048116,-0.013463,0.002934,-0.027920
1948352,0786670592,mel bay the mike marshall collection,-0.006899,-0.019441,0.053731,-0.056021,-0.033487,-0.047119,-0.010390,-0.041309
1948353,1592989985,mortal sin on my soul,-0.057517,0.051523,-0.035692,0.000907,0.053827,-0.010300,-0.045477,-0.038969
1948360,1935012061,power praying,-0.030925,0.015265,-0.055316,-0.005189,-0.022569,-0.005096,-0.059616,0.017718


In [3]:
# process the data
import string
import re

def normalize_whitespace(str):
    """Trim `str` and squeeze every run of whitespace inside it to one space.

    The parameter name shadows the builtin `str` inside this function only;
    it is kept because it is part of the call signature.
    """
    trimmed = str.strip()
    return re.sub(r'\s+', ' ', trimmed)

# Build the punctuation-deleting table once; it is the same for every title,
# so rebuilding it inside the comprehension was wasted work on each row.
punctuation_table = str.maketrans('', '', string.punctuation)

# Whitespace is normalised first and punctuation removed second (original order
# kept). NOTE(review): a punctuation mark surrounded by spaces therefore leaves
# a double space behind; confirm whether that matters before re-encoding.
df['processed_titles'] = [
    normalize_whitespace(title).translate(punctuation_table)
    for title in df['title']
]
df['processed_titles'].head()

0    choosing the good christian ethics in a comple...
1    beginning biblical hebrew a grammar and illust...
2    five views on biblical inerrancy counterpoints...
3        good god the theistic foundations of morality
4    evangelical dictionary of theology baker refer...
Name: processed_titles, dtype: object

### Embed titles with a multilingual DistilBERT model

In [4]:
# Pretrained multilingual sentence-embedding model used to turn text into
# vectors in the cells below.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')

In [8]:
# Encoding all the book title strings takes about 8 min on a newish CPU, so the
# result was saved to disk and is loaded from there when the file is present.
# Rows must stay in the same order as `df`: lookups map a row position in this
# array back to `df` positionally.
embeddings_path = "/home/jojo/Downloads/title_matching_embeddings.npy"  # change the path to your download location
try:
    embeddings_distilbert = np.load(embeddings_path)
except FileNotFoundError:
    # No saved array at that path: fall back to re-encoding (slow) instead of
    # stopping the notebook on a machine-specific path.
    embeddings_distilbert = model.encode(df['processed_titles'].values)

### Embed candidate strings

In [83]:
# Free-text queries to match against the book titles.
query_strings = ["crime and punishment dostoevsky", "harry potter and the prisoner"]
# Each query is encoded on its own, wrapped in a one-element list
# (presumably yielding one single-row array per query).
query_embeddings = list(map(lambda text: model.encode([text]), query_strings))

### Find entries similar to each candidate

In [77]:
def return_k_similar(query_embeddings, reference_embeddings, k=1):
    """Return row positions of the `k` most similar references for each query.

    Parameters
    ----------
    query_embeddings : iterable of 2-D arrays
        One array per query. Only the first row of each array is scored.
    reference_embeddings : 2-D array
        Embeddings to search; returned values are row positions in this array.
    k : int or None, default 1
        Number of matches per query. `None` or `0` yields an empty result.

    Returns
    -------
    numpy.ndarray
        Flat 1-D integer array. The matches of the first query come first
        (best match first), then those of the second query, and so on.

    Raises
    ------
    ValueError
        If `k` is negative.
    """
    if k is not None and k < 0:
        raise ValueError(f"k must be a non-negative integer or None, got {k!r}")

    # Collect positions in a list and convert once at the end: seeding with an
    # empty float array and np.append-ing turned the indices into floats and
    # recopied the whole result on every iteration.
    top_positions = []
    if k:  # None and 0 both mean "no matches requested"
        for query in query_embeddings:
            similarity_score = cosine_similarity(query, reference_embeddings)[0]
            if k == 1:
                top_positions.append(np.argmax(similarity_score))
            else:
                # argsort is ascending: take the last k, then reverse so the
                # best match comes first.
                top_positions.extend(np.argsort(similarity_score)[-k:][::-1])
    return np.asarray(top_positions, dtype=np.intp)

In [98]:
# k is number of titles to extract
def title_to_asin(query_strings, reference_embeddings, model, k=1, reference_df=None) -> pd.DataFrame:
    """Look up the catalogue rows whose title embeddings best match each query.

    Parameters
    ----------
    query_strings : iterable of str
        Free-text titles to resolve.
    reference_embeddings : 2-D array
        Title embeddings; row `i` must correspond to row position `i` of the
        table being searched, because the lookup is positional.
    model : object with an `encode(list_of_str)` method
        Encoder used for the query strings.
    k : int, default 1
        Number of matches to return per query.
    reference_df : pandas.DataFrame, optional
        Table with `asin` and `processed_titles` columns. Defaults to the
        notebook-level `df`.

    Returns
    -------
    pandas.DataFrame
        The `asin` and `processed_titles` columns of the matched rows, in the
        order produced by `return_k_similar`.
    """
    if reference_df is None:
        reference_df = df  # fall back to the table loaded at notebook level

    # NOTE(review): queries are encoded as given, without the whitespace and
    # punctuation clean-up applied to the titles - confirm this is intended.
    query_embeddings = [model.encode([s]) for s in query_strings]
    similar_item_idx = return_k_similar(query_embeddings, reference_embeddings, k)

    # iloc is positional; make sure the positions are integers, not floats.
    positions = np.asarray(similar_item_idx).astype(np.intp)
    return reference_df.iloc[positions][['asin', 'processed_titles']]

In [100]:
# Resolve each query string to its single best-matching catalogue row.
title_to_asin(query_strings, embeddings_distilbert, model, k=1)

Unnamed: 0,asin,processed_titles
317533,1604596902,crime and punishment
1554500,8532512062,harry potter e o prisioneiro de azkaban


In [45]:
# One-off step: uncomment to persist freshly encoded title embeddings so that
# later runs can load them instead of re-encoding.
# np.save(file="title_matching_embeddings.npy", arr=embeddings_distilbert)

In [103]:
# Sanity check on the loaded array: (number of rows, embedding dimension).
embeddings_distilbert.shape

(979707, 512)