In [1]:
import ast

import pandas as pd
import torch
from IPython.display import clear_output
from scipy.spatial.distance import cdist
from sklearn.preprocessing import normalize

In [2]:
def _list_cell_to_text(cell, repeat=1):
    """Turn a list-valued CSV cell into space-separated text.

    ``pd.read_csv`` returns such cells as strings (a list written to CSV
    comes back as its textual form), never as Python lists, so a string
    cell is first parsed as a Python literal.

    Args:
        cell: Raw cell value: a string, an actual list/tuple, or a missing
            value such as NaN.
        repeat: How many times the joined text is repeated (separated by
            single spaces) to give the field more weight.

    Returns:
        The items joined by spaces and repeated ``repeat`` times, or ``''``
        when the cell is missing or is not a list/tuple literal (this
        includes the 'Unknown' placeholder used for missing categories).
    """
    if isinstance(cell, str):
        try:
            cell = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            # Plain text that is not a Python literal: contributes nothing,
            # matching the original "not a list -> ''" fallback.
            return ''
    if not isinstance(cell, (list, tuple)):
        return ''
    joined = ' '.join(str(item) for item in cell)
    return ' '.join([joined] * repeat)


data = pd.read_csv("books_data.csv", nrows=40000)

# Fill gaps so the string concatenation below never propagates NaN.
data['Title'] = data['Title'].fillna('Unknown')
data['categories'] = data['categories'].fillna('Unknown')
data['description'] = data['description'].fillna('').str.lower()

# Weighted text per book: title twice, description once, authors once,
# categories five times.
# NOTE(review): the previous isinstance(x, list) checks could never be true
# for CSV-loaded columns, so authors and categories were always dropped.
# If bert_embeddings.pt was generated from the old 'book_content', it may
# need regenerating to reflect this text - TODO confirm.
authors_text = data['authors'].apply(_list_cell_to_text)
categories_text = data['categories'].apply(lambda cell: _list_cell_to_text(cell, repeat=5))
data['book_content'] = (
    (data['Title'] + ' ') * 2
    + data['description'] + ' '
    + authors_text + ' '
    + categories_text
)

# Strip punctuation (anything that is neither a word character nor
# whitespace) and lowercase the combined text.
data['book_content'] = data['book_content'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

In [3]:
# Load the precomputed embeddings; row order is presumably the same as the
# rows of `data` - TODO confirm against whatever produced the file.
#
# weights_only is passed explicitly instead of relying on the default:
# the recorded output prints the shape as a plain tuple, which suggests the
# file holds a NumPy array rather than a torch.Tensor, and
# weights_only=True (the default from PyTorch 2.6 on) rejects such payloads
# unless they are allowlisted.
#
# SECURITY: weights_only=False unpickles arbitrary objects and can execute
# code embedded in the file - only load a bert_embeddings.pt you created or
# otherwise trust.
book_embeddings = torch.load('bert_embeddings.pt', weights_only=False)
print(book_embeddings.shape)

(40000, 128)


  book_embeddings = torch.load('bert_embeddings.pt')


In [4]:
# Rescale each embedding row to unit L2 norm. norm='l2' and axis=1 are
# sklearn's defaults for normalize(), spelled out here for the reader.
normalized_book_embeddings = normalize(book_embeddings, norm='l2', axis=1)

In [None]:
# All-pairs Chebyshev distance (largest absolute per-coordinate difference)
# between the normalized embeddings. The result is a dense square matrix
# with one row and one column per book, so memory grows quadratically with
# the number of books (about 12.8 GB as float64 for 40000 rows).
chebyshev_dist_matrix = cdist(
    normalized_book_embeddings,
    normalized_book_embeddings,
    metric='chebyshev',
)

In [8]:
def recommend_books_by_chebyshev(book_title, threshold, chebyshev_dist_matrix):
    """Return the books within ``threshold`` Chebyshev distance of ``book_title``.

    The lookup uses the module-level ``data`` frame: the first row whose
    'Title' equals ``book_title`` exactly is the query book.

    Args:
        book_title: Exact title to search for in ``data['Title']``.
        threshold: Largest distance (inclusive) a book may have to be kept.
        chebyshev_dist_matrix: Pairwise distances indexed by row position;
            ``chebyshev_dist_matrix[i]`` must iterate over the distances
            from row ``i`` of ``data`` to every row, in row order.

    Returns:
        A list of ``(title, distance)`` tuples sorted by ascending distance,
        where ``distance`` is a string formatted to five decimals. The query
        book itself is included when its self-distance passes the
        threshold. An empty list is returned when no row has the given
        title.
    """
    # Use row positions, not index labels: the matrix rows and .iloc are
    # positional, so a label only works by accident with a default index.
    positions = (data['Title'] == book_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        return []
    query_pos = int(positions[0])

    # Filter first, then sort only the survivors. sorted() is stable, so
    # ties keep their row order, and NaN distances (which never satisfy
    # `<= threshold`) cannot disturb the ordering.
    within_threshold = [
        (pos, score)
        for pos, score in enumerate(chebyshev_dist_matrix[query_pos])
        if score <= threshold
    ]
    within_threshold.sort(key=lambda pair: pair[1])

    titles = data['Title']
    return [(titles.iloc[pos], "{:.5f}".format(score)) for pos, score in within_threshold]

In [None]:
# Interactive lookup: keep prompting for titles until the user enters 'q'.
# Each successful lookup overwrites chebyshev_output.txt with the matches.
while True:
    clear_output(wait=True)
    book_title = input("Enter the title of a book: ")
    if book_title == 'q':
        # Quit before searching, otherwise 'q' is looked up as a title.
        break

    try:
        # NOTE(review): the recorded run reports "Found: 40000" at this
        # threshold, i.e. 1.0 appears to keep every book - consider a
        # tighter value.
        recommended_books = recommend_books_by_chebyshev(book_title, 1.0, chebyshev_dist_matrix)
    except IndexError:
        # The lookup signals a missing title with IndexError; report it
        # and ask again instead of ending the session with a traceback.
        print('No book found with title: ' + repr(book_title))
        continue

    # `with` closes (and flushes) the file on every iteration; utf-8 keeps
    # titles with non-ASCII characters writable on any platform.
    with open('chebyshev_output.txt', 'w', encoding='utf-8') as f:
        f.write('Counts: ' + str(len(recommended_books)) + '\n\n')
        for book in recommended_books:
            f.write(book[1] + ' | ' + str(book[0]) + '\n')

    print('Found: ' + str(len(recommended_books)))

Found: 40000
