In [None]:
# Only run this once to install the necessary packages
%pip install torch
%pip install pandas
%pip install sklearn
%pip install scipy

In [1]:
import torch
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.preprocessing import normalize
from IPython.display import clear_output

In [2]:
# Read the first 40,000 rows of the book metadata.
data = pd.read_csv("books_data.csv", nrows=40000)

# Replace missing values so the string concatenation below never meets NaN.
for column, placeholder in (('Title', 'Unknown'), ('categories', 'Unknown'), ('description', '')):
    data[column] = data[column].fillna(placeholder)
data['description'] = data['description'].apply(lambda text: text.lower())

# Element-wise string repetition: the title (with a trailing space) appears twice.
title_text = (data['Title'] + ' ') * 2

# NOTE(review): nothing in this cell converts `authors` / `categories` into Python
# lists. If they arrive from read_csv as plain strings, both lambdas below return ''
# for every row and neither column contributes to `book_content` -- confirm whether
# the values were meant to be parsed first.
authors_text = data['authors'].apply(
    lambda value: ' '.join(value) if isinstance(value, list) else ''
)
categories_text = data['categories'].apply(
    lambda value: ' '.join(value) * 5 if isinstance(value, list) else ''
)

# Concatenate the pieces, then strip everything that is neither a word character
# nor whitespace and lowercase the result.
data['book_content'] = (
    (title_text + data['description'] + ' ' + authors_text + ' ' + categories_text)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)

In [None]:
# Load the precomputed book embeddings (presumably DeBERTa outputs, going by the
# file name -- confirm against the notebook that produced the file).
# `map_location='cpu'` lets the file load on a machine without a GPU even when the
# tensor was saved from a CUDA device; the scikit-learn / SciPy steps that follow
# need the data in host memory anyway.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
book_embeddings = torch.load('deberta_embeddings.pt', map_location='cpu')
print(book_embeddings.shape)

In [4]:
# Scale every embedding (one per row) to unit L2 norm; `norm` and `axis` are spelled
# out here but are the same values `normalize` uses by default.
normalized_book_embeddings = normalize(book_embeddings, norm='l2', axis=1)

In [5]:
# Pairwise Manhattan ('cityblock') distance between every pair of normalized embeddings.
# The result is a dense square matrix with one row and one column per embedding, so
# memory use grows quadratically with the number of books.
manhattan_dist_matrix = cdist(
    normalized_book_embeddings,
    normalized_book_embeddings,
    metric='cityblock',
)

In [6]:
def recommend_books_by_manhattan(book_title, threshold, manhattan_dist_matrix, books=None):
    """Return the books whose Manhattan distance to `book_title` is at most `threshold`.

    Parameters
    ----------
    book_title : str
        Exact title to look up in the 'Title' column. If several rows share the
        title, the first one is used.
    threshold : float
        Maximum distance (inclusive) for a book to be returned.
    manhattan_dist_matrix : sequence of sequences of float
        Pairwise distance matrix; row/column `k` must correspond to the `k`-th row
        (by position) of `books`.
    books : pandas.DataFrame, optional
        Frame with a 'Title' column. Defaults to the notebook-level `data` frame.

    Returns
    -------
    list of (title, distance) tuples
        Sorted by ascending distance, with the distance formatted as a string with
        five decimals. The queried book itself is part of the result whenever its
        self-distance is within the threshold.

    Raises
    ------
    IndexError
        If no row has the given title.
    """
    if books is None:
        books = data  # fall back to the notebook-level DataFrame

    titles = books['Title']

    # Work with positions rather than index labels: the distance matrix and `.iloc`
    # are both positional, so this stays correct when the frame's index is not 0..n-1.
    match_positions = (titles == book_title).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"No book titled {book_title!r} was found")
    row_pos = match_positions[0]

    # Filter before sorting so only the candidates within the threshold get sorted.
    # The sort is stable, so ties keep their original (positional) order.
    within_threshold = [
        (pos, dist)
        for pos, dist in enumerate(manhattan_dist_matrix[row_pos])
        if dist <= threshold
    ]
    within_threshold.sort(key=lambda pair: pair[1])

    return [(titles.iloc[pos], "{:.5f}".format(dist)) for pos, dist in within_threshold]

In [None]:
import os
import pickle

folder_path = r'dumped_matrices/manhattan'
os.makedirs(folder_path, exist_ok=True)

# Save the matrix in chunks of `chunk_size` rows, one pickle file per chunk.
chunk_size = 2048
total_rows = len(manhattan_dist_matrix)
# Ceiling division: exactly as many chunks as needed, so no empty trailing file is
# written when the row count is a multiple of `chunk_size`.
num_chunks = (total_rows + chunk_size - 1) // chunk_size

for i in range(num_chunks):
    chunk = manhattan_dist_matrix[i * chunk_size: (i + 1) * chunk_size]
    file_path = os.path.join(folder_path, f'manhattan_matrix_chunk_{i}.pkl')
    with open(file_path, 'wb') as f:
        pickle.dump(chunk, f)
    # Replace the previous progress line instead of appending a new one.
    clear_output(wait=True)
    # `i` is zero-based; report the number of chunks written so far.
    print(f'Saved {i + 1} / {num_chunks} chunks to {file_path}')

In [None]:
# Interactive lookup: keep asking for a title until the user enters 'q'.
# Each successful lookup overwrites 'manhattan_output.txt' with the matches.
while True:
    clear_output(wait=True)
    book_title = input("Enter the title of a book: ")
    if book_title == 'q':
        # Quit before the lookup, so 'q' is never searched for as a title.
        break

    try:
        recommended_books = recommend_books_by_manhattan(book_title, 100.0, manhattan_dist_matrix)
    except IndexError:
        # The lookup raises IndexError when no row has this title.
        print('No book found with title: ' + repr(book_title))
        continue

    # `with` closes the file even if a write fails; UTF-8 keeps non-ASCII titles
    # writable regardless of the platform's default encoding.
    with open('manhattan_output.txt', 'w', encoding='utf-8') as out_file:
        out_file.write('Counts: ' + str(len(recommended_books)) + '\n\n')
        for book in recommended_books:
            # book is (title, formatted distance)
            out_file.write(book[1] + ' | ' + str(book[0]) + '\n')

    print('Found: ' + str(len(recommended_books)))