## Please run this code only if you want to regenerate the embedding file

### Refer to the PyTorch documentation to find the PyTorch build that matches your CUDA version

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
import pandas as pd

from transformers import logging

In [None]:
def _listlike_to_text(value, repeat=1):
    """Flatten a list-valued CSV cell into space-separated text.

    ``pd.read_csv`` never produces Python ``list`` objects, so a column that
    holds lists arrives as plain strings (presumably list literals such as
    ``"['A', 'B']"`` -- TODO confirm against the CSV). Such strings are parsed
    before joining; any other non-empty string is used as-is.

    Args:
        value: A list/tuple, a string (optionally a Python list literal), or
            a missing value such as NaN.
        repeat: Number of times the flattened text is repeated, separated by
            single spaces so neighbouring words never fuse together.

    Returns:
        The flattened text, or ``''`` when ``value`` is missing or empty.
    """
    import ast  # local import: only this helper needs it

    if isinstance(value, str):
        value = value.strip()
        if value.startswith('['):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                pass  # not a valid literal; fall back to the raw string
    if isinstance(value, (list, tuple)):
        value = ' '.join(str(item) for item in value)
    if not isinstance(value, str) or not value:
        return ''
    return ' '.join([value] * repeat)


data = pd.read_csv("books_data.csv", nrows=40000)

# Flatten authors/categories BEFORE the fillna calls below so that the
# 'Unknown' placeholder for missing categories does not leak into the text
# (missing values contribute an empty string, as before).
author_text = data['authors'].apply(_listlike_to_text)
category_text = data['categories'].apply(lambda cell: _listlike_to_text(cell, repeat=5))

data['Title'] = data['Title'].fillna('Unknown')
data['categories'] = data['categories'].fillna('Unknown')
data['description'] = data['description'].fillna('').str.lower()

# Title is repeated twice and categories five times to weight them more
# heavily than the description in the combined text.
data['book_content'] = (
    (data['Title'] + ' ') * 2
    + data['description'] + ' '
    + author_text + ' '
    + category_text
)
# Strip punctuation and normalise case across the whole combined text.
data['book_content'] = data['book_content'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

In [None]:
# Show five randomly chosen rows of the combined text as a sanity check.
print(data['book_content'].sample(n=5))

In [None]:
# Load the RoBERTa tokenizer and encoder ("roberta-base" checkpoint).
# Transformers logging is reduced to errors only before loading.
logging.set_verbosity_error()
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# The encoder is moved to the GPU right after loading.
roberta_model = RobertaModel.from_pretrained("roberta-base").cuda()

In [None]:
def tokenize_data(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` into fixed-length PyTorch tensors.

    Args:
        texts: The text input(s) handed to ``tokenizer`` unchanged.
        tokenizer: Callable tokenizer accepting the keyword options below.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for these options.
    """
    encode_options = {
        "max_length": max_length,
        "padding": "max_length",   # pad every sequence up to max_length
        "truncation": True,        # cut sequences longer than max_length
        "return_tensors": "pt",    # request PyTorch tensors
    }
    return tokenizer(texts, **encode_options)

# Tokenize every book title with the default max_length.
# NOTE(review): only the 'Title' column is tokenized here; the 'book_content'
# column built earlier is not used -- confirm this is intended.
tokenized_data = tokenize_data(texts=data['Title'].tolist(), tokenizer=tokenizer)

In [None]:
def generate_roberta_embeddings_in_batches(tokenized_data, roberta_model, batch_size=32):
    """Encode tokenized inputs batch by batch and return first-token embeddings.

    Batches are moved to whatever device ``roberta_model`` lives on (rather
    than a hard-coded ``'cuda'``), so the function works for a model on any
    GPU or on the CPU.

    Args:
        tokenized_data: Mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors whose first dimension indexes the samples.
        roberta_model: Module called with ``input_ids``/``attention_mask``
            keywords; element ``[0]`` of its output is indexed as
            ``[:, 0, :]``.
        batch_size: Number of samples encoded per forward pass.

    Returns:
        A CPU tensor with one row per sample, holding the position-0 vector
        of each sequence.

    Note:
        ``torch.cat`` needs at least one batch, so empty input is not
        supported.
    """
    input_ids = tokenized_data['input_ids']
    attention_mask = tokenized_data['attention_mask']
    total_samples = input_ids.shape[0]

    # Follow the model's device; a parameter-free model keeps the inputs where they are.
    first_param = next(roberta_model.parameters(), None)
    device = first_param.device if first_param is not None else input_ids.device

    all_embeddings = []
    with torch.no_grad():  # inference only: no autograd graph needed
        for start_idx in range(0, total_samples, batch_size):
            end_idx = start_idx + batch_size  # slicing clamps at the end of the data
            ids_batch = input_ids[start_idx:end_idx].to(device)
            mask_batch = attention_mask[start_idx:end_idx].to(device)

            hidden_states = roberta_model(input_ids=ids_batch, attention_mask=mask_batch)[0]
            # Keep only the first token's vector (the CLS-style summary token)
            # and move it to the CPU so results do not pile up on the device.
            all_embeddings.append(hidden_states[:, 0, :].cpu())

    return torch.cat(all_embeddings, dim=0)

# Encode all tokenized inputs and report the size of the resulting matrix.
roberta_embeddings = generate_roberta_embeddings_in_batches(
    tokenized_data=tokenized_data,
    roberta_model=roberta_model,
)
print(roberta_embeddings.size())

In [None]:
class PairDataset(Dataset):
    """Dataset of neighbouring-row pairs labelled by category equality.

    Item ``idx`` pairs the embedding at ``idx`` with the embedding at the
    following index (wrapping around to 0 after the last row). The label is
    1 when both rows have equal ``'categories'`` values and 0 otherwise.
    """

    def __init__(self, data, bert_embeddings):
        # data: table with a 'categories' column; bert_embeddings: indexable by row position.
        self.bert_embeddings = bert_embeddings
        self.data = data

    def __len__(self):
        """Return the number of rows in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embedding, next_embedding, label)`` for row ``idx``."""
        neighbour = (idx + 1) % len(self.data)  # wrap so the last row pairs with the first
        first = self.bert_embeddings[idx]
        second = self.bert_embeddings[neighbour]

        categories = self.data['categories']
        if categories.iloc[idx] == categories.iloc[neighbour]:
            label = 1
        else:
            label = 0
        return first, second, label

In [None]:
class CustomBranch(nn.Module):
    """Three-layer MLP that projects 768-dim inputs down to 128 dims.

    Layout: 768 -> 512 -> 256 -> 128, with a ReLU after every linear layer
    and dropout (p=0.3) after the first two activations.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: widen-to-narrow projection with regularisation.
        self.fc1 = nn.Linear(768, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)

        # Stage 2: same pattern at half the width.
        self.fc2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)

        # Stage 3: final 128-dim output, no dropout.
        self.fc3 = nn.Linear(256, 128)
        self.relu3 = nn.ReLU()

    def forward(self, x):
        """Apply the three stages in order and return the 128-dim result."""
        hidden = self.dropout1(self.relu1(self.fc1(x)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        return self.relu3(self.fc3(hidden))

In [None]:
class SiameseNetwork(nn.Module):
    """Siamese classifier over pairs of 768-dim embeddings.

    Both inputs go through the same shared tower (``cnn_branch`` followed by
    ``fc2``). The two 64-dim results are concatenated and mapped to two
    logits by ``fc3``.

    ``cnn_branch`` is part of the forward pass so that it receives gradients
    during training; a sub-module that is never called in ``forward`` keeps
    its random initial weights, which makes any embeddings read from it
    meaningless.
    """

    def __init__(self):
        super().__init__()
        self.cnn_branch = CustomBranch()
        # Legacy 768 -> 128 projection. It is no longer used in the forward
        # pass (cnn_branch replaces it) but is kept so that state dicts
        # saved with this attribute still load.
        self.fc1 = nn.Linear(768, 128)
        self.relu = nn.ReLU()

        self.fc2 = nn.Linear(128, 64)

        # Takes the two 64-dim tower outputs side by side.
        self.fc3 = nn.Linear(64 * 2, 2)

    def forward_once(self, x):
        """Run one input through the shared tower and return its 64-dim code."""
        x = self.cnn_branch(x)  # 768 -> 128 (output of CustomBranch)
        x = self.relu(self.fc2(x))
        return x

    def forward(self, input1, input2):
        """Return two-class logits for the pair ``(input1, input2)``."""
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)

        concatenated = torch.cat((output1, output2), dim=1)

        return self.fc3(concatenated)

# Pair dataset over the book table and its embeddings, served in shuffled
# mini-batches of 64.
pair_dataset = PairDataset(data=data, bert_embeddings=roberta_embeddings)
pair_loader = DataLoader(dataset=pair_dataset, batch_size=64, shuffle=True)

# Model on the GPU, two-class cross-entropy objective, AdamW optimiser.
siamese_model = SiameseNetwork().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    params=siamese_model.parameters(),
    lr=1e-5,
    weight_decay=1e-2,
)

In [None]:
epochs = 100
for epoch in range(epochs):
    siamese_model.train()  # make sure training-mode layers are active each epoch
    batch_losses = []
    for book1_emb, book2_emb, labels in pair_loader:
        # Move the whole batch to the GPU before the forward pass.
        book1_emb = book1_emb.cuda()
        book2_emb = book2_emb.cuda()
        labels = labels.cuda()

        optimizer.zero_grad()
        loss = criterion(siamese_model(book1_emb, book2_emb), labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Mean loss per batch for this epoch.
    running_loss = sum(batch_losses, 0.0)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss / len(pair_loader):.4f}')

# Persist the trained weights.
torch.save(siamese_model.state_dict(), 'siamese_model.pth')


In [None]:
# Extract book embeddings from the CNN branch of the Siamese model
def extract_embeddings_from_model(roberta_embeddings, siamese_model):
    """Project embeddings through ``siamese_model.cnn_branch`` and return NumPy.

    The model is switched to evaluation mode (and left in it). The input is
    moved to the device the model lives on rather than a hard-coded CUDA
    device, so this works for a model on any GPU or on the CPU.

    NOTE(review): the result is only meaningful if ``cnn_branch`` took part
    in training -- confirm that the model's forward pass routes through it.

    Args:
        roberta_embeddings: Tensor accepted by ``siamese_model.cnn_branch``.
        siamese_model: Module exposing a ``cnn_branch`` sub-module.

    Returns:
        The branch output as a NumPy array on the CPU.
    """
    siamese_model.eval()  # disable training-only behaviour such as dropout

    # Follow the model's device; a parameter-free model keeps the input where it is.
    first_param = next(siamese_model.parameters(), None)
    device = first_param.device if first_param is not None else roberta_embeddings.device

    with torch.no_grad():  # inference only, so the output carries no graph
        book_embeddings = siamese_model.cnn_branch(roberta_embeddings.to(device))
    return book_embeddings.cpu().numpy()

# Push the RoBERTa embeddings through the trained model's branch to obtain
# the final per-book embeddings (returned as a NumPy array).
book_embeddings = extract_embeddings_from_model(
    roberta_embeddings=roberta_embeddings,
    siamese_model=siamese_model,
)

# Write the array to disk with torch.save so later runs can reuse it.
torch.save(book_embeddings, 'roberta_embeddings.pt')

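### Dump the cosine-similarity matrix (note: the commented-out cell below writes `chebyshev_*` chunk files from `manhattan_dist_matrix`)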

In [None]:
# import os
# import pickle

# folder_path = r'dumped_matrices/chebyshev_distance'
# os.makedirs(folder_path, exist_ok=True)

# # Save the matrix in chunks
# chunk_size = 2048
# num_chunks = len(manhattan_dist_matrix) // chunk_size + 1

# for i in range(num_chunks):
#     chunk = manhattan_dist_matrix[i * chunk_size: (i + 1) * chunk_size]
#     file_path = os.path.join(folder_path, f'chebyshev_matrix_chunk_{i}.pkl')
#     with open(file_path, 'wb') as f:
#         pickle.dump(chunk, f)
#     clear_output(wait=True)
#     print(f'Saved {i} / {num_chunks} chunks to {file_path}')