## Please run this code only if you want to regenerate the embedding file

### Refer to pytorch documentation to find the pytorch version that matches with your CUDA version

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd

from transformers import logging

In [None]:
data = pd.read_csv("books_data.csv", nrows=40000)
data['Title'] = data['Title'].fillna('Unknown')
data['categories'] = data['categories'].fillna('Unknown')
data['description'] = data['description'].fillna('')
data['description'] = data['description'].apply(lambda x: x.lower())
data['book_content'] = (
    (data['Title'] + ' ') * 2
    + data['description'] + ' '
    + data['authors'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '') + ' '
    + data['categories'].apply(lambda x: ' '.join(x) * 5 if isinstance(x, list) else '')
)
data['book_content'] = data['book_content'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

In [3]:
print(data['book_content'].sample(5))

16801    the complete home guide to herbs natural heali...
23013    incredible voyage incredible voyage a welshman...
172      the mask of priam the mask of priam a collecti...
21648    passing on passing on passingon occurs when ha...
26235    saint of auschwitz story of maksymilian kolbe ...
Name: book_content, dtype: object


In [4]:
# Load the BERT tokenizer and model
logging.set_verbosity_error()
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_model = BertModel.from_pretrained('bert-large-uncased').cuda()

In [5]:
def tokenize_texts(texts, max_len=128):
    return tokenizer(
        texts.tolist(),
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

In [6]:
tokenized_data = tokenize_texts(data['book_content'], max_len=128)

In [7]:
def generate_bert_embeddings_in_batches(tokenized_data, bert_model, batch_size=32, device='cuda'):
    # Move the model to the specified device (GPU or CPU)
    bert_model = bert_model.to(device)
    
    # Initialize an empty list to store the embeddings
    all_embeddings = []
    
    # Calculate total batches
    total_samples = tokenized_data['input_ids'].shape[0]
    
    for start_idx in range(0, total_samples, batch_size):
        end_idx = min(start_idx + batch_size, total_samples)
        
        # Slice batch input_ids and attention_mask
        input_ids_batch = tokenized_data['input_ids'][start_idx:end_idx].to(device)  # Move to the same device
        attention_mask_batch = tokenized_data['attention_mask'][start_idx:end_idx].to(device)  # Move to the same device

        # Get BERT embeddings without computing gradients
        with torch.no_grad():
            batch_embeddings = bert_model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)[1]
        
        # Move embeddings back to CPU to save GPU memory
        all_embeddings.append(batch_embeddings.cpu())
        
        # Optionally clear cache to free memory
        torch.cuda.empty_cache()

    # Concatenate all batch embeddings into a single tensor
    return torch.cat(all_embeddings, dim=0)

# Use the function to generate embeddings in batches
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_embeddings = generate_bert_embeddings_in_batches(tokenized_data, bert_model, batch_size=32, device=device)


In [8]:
class PairDataset(Dataset):
    def __init__(self, data, bert_embeddings):
        self.data = data
        self.bert_embeddings = bert_embeddings

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        book1_emb = self.bert_embeddings[idx]
        book2_emb = self.bert_embeddings[(idx + 1) % len(self.data)]  # Pair with next item
        label = 1 if self.data['categories'].iloc[idx] == self.data['categories'].iloc[(idx + 1) % len(self.data)] else 0
        return book1_emb, book2_emb, label

In [9]:
class CustomBranch(nn.Module):
    def __init__(self):
        super(CustomBranch, self).__init__()
        self.fc1 = nn.Linear(1024, 512)  # First dense layer, increased number of units
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)  # Dropout to prevent overfitting
        
        self.fc2 = nn.Linear(512, 256)  # Second dense layer
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)  # Dropout again

        self.fc3 = nn.Linear(256, 128)  # Third dense layer (matches original)
        self.relu3 = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        x = self.relu3(x)
        
        return x

In [10]:
class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        self.cnn_branch = CustomBranch()
        self.fc1 = nn.Linear(1024, 128)  # Assuming BERT output size is 768
        self.fc2 = nn.Linear(128, 64)   # Reducing to 64 dimensions
        self.fc3 = nn.Linear(64 * 2, 2)  # Concatenating two 64-dim vectors, and output size 2 for binary classification

    def forward_once(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x

    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        concatenated = torch.cat((output1, output2), dim=1)
        output = self.fc3(concatenated)
        return output

# Dataset and DataLoader
pair_dataset = PairDataset(data, bert_embeddings)
pair_loader = DataLoader(pair_dataset, batch_size=64, shuffle=True)

# Initialize the Siamese model, loss function, and optimizer
siamese_model = SiameseNetwork().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(siamese_model.parameters(), lr=1e-5, weight_decay=1e-2)

In [11]:
epochs = 100
for epoch in range(epochs):
    siamese_model.train()
    running_loss = 0.0
    for batch in pair_loader:
        book1_emb, book2_emb, labels = batch
        book1_emb, book2_emb, labels = book1_emb.cuda(), book2_emb.cuda(), labels.cuda()

        optimizer.zero_grad()
        outputs = siamese_model(book1_emb, book2_emb)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss / len(pair_loader):.4f}')

# Save the trained model
torch.save(siamese_model.state_dict(), 'siamese_model.pth')


Epoch [1/100], Loss: 0.2270
Epoch [2/100], Loss: 0.1659
Epoch [3/100], Loss: 0.1654
Epoch [4/100], Loss: 0.1651
Epoch [5/100], Loss: 0.1649
Epoch [6/100], Loss: 0.1648
Epoch [7/100], Loss: 0.1647
Epoch [8/100], Loss: 0.1646
Epoch [9/100], Loss: 0.1644
Epoch [10/100], Loss: 0.1647
Epoch [11/100], Loss: 0.1645
Epoch [12/100], Loss: 0.1643
Epoch [13/100], Loss: 0.1642
Epoch [14/100], Loss: 0.1642
Epoch [15/100], Loss: 0.1641
Epoch [16/100], Loss: 0.1641
Epoch [17/100], Loss: 0.1640
Epoch [18/100], Loss: 0.1641
Epoch [19/100], Loss: 0.1638
Epoch [20/100], Loss: 0.1639
Epoch [21/100], Loss: 0.1639
Epoch [22/100], Loss: 0.1637
Epoch [23/100], Loss: 0.1636
Epoch [24/100], Loss: 0.1636
Epoch [25/100], Loss: 0.1636
Epoch [26/100], Loss: 0.1636
Epoch [27/100], Loss: 0.1634
Epoch [28/100], Loss: 0.1634
Epoch [29/100], Loss: 0.1634
Epoch [30/100], Loss: 0.1633
Epoch [31/100], Loss: 0.1633
Epoch [32/100], Loss: 0.1632
Epoch [33/100], Loss: 0.1631
Epoch [34/100], Loss: 0.1631
Epoch [35/100], Loss: 0

In [12]:
# Extract book embeddings from the CNN branch of the Siamese model
def extract_embeddings_from_model(bert_embeddings, siamese_model):
    siamese_model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # No gradient computation
        book_embeddings = siamese_model.cnn_branch(bert_embeddings.cuda())  # Pass through CNN branch
    return book_embeddings.cpu().detach().numpy()  # Move to CPU and detach from computation graph

# Use the function to get the processed embeddings
book_embeddings = extract_embeddings_from_model(bert_embeddings, siamese_model)

# Save the embeddings for future use
torch.save(book_embeddings, 'bert_embeddings.pt')

### Dumps cosine similarities matrix

In [13]:
# import os
# import pickle

# folder_path = r'dumped_matrices/chebyshev_distance'
# os.makedirs(folder_path, exist_ok=True)

# # Save the matrix in chunks
# chunk_size = 2048
# num_chunks = len(manhattan_dist_matrix) // chunk_size + 1

# for i in range(num_chunks):
#     chunk = manhattan_dist_matrix[i * chunk_size: (i + 1) * chunk_size]
#     file_path = os.path.join(folder_path, f'chebyshev_matrix_chunk_{i}.pkl')
#     with open(file_path, 'wb') as f:
#         pickle.dump(chunk, f)
#     clear_output(wait=True)
#     print(f'Saved {i} / {num_chunks} chunks to {file_path}')