## Please run this code only if you want to regenerate the embedding file

### Refer to pytorch documentation to find the pytorch version that matches with your CUDA version

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaTokenizer, DebertaModel
import pandas as pd

from transformers import logging

In [2]:
data = pd.read_csv("books_data.csv", nrows=40000)
data['Title'] = data['Title'].fillna('Unknown')
data['categories'] = data['categories'].fillna('Unknown')
data['description'] = data['description'].fillna('')
data['description'] = data['description'].apply(lambda x: x.lower())
data['book_content'] = (
    (data['Title'] + ' ') * 2
    + data['description'] + ' '
    + data['authors'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '') + ' '
    + data['categories'].apply(lambda x: ' '.join(x) * 5 if isinstance(x, list) else '')
)
data['book_content'] = data['book_content'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

In [3]:
print(data['book_content'].sample(5))

12454    sniper one iron brigade series sniper one iron...
4837     stranger from arizona stranger from arizona wh...
21165    don juan tenorio mr juan tenorio spanish editi...
24083    surviving sisters surviving sisters why do bro...
37168    familiar christmas fear familiar book 11 harle...
Name: book_content, dtype: object


In [4]:
logging.set_verbosity_error()
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
deberta_model = DebertaModel.from_pretrained('microsoft/deberta-base').cuda()

In [5]:
def tokenize_texts(texts, max_len=128):
    return tokenizer(
        texts.tolist(),
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Example usage
tokenized_data = tokenize_texts(data['book_content'], max_len=128)

In [6]:
def generate_deberta_embeddings_in_batches(tokenized_data, deberta_model, batch_size=32, device='cuda'):
    # Move the model to the specified device (GPU or CPU)
    deberta_model = deberta_model.to(device)
    
    # Initialize an empty list to store the embeddings
    all_embeddings = []
    
    # Calculate total batches
    total_samples = tokenized_data['input_ids'].shape[0]
    
    for start_idx in range(0, total_samples, batch_size):
        end_idx = min(start_idx + batch_size, total_samples)
        
        # Slice batch input_ids and attention_mask
        input_ids_batch = tokenized_data['input_ids'][start_idx:end_idx].to(device)  # Move to the same device
        attention_mask_batch = tokenized_data['attention_mask'][start_idx:end_idx].to(device)  # Move to the same device

        # Get DeBERTa embeddings without computing gradients
        with torch.no_grad():
            batch_embeddings = deberta_model(input_ids=input_ids_batch, attention_mask=attention_mask_batch).last_hidden_state.mean(dim=1)
        
        # Move embeddings back to CPU to save GPU memory
        all_embeddings.append(batch_embeddings.cpu())
        
        # Optionally clear cache to free memory
        torch.cuda.empty_cache()

    # Concatenate all batch embeddings into a single tensor
    return torch.cat(all_embeddings, dim=0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
deberta_embeddings = generate_deberta_embeddings_in_batches(tokenized_data, deberta_model, batch_size=32, device=device)
print(deberta_embeddings.shape)

torch.Size([40000, 768])


In [7]:
class PairDataset(Dataset):
    def __init__(self, data, deberta_embeddings):
        self.data = data
        self.deberta_embeddings = deberta_embeddings

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        book1_emb = self.deberta_embeddings[idx]
        book2_emb = self.deberta_embeddings[(idx + 1) % len(self.data)]  # Pair with next item
        label = 1 if self.data['categories'].iloc[idx] == self.data['categories'].iloc[(idx + 1) % len(self.data)] else 0
        return book1_emb, book2_emb, label

In [8]:
class CustomBranch(nn.Module):
    def __init__(self):
        super(CustomBranch, self).__init__()
        self.fc1 = nn.Linear(768, 512)  # First dense layer, increased number of units
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)  # Dropout to prevent overfitting
        
        self.fc2 = nn.Linear(512, 256)  # Second dense layer
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)  # Dropout again

        self.fc3 = nn.Linear(256, 128)  # Third dense layer (matches original)
        self.relu3 = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        x = self.relu3(x)
        
        return x

In [9]:
class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        self.cnn_branch = CustomBranch()
        self.fc1 = nn.Linear(768, 128)  # Assuming DeBERTa output size is 1024
        self.fc2 = nn.Linear(128, 64)   # Reducing to 64 dimensions
        self.fc3 = nn.Linear(64 * 2, 2)  # Concatenating two 64-dim vectors, and output size 2 for binary classification

    def forward_once(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x

    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        concatenated = torch.cat((output1, output2), dim=1)
        output = self.fc3(concatenated)
        return output

# Dataset and DataLoader
pair_dataset = PairDataset(data, deberta_embeddings)
pair_loader = DataLoader(pair_dataset, batch_size=64, shuffle=True)

# Initialize the Siamese model, loss function, and optimizer
siamese_model = SiameseNetwork().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(siamese_model.parameters(), lr=1e-5, weight_decay=1e-2)

In [10]:
epochs = 100
for epoch in range(epochs):
    siamese_model.train()
    running_loss = 0.0
    for batch in pair_loader:
        book1_emb, book2_emb, labels = batch
        book1_emb, book2_emb, labels = book1_emb.cuda(), book2_emb.cuda(), labels.cuda()

        optimizer.zero_grad()
        outputs = siamese_model(book1_emb, book2_emb)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss / len(pair_loader):.4f}')

# Save the trained model
torch.save(siamese_model.state_dict(), 'siamese_model.pth')

Epoch [1/100], Loss: 0.3635
Epoch [2/100], Loss: 0.1696
Epoch [3/100], Loss: 0.1667
Epoch [4/100], Loss: 0.1657
Epoch [5/100], Loss: 0.1648
Epoch [6/100], Loss: 0.1639
Epoch [7/100], Loss: 0.1631
Epoch [8/100], Loss: 0.1623
Epoch [9/100], Loss: 0.1616
Epoch [10/100], Loss: 0.1610
Epoch [11/100], Loss: 0.1605
Epoch [12/100], Loss: 0.1601
Epoch [13/100], Loss: 0.1597
Epoch [14/100], Loss: 0.1594
Epoch [15/100], Loss: 0.1592
Epoch [16/100], Loss: 0.1589
Epoch [17/100], Loss: 0.1588
Epoch [18/100], Loss: 0.1586
Epoch [19/100], Loss: 0.1584
Epoch [20/100], Loss: 0.1582
Epoch [21/100], Loss: 0.1582
Epoch [22/100], Loss: 0.1581
Epoch [23/100], Loss: 0.1580
Epoch [24/100], Loss: 0.1578
Epoch [25/100], Loss: 0.1577
Epoch [26/100], Loss: 0.1576
Epoch [27/100], Loss: 0.1575
Epoch [28/100], Loss: 0.1574
Epoch [29/100], Loss: 0.1572
Epoch [30/100], Loss: 0.1571
Epoch [31/100], Loss: 0.1570
Epoch [32/100], Loss: 0.1569
Epoch [33/100], Loss: 0.1568
Epoch [34/100], Loss: 0.1566
Epoch [35/100], Loss: 0

In [11]:
def extract_embeddings_from_model(deberta_embeddings, siamese_model):
    siamese_model.eval()  # Set model to evaluation mode
    with torch.no_grad():  # No gradient computation
        book_embeddings = siamese_model.cnn_branch(deberta_embeddings.cuda())  # Pass through CNN branch
    return book_embeddings.cpu().detach().numpy()  # Move to CPU and detach from computation graph

# Use the function to get the processed embeddings
book_embeddings = extract_embeddings_from_model(deberta_embeddings, siamese_model)

# Save the embeddings for future use
torch.save(book_embeddings, 'deberta_embeddings.pt')

### Dumps cosine similarities matrix

In [12]:
# import os
# import pickle

# folder_path = r'dumped_matrices/chebyshev_distance'
# os.makedirs(folder_path, exist_ok=True)

# # Save the matrix in chunks
# chunk_size = 2048
# num_chunks = len(manhattan_dist_matrix) // chunk_size + 1

# for i in range(num_chunks):
#     chunk = manhattan_dist_matrix[i * chunk_size: (i + 1) * chunk_size]
#     file_path = os.path.join(folder_path, f'chebyshev_matrix_chunk_{i}.pkl')
#     with open(file_path, 'wb') as f:
#         pickle.dump(chunk, f)
#     clear_output(wait=True)
#     print(f'Saved {i} / {num_chunks} chunks to {file_path}')