## Please run this code only if you want to regenerate the embedding file

### Refer to pytorch documentation to find the pytorch version that matches with your CUDA version

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraModel
import pandas as pd
from transformers import logging

In [2]:
data = pd.read_csv("books_data.csv", nrows=40000)
data['Title'] = data['Title'].fillna('Unknown')
data['categories'] = data['categories'].fillna('Unknown')
data['description'] = data['description'].fillna('')
data['description'] = data['description'].apply(lambda x: x.lower())
data['book_content'] = (
    (data['Title'] + ' ') * 2
    + data['description'] + ' '
    + data['authors'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '') + ' '
    + data['categories'].apply(lambda x: ' '.join(x) * 5 if isinstance(x, list) else '')
)
data['book_content'] = data['book_content'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

In [3]:
print(data['book_content'].sample(5))

17476    the rehabilitation of myth vicos new science t...
21562    the eighth night of creation the eighth night ...
19183    nicholas winton and the rescued generation sav...
37643    review of sports medicine  arthroscopy review ...
39353    shifra steins day trips from houston getaways ...
Name: book_content, dtype: object


In [4]:
logging.set_verbosity_error()
tokenizer = ElectraTokenizer.from_pretrained('google/electra-large-discriminator')
electra_model = ElectraModel.from_pretrained('google/electra-large-discriminator').cuda()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [5]:
def tokenize_texts(texts, max_len=128):
    return tokenizer(
        texts.tolist(),
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

tokenized_data = tokenize_texts(data['book_content'], max_len=128)

In [6]:
def generate_electra_embeddings_in_batches(tokenized_data, electra_model, batch_size=32, device='cuda'):
    electra_model = electra_model.to(device)
    all_embeddings = []
    total_samples = tokenized_data['input_ids'].shape[0]

    for start_idx in range(0, total_samples, batch_size):
        end_idx = min(start_idx + batch_size, total_samples)
        input_ids_batch = tokenized_data['input_ids'][start_idx:end_idx].to(device)
        attention_mask_batch = tokenized_data['attention_mask'][start_idx:end_idx].to(device)

        with torch.no_grad():
            batch_embeddings = electra_model(input_ids=input_ids_batch, attention_mask=attention_mask_batch).last_hidden_state[:, 0, :]
        
        all_embeddings.append(batch_embeddings.cpu())
        torch.cuda.empty_cache()

    return torch.cat(all_embeddings, dim=0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
electra_embeddings = generate_electra_embeddings_in_batches(tokenized_data, electra_model, batch_size=32, device=device)
print(electra_embeddings.shape)

torch.Size([40000, 1024])


In [7]:
class PairDataset(Dataset):
    def __init__(self, data, embeddings):
        self.data = data
        self.embeddings = embeddings

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        book1_emb = self.embeddings[idx]
        book2_emb = self.embeddings[(idx + 1) % len(self.data)]
        label = 1 if self.data['categories'].iloc[idx] == self.data['categories'].iloc[(idx + 1) % len(self.data)] else 0
        return book1_emb, book2_emb, label

In [8]:
class CustomBranch(nn.Module):
    def __init__(self):
        super(CustomBranch, self).__init__()
        self.fc1 = nn.Linear(1024, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(256, 128)
        self.relu3 = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        return x

In [9]:
class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        self.cnn_branch = CustomBranch()
        self.fc1 = nn.Linear(1024, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64 * 2, 2)

    def forward_once(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x

    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        concatenated = torch.cat((output1, output2), dim=1)
        output = self.fc3(concatenated)
        return output

# Dataset and DataLoader
pair_dataset = PairDataset(data, electra_embeddings)
pair_loader = DataLoader(pair_dataset, batch_size=64, shuffle=True)

# Train Siamese Network
siamese_model = SiameseNetwork().cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(siamese_model.parameters(), lr=1e-5, weight_decay=1e-2)

In [10]:
epochs = 100
for epoch in range(epochs):
    siamese_model.train()
    running_loss = 0.0
    for batch in pair_loader:
        book1_emb, book2_emb, labels = batch
        book1_emb, book2_emb, labels = book1_emb.cuda(), book2_emb.cuda(), labels.cuda()

        optimizer.zero_grad()
        outputs = siamese_model(book1_emb, book2_emb)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss / len(pair_loader):.4f}')

# Save the trained model
torch.save(siamese_model.state_dict(), 'siamese_model.pth')


Epoch [1/100], Loss: 0.3910
Epoch [2/100], Loss: 0.1696
Epoch [3/100], Loss: 0.1680
Epoch [4/100], Loss: 0.1675
Epoch [5/100], Loss: 0.1670
Epoch [6/100], Loss: 0.1666
Epoch [7/100], Loss: 0.1663
Epoch [8/100], Loss: 0.1659
Epoch [9/100], Loss: 0.1656
Epoch [10/100], Loss: 0.1653
Epoch [11/100], Loss: 0.1649
Epoch [12/100], Loss: 0.1645
Epoch [13/100], Loss: 0.1642
Epoch [14/100], Loss: 0.1639
Epoch [15/100], Loss: 0.1637
Epoch [16/100], Loss: 0.1633
Epoch [17/100], Loss: 0.1631
Epoch [18/100], Loss: 0.1629
Epoch [19/100], Loss: 0.1625
Epoch [20/100], Loss: 0.1623
Epoch [21/100], Loss: 0.1620
Epoch [22/100], Loss: 0.1617
Epoch [23/100], Loss: 0.1616
Epoch [24/100], Loss: 0.1613
Epoch [25/100], Loss: 0.1611
Epoch [26/100], Loss: 0.1609
Epoch [27/100], Loss: 0.1607
Epoch [28/100], Loss: 0.1605
Epoch [29/100], Loss: 0.1603
Epoch [30/100], Loss: 0.1601
Epoch [31/100], Loss: 0.1599
Epoch [32/100], Loss: 0.1598
Epoch [33/100], Loss: 0.1595
Epoch [34/100], Loss: 0.1595
Epoch [35/100], Loss: 0

In [11]:
def extract_embeddings_from_model(electra_embeddings, siamese_model):
    siamese_model.eval()
    with torch.no_grad():
        book_embeddings = siamese_model.cnn_branch(electra_embeddings.cuda())
    return book_embeddings.cpu().detach().numpy()

book_embeddings = extract_embeddings_from_model(electra_embeddings, siamese_model)
torch.save(book_embeddings, 'electra_embeddings.pt')

### Dumps cosine similarities matrix

In [12]:
# import os
# import pickle

# folder_path = r'dumped_matrices/chebyshev_distance'
# os.makedirs(folder_path, exist_ok=True)

# # Save the matrix in chunks
# chunk_size = 2048
# num_chunks = len(manhattan_dist_matrix) // chunk_size + 1

# for i in range(num_chunks):
#     chunk = manhattan_dist_matrix[i * chunk_size: (i + 1) * chunk_size]
#     file_path = os.path.join(folder_path, f'chebyshev_matrix_chunk_{i}.pkl')
#     with open(file_path, 'wb') as f:
#         pickle.dump(chunk, f)
#     clear_output(wait=True)
#     print(f'Saved {i} / {num_chunks} chunks to {file_path}')