In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load the pretrained tokenizer and encoder from one shared checkpoint name,
# so the two can never drift apart.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Takes in a collection of texts and returns one embedding vector per text.
def bert_embed(texts, batch_size=16, max_length=256):
    """Generate BERT embeddings for a list of texts using mean pooling.

    Uses the module-level ``tokenizer`` and ``model``. Each text is tokenized
    (padded to the longest item in its batch, truncated to ``max_length``
    tokens), passed through the model, and the last hidden states of the
    non-padding tokens are averaged.

    Args:
        texts: Iterable of strings. A single ``str`` is treated as one text
            rather than being sliced into character chunks.
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Maximum number of tokens kept per text.

    Returns:
        ``numpy.ndarray`` with one row per input text and one column per
        hidden dimension of the model. Empty input yields an array with
        zero rows.
    """
    # Slicing a bare string would cut it into batch_size-character pieces.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so generators / Series / arrays slice into plain lists,
    # which is what the tokenizer expects for a batch.
    texts = list(texts)

    # np.vstack([]) raises, so handle "nothing to embed" explicitly.
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    model.eval()
    # Run wherever the model currently lives (CPU, or GPU if the caller moved
    # it); tokenizer output is always created on CPU.
    device = next(model.parameters()).device
    embeddings = []

    with torch.no_grad():
        # One iteration per batch of up to `batch_size` texts.
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            inputs = inputs.to(device)
            outputs = model(**inputs)
            last_hidden_state = outputs.last_hidden_state

            # Expand the attention mask over the hidden dimension so padding
            # positions contribute nothing to the sum.
            mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_state.size()).float()

            # Mean pooling: sum of real-token vectors / number of real tokens.
            # The clamp guards against division by zero for an all-padding row.
            token_sum = torch.sum(last_hidden_state * mask, 1)
            token_count = torch.clamp(mask.sum(1), min=1e-9)
            pooled_embedding = token_sum / token_count
            embeddings.append(pooled_embedding.cpu().numpy())

    return np.vstack(embeddings)


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive; the data
# paths used later in this notebook all live under this mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Combine the per-decade CSV files (1900, 1910, ..., 2020) into one frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration,
# and concatenating onto an empty seed DataFrame is deprecated in recent pandas.
decade_frames = []
for decade in range(1900, 2021, 10):
    csv_path = f"/content/drive/My Drive/new_book_data/books_data_{decade}.csv"
    csv_data = pd.read_csv(csv_path)
    decade_frames.append(csv_data)
all_data = pd.concat(decade_frames, ignore_index=True)

# Drop rows with no description. `.copy()` gives an independent frame so the
# column assignment below does not write into a slice (SettingWithCopyWarning).
all_data = all_data.dropna(subset=['description']).copy()

# Remove text after "----------\nAlso contained in:" if it's in the description
all_data['description'] = all_data['description'].map(
    lambda text: text.split("----------\nAlso contained in:")[0]
)

# Keep only descriptions longer than 10 characters. This runs AFTER the suffix
# is stripped so the length check applies to the text that is actually kept;
# otherwise a description made up mostly of the "Also contained in:" suffix
# would pass the filter and then be cut down to an (almost) empty string.
all_data = all_data[all_data['description'].str.len() > 10].copy()

In [None]:
# Pull the title and description columns out as plain Python lists.
titles = all_data['title'].tolist()
descriptions = all_data['description'].tolist()

# Compute the embeddings by passing the description text of each book
# through bert_embed.
embeddings = bert_embed(descriptions)

In [None]:
import numpy as np

# Destinations on the mounted Drive for the two saved artifacts.
embeddings_output_path = "/content/drive/MyDrive/new_book_data/book_description_bert_embeddings.npy"
all_data_csv_path = '/content/drive/My Drive/new_book_data/all_book_data.csv'

# Persist the raw embeddings matrix in NumPy's binary .npy format.
np.save(embeddings_output_path, embeddings)
print(f"Saved embeddings array to: {embeddings_output_path}")

# Attach each row's embedding (as a Python list) to its record, then write
# the combined table to CSV without the index column.
embedding_rows = embeddings.tolist()
all_data['embeddings'] = embedding_rows
all_data.to_csv(all_data_csv_path, index=False)

Saved embeddings array to: /content/drive/MyDrive/new_book_data/book_description_bert_embeddings.npy
