# BERT Embedding Extraction for Kickstarter Blurbs
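This notebook loads the preprocessed campaign blurbs (stored as padded token-index sequences), converts them back to text, and extracts contextual embeddings from the pre-trained `bert-base-uncased` model by concatenating its last four hidden layers.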

In [None]:
import pickle
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# Load preprocessed outputs
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced by a trusted preprocessing run.
with open("preprocessing_outputs.pkl", "rb") as f:
    data = pickle.load(f)

# Unpack the pieces used by the rest of the notebook once the file is closed.
tokenizer_local = data['tokenizer']
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

In [None]:
# Initialize BERT
# The tokenizer and the encoder are both loaded from the same stock checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# eval() returns the module itself, so the model is switched to inference
# mode in the same statement that loads it.
bert_model = BertModel.from_pretrained('bert-base-uncased').eval()

In [None]:
# Utility function to extract embeddings
def extract_embeddings(texts, max_len=64, batch_size=1):
    """Extract per-token BERT features for every text in ``texts``.

    Each text is tokenized with ``bert_tokenizer``, padded or truncated to
    exactly ``max_len`` tokens, and passed through ``bert_model``. The last
    four entries of the model's hidden states are concatenated along the
    feature axis.

    Args:
        texts: Iterable of strings to embed.
        max_len: Token length every text is padded or truncated to.
        batch_size: Number of texts sent through the model per forward pass.
            The default of 1 keeps the original one-text-at-a-time behaviour;
            larger values are usually faster.

    Returns:
        A list with one NumPy array per input text, each of shape
        ``(max_len, 4 * hidden_size)``. Padding positions are included.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize once so generators can be sliced into batches and counted.
    texts = list(texts)
    # Follow the model to whatever device it lives on; without this the
    # function breaks as soon as bert_model is moved to a GPU.
    device = next(bert_model.parameters()).device

    embeddings = []
    # Gradients are never needed here, so disable them once for the whole run.
    with torch.no_grad(), tqdm(total=len(texts)) as progress:
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded = bert_tokenizer(
                batch,
                padding='max_length',
                truncation=True,
                max_length=max_len,
                return_tensors='pt',
            ).to(device)
            output = bert_model(**encoded, output_hidden_states=True)
            # Take the last 4 layers and concatenate them (as in the paper)
            hidden_states = output.hidden_states[-4:]
            # shape: (len(batch), seq_len, 768*4) for bert-base-uncased
            token_embeddings = torch.cat(hidden_states, dim=-1)
            # .cpu() is a no-op on CPU and required before .numpy() on GPU;
            # extend() splits the batch axis into one array per text.
            embeddings.extend(token_embeddings.cpu().numpy())
            progress.update(len(batch))
    return embeddings

In [None]:
# Convert padded indices back to text (using local tokenizer)
# Invert the tokenizer vocabulary: word -> index becomes index -> word.
reverse_word_index = {
    index: word for word, index in tokenizer_local.word_index.items()
}
# Index 0 maps to the empty string.
reverse_word_index[0] = ""


def sequences_to_text(sequences):
    """Decode integer sequences into space-separated strings.

    Zero entries are skipped. Any index that is missing from
    ``reverse_word_index`` contributes an empty word.

    Args:
        sequences: Iterable of integer sequences.

    Returns:
        A list of strings, one per input sequence.
    """
    texts = []
    for seq in sequences:
        words = []
        for idx in seq:
            if idx == 0:
                continue
            words.append(reverse_word_index.get(idx, ""))
        texts.append(" ".join(words))
    return texts

# Prepare texts from sequences
# Only the first 100 rows of each split are decoded.
# Subset for speed; increase if needed
train_texts, test_texts = (
    sequences_to_text(split[:100]) for split in (X_train, X_test)
)

In [None]:
# Extract BERT embeddings (last 4 layers concatenated)
train_embeddings = extract_embeddings(train_texts)
test_embeddings = extract_embeddings(test_texts)

# Save embeddings for next phase
# Labels are cut to the number of embeddings actually produced, so features
# and labels stay aligned even if the size of the text subset changes.
with open("bert_embeddings.pkl", "wb") as f:
    pickle.dump({
        "train_embeddings": train_embeddings,
        "test_embeddings": test_embeddings,
        "y_train": y_train[:len(train_embeddings)],
        "y_test": y_test[:len(test_embeddings)],
    }, f)