# In [24]:
# Step 3: NLP Feature Extraction
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Cleaned text table; expected columns: ticker, year, clean_text
text_df = pd.read_csv("unified_cleaned_text.csv")

# FinBERT (financial-domain BERT, used here on 10-K text). The tokenizer and
# the encoder are loaded from the same checkpoint name so they stay in sync.
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModel.from_pretrained(FINBERT_CHECKPOINT)

# Function to convert text to embedding
def embed_text(text):
    """Return one mean-pooled FinBERT vector for ``text``.

    The input is coerced with ``str()``, tokenized with truncation to at most
    512 tokens, and passed through the module-level ``model`` with gradient
    tracking disabled. The final-layer hidden states are then averaged over
    the token axis.

    Args:
        text: Value to embed; it is converted with ``str()`` before
            tokenization.

    Returns:
        A 1-D NumPy array holding the pooled embedding of the single input
        (768 values per the recorded run of this cell).
    """
    encoded = tokenizer(
        str(text),
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Mean pooling: collapse the token axis (dim 1) into a single vector.
    pooled = hidden_states.mean(dim=1)
    # The batch holds exactly one text, so return its row.
    return pooled.numpy()[0]

# Embed every cleaned document in row order, logging progress every 100 rows
total_texts = len(text_df)
embeddings = []
for count, document in enumerate(text_df['clean_text'], start=1):
    embeddings.append(embed_text(document))
    if count % 100 == 0:
        print(f"Processed {count}/{total_texts} texts")

# Stack the per-document vectors into one array and persist it to disk
X_text = np.array(embeddings)
np.save("X_text.npy", X_text)
print("✅ Saved FinBERT embeddings → X_text.npy, shape:", X_text.shape)


# Output: ✅ Saved FinBERT embeddings → X_text.npy, shape: (4, 768)
