### Import necessary libraries

In [None]:
import os
import sys

import numpy as np
import pandas as pd

import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# Make the project's 'src' directory importable.
# The project root is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_dir = os.path.join(project_root, 'src')
sys.path.append(src_dir)

from my_classes import SBERTWithClassifier

### Load raw csv and rename columns

In [None]:
# Load the raw transcript CSV. The columns 'person_scene', 'dialogue' and
# 'episode_name' are read below, so the file must provide them.
df = pd.read_csv("../data/raw/1_10_seasons_tbbt.csv", delimiter=',', encoding='utf8')
df = df.rename(columns={'person_scene': 'Person', 'dialogue': 'Said'})

# The season number is the two digits following "Series " in the episode name.
# expand=False returns a Series (not a one-column DataFrame); astype(int)
# raises if any episode name does not match the pattern.
df['Season'] = df['episode_name'].str.extract(r'Series (\d{2})', expand=False).astype(int)

# Remove "Scene" and "(off)" rows and keep only the columns used downstream.
# A single .loc selection followed by .copy() gives an independent frame, so
# the assignment below does not write into a slice of another DataFrame
# (which would trigger pandas' SettingWithCopyWarning).
keep_rows = ~df['Person'].isin(['Scene', '(off)'])
df = df.loc[keep_rows, ['Person', 'Said', 'Season']].copy()

# Replace "Cooper" with "Mary"
df['Person'] = df['Person'].replace({'Cooper': 'Mary'})

print(df.head())

### Keep only main characters

In [None]:
# Names of the main characters whose lines are kept.
persons = [
    'Sheldon',
    'Leonard',
    'Raj',
    'Penny',
    'Howard',
    'Amy',
    'Bernadette',
]

# Keep only the rows whose speaker is one of the names above.
data = df[df['Person'].isin(persons)]
print(data.shape[0], "dialogues for main characters")

### Drop empty lines

In [None]:
# Drop every row with a missing value in any column, then renumber the index
# from 0 so positional and label-based access agree.
data = data.dropna().reset_index(drop=True)
print(data.shape[0], "dialogues for main characters after dropping empty lines")

### Load best model (Last-Layer with weighted_cross_entropy and wd=1e-4)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer and the backbone are loaded from the same pretrained checkpoint.
checkpoint_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
base_model = AutoModel.from_pretrained(checkpoint_name)

# Wrap the backbone in the project's classifier (7 output classes).
model = SBERTWithClassifier(base_model, num_classes=7, dropout_rate=0.1)

# Restore the saved weights, mapping stored tensors onto the selected device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
state_dict = torch.load(
    "../models/last_layer/all/weighted_cross_entropy/last_layer_0.0001_0.0001.pt",
    map_location=device,
)
model.load_state_dict(state_dict)
model.to(device)

### Compute new embeddings

In [None]:
# Inference only: put the model in evaluation mode before extracting embeddings.
model.eval()
embeddings_list = []

# Walk over the dialogues in fixed-size batches.
batch_size = 32
for start in tqdm(range(0, len(data), batch_size)):
    texts = data["Said"].iloc[start:start + batch_size].tolist()
    tokens = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    tokens = tokens.to(device)

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        hidden_states = model.sbert(**tokens).last_hidden_state
        # Position 0 of each sequence is used as the sentence embedding
        # (the CLS token), moved to the CPU as a NumPy array.
        cls_vectors = hidden_states[:, 0, :].cpu().numpy()
        embeddings_list.append(cls_vectors)

# Stack the per-batch arrays into one array with one row per dialogue.
embeddings = np.vstack(embeddings_list)

# Assemble the output frame: the original columns plus one embedding per row.
columns = {name: data[name].values for name in ("Person", "Said", "Season")}
columns["Embedding"] = list(embeddings)
new_df = pd.DataFrame(columns)

# Persist the result as a pickle file.
new_df.to_pickle("../data/processed/sfinetuned_embeddings.pkl")
