In [None]:

import json
import math
import random
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, InputExample
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# === Config ===
# -- Model and file locations --
MODEL_NAME = "BAAI/bge-large-en-v1.5"  # checkpoint loaded by SentenceTransformer
TRIPLET_FILE = "data/us_gaap_triplet_training_data_filtered.jsonl"  # JSONL, one triplet per line
OUTPUT_PATH = "fine_tuned_gaap_model"  # handed to model.fit as output_path
EVAL_METRICS_PATH = "eval_metrics.csv"  # evaluation-metrics file name

# -- Training schedule --
BATCH_SIZE = 16
EPOCHS = 30
EVAL_STEPS = 0  # 0 runs eval at end of epoch

# -- Dataset sizing --
# Upper bound on the number of triplets used (train + eval together).
# Based on the filtered triplet count; lower it (e.g. 1000 or 5000) for a
# quick experiment.
MAX_TRIPLETS = 164_874
# Share of MAX_TRIPLETS held out for evaluation.
EVAL_FRACTION = 0.2
EVAL_SAMPLES = int(EVAL_FRACTION * MAX_TRIPLETS)



# Pick the best available backend: a CUDA GPU first, then Apple's MPS,
# and only then the CPU. (Previously only MPS was checked, so a CUDA machine
# would silently train on the CPU.)
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# === Load Triplet Data ===
# The file holds one JSON object per line. Blank lines are skipped so that
# they do not crash json.loads.
with open(TRIPLET_FILE, "r", encoding="utf-8") as f:
    all_triplets = [json.loads(line) for line in f if line.strip()]

# Shuffle with a dedicated, fixed-seed generator so the train/eval split is
# identical on every run (and the global `random` state is left untouched).
random.Random(42).shuffle(all_triplets)

# Cap the data at MAX_TRIPLETS, then hold out the same proportion for
# evaluation that EVAL_SAMPLES represents of MAX_TRIPLETS. When the file has
# at least MAX_TRIPLETS rows this yields exactly EVAL_SAMPLES eval rows; when
# it has fewer, the eval set shrinks proportionally instead of eating the
# whole training split. Integer arithmetic avoids float rounding.
usable_triplets = all_triplets[:MAX_TRIPLETS]
n_eval = EVAL_SAMPLES * len(usable_triplets) // max(MAX_TRIPLETS, 1)
eval_triplets = usable_triplets[:n_eval]
train_triplets = usable_triplets[n_eval:]

# Fail early with a clear message rather than deep inside training/evaluation.
if not train_triplets or not eval_triplets:
    raise ValueError(
        f"Not enough triplets in {TRIPLET_FILE!r} to build both splits: "
        f"{len(train_triplets)} train / {len(eval_triplets)} eval"
    )

# === Build IR Evaluator ===
# Every held-out triplet becomes one retrieval query (its anchor). The corpus
# is the union of all positives and negatives from the eval triplets, and the
# only document marked relevant for a query is that triplet's own positive.
# Note: no early stopping is configured here; the evaluator only scores.
queries = {}
corpus = {}
relevant_docs = {}

for i, triplet in enumerate(eval_triplets):
    qid = f"q{i}"
    pid = f"{qid}_pos"
    nid = f"{qid}_neg"
    queries[qid] = triplet["anchor"]
    corpus[pid] = triplet["positive"]
    corpus[nid] = triplet["negative"]
    relevant_docs[qid] = {pid}

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    show_progress_bar=True,
    name="gaap-ir-eval",
    # write_csv is an on/off flag, not a destination path: the evaluator picks
    # its own CSV file name (derived from `name`) inside the output directory
    # it is given at call time. Passing a path string here only worked because
    # a non-empty string is truthy.
    write_csv=True,
)

# === Build training data ===
# Each training triplet is wrapped as an InputExample whose texts are ordered
# [anchor, positive, negative].
train_examples = [
    InputExample(texts=[t["anchor"], t["positive"], t["negative"]])
    for t in train_triplets
]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
model = SentenceTransformer(MODEL_NAME, device=device)
train_loss = losses.TripletLoss(model)

# Batches per epoch, rounding up so a final partial batch is counted.
steps_per_epoch = math.ceil(len(train_triplets) / BATCH_SIZE)
total_steps = steps_per_epoch * EPOCHS
print(f"📈 Expected total training steps: {total_steps}")

# Use the first 20% of all training steps for warm-up.
WARMUP_STEPS = int(0.20 * total_steps)

# === Train ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    evaluation_steps=EVAL_STEPS,
    output_path=OUTPUT_PATH,
    show_progress_bar=True,
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
)

print(f"✅ Fine-tuned model saved to: {OUTPUT_PATH}")
# The evaluator names its own CSV file; fit() is expected to hand it the
# `eval/` sub-directory of output_path (NOTE(review): confirm against the
# installed sentence-transformers version). EVAL_METRICS_PATH is never
# written, so report the directory that actually receives the metrics.
print(f"📊 Evaluation metrics CSV written by the evaluator under: {OUTPUT_PATH}/eval/")


In [None]:
import json
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm

# Load your trained or in-training model
model = SentenceTransformer("fine_tuned_gaap_model")  # or your in-progress directory

# Load the triplets from file: one JSON object per line. Blank lines are
# skipped so they do not crash json.loads, and the encoding is pinned so the
# result does not depend on the platform default.
with open("data/us_gaap_triplet_training_data.jsonl", "r", encoding="utf-8") as f:
    triplets = [json.loads(line) for line in f if line.strip()]

# Define cosine similarity function
def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        u: First vector (anything numpy's ``dot``/``norm`` accept).
        v: Second vector, same length as ``u``.

    Returns:
        ``dot(u, v) / (norm(u) * norm(v))``, or ``0.0`` when either vector has
        zero norm (the ratio is undefined there and would otherwise be NaN,
        which silently fails every ``>`` comparison).
    """
    denom = norm(u) * norm(v)
    if denom == 0:
        return 0.0
    return dot(u, v) / denom

# Spot-check the first five triplets: embed anchor, positive and negative,
# then report whether the anchor is closer (by cosine) to its positive.
for triplet in triplets[:5]:
    anchor, positive, negative = (
        triplet[key] for key in ("anchor", "positive", "negative")
    )
    # Encode in the same order as before: anchor, positive, negative.
    a_vec, p_vec, n_vec = (
        model.encode(text) for text in (anchor, positive, negative)
    )

    sim_pos = cosine(a_vec, p_vec)
    sim_neg = cosine(a_vec, n_vec)
    verdict = "✅" if sim_pos > sim_neg else "❌"

    print(f"Anchor:   {anchor}")
    print(f"Positive: {positive}  | similarity: {sim_pos:.4f}")
    print(f"Negative: {negative}  | similarity: {sim_neg:.4f}")
    print(f"✓ Pos > Neg? {verdict}\n")
