In [1]:
import gzip, csv, json, re, itertools, random, time
from pathlib import Path

import torch
from torch.utils.data import DataLoader

from sentence_transformers import models, SentenceTransformer, InputExample, losses, util

import numpy as np
import matplotlib.pyplot as plt

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DATA_DIR      = Path("data/GSEA/external_gene_data/store!")
OUTPUT_DIR    = Path("output/model")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

GENE_FILE     = DATA_DIR / "rat_genes_consolidated.txt.gz"
PATHWAY_FILE  = DATA_DIR / "wikipathways_synonyms_Rattus_norvegicus.gmt.gz"

TRAIN_JSONL   = OUTPUT_DIR / "train.jsonl"
VAL_JSONL     = OUTPUT_DIR / "val.jsonl"
TEST_JSONL    = OUTPUT_DIR / "test.jsonl"

BASE_MODEL    = "michiyasunaga/BioLinkBERT-large"
OUTPUT_FOLDER = OUTPUT_DIR / "biolinkbert-large-simcse-rat"
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

BATCH_SIZE     = 256
EPOCHS         = 50
LEARNING_RATE  = 3e-5
WARMUP_RATIO   = 0.1
DEVICE

'cuda'

In [2]:
# # In[1]:
# import gzip
# import csv
# import json
# import re
# import itertools
# import random
# from pathlib import Path
#
# # ---------------- Parameters ----------------
# # Fraction of the data to use: 0.1 for 10%, 1.0 for 100%
# multiplier = 1
#
# # Set seed for reproducibility
# random.seed(42)
#
#
#
# # Regex to extract synonyms inside brackets
# bracket_re = re.compile(r"\[([^\]]+)\]")
#
# # ------------- Helper Functions -------------
#
# def add_pairs(pairs, texts):
#     """
#     Given a list of text entries, strip and add all unique positive pairs.
#     """
#     texts = [t.strip() for t in texts if t and str(t).strip()]
#     for a, b in itertools.combinations(set(texts), 2):
#         pairs.append((a, b, 1))
#
#
# def sample_splits(splits, multiplier):
#     """
#     If multiplier < 1.0, truncate each list to the given fraction;
#     otherwise return unchanged.
#     """
#     if multiplier >= 1.0:
#         return splits
#     sampled = {}
#     for name, items in splits.items():
#         k = int(len(items) * multiplier)
#         sampled[name] = items[:k]
#     return sampled
#
# # In[2]: Build positive and negative pairs
# pos_pairs = []
# with gzip.open(GENE_FILE, "rt") as fh:
#     rdr = csv.DictReader(fh, delimiter='\t')
#     for row in rdr:
#         add_pairs(pos_pairs, [
#             row.get("Gene stable ID", ""),
#             row.get("Gene name", ""),
#             row.get("Gene description", "")
#         ])
#
# with gzip.open(PATHWAY_FILE, "rt") as fh:
#     for line in fh:
#         if not line.strip():
#             continue
#         pathway = re.sub(r"\s+", " ", line.split("\t")[0]).strip()
#         for grp in bracket_re.findall(line):
#             syns = [g.strip() for g in grp.split(",") if g.strip()]
#             add_pairs(pos_pairs, syns)
#             for s in syns:
#                 pos_pairs.append((pathway, s, 1))
#
# # 3) Generate negative pairs
# all_texts = list({t for a, b, _ in pos_pairs for t in (a, b)})
# pos_set   = {(a, b) for a, b, _ in pos_pairs}
# neg_pairs = set()
# while len(neg_pairs) < len(pos_pairs):
#     a, b = random.sample(all_texts, 2)
#     if (a, b) in pos_set or (b, a) in pos_set:
#         continue
#     if (a, b) in neg_pairs or (b, a) in neg_pairs:
#         continue
#     neg_pairs.add((a, b))
# neg_pairs = [(a, b, 0) for a, b in neg_pairs]
#
# # 4) Merge, dedupe unordered, shuffle
# all_pairs = pos_pairs + neg_pairs
# uniq      = {}
# for a, b, label in all_pairs:
#     key = tuple(sorted((a, b)))
#     if key not in uniq:
#         uniq[key] = label
# unique_pairs = [(a, b, lbl) for (a, b), lbl in uniq.items()]
# random.shuffle(unique_pairs)
#
# # In[3]: Split into train/val/test
# N       = len(unique_pairs)
# n_test  = int(0.1 * N)
# n_rem   = N - n_test
# n_val   = n_rem // 9
#
# splits = {
#     "test":  unique_pairs[:n_test],
#     "val":   unique_pairs[n_test:n_test + n_val],
#     "train": unique_pairs[n_test + n_val:]
# }
#
# # Apply multiplier sampling
# splits = sample_splits(splits, multiplier)
#
# # Write out JSONL and print counts
# # No directories needed; files will be created in cwd
# for name, path in [("train", TRAIN_JSONL), ("val", VAL_JSONL), ("test", TEST_JSONL)]:
#     with path.open("w", encoding="utf-8") as f:
#         for a, b, label in splits[name]:
#             json.dump({"text1": a, "text2": b, "label": label}, f, ensure_ascii=False)
#             f.write("\n")
#
# print(f"✅ Train={len(splits['train']):,}  Val={len(splits['val']):,}  Test={len(splits['test']):,}")
import gzip, csv, json, re, itertools, random
from pathlib import Path

# … your imports, SEED, DATA_DIR, OUTPUT_DIR, etc. …

def add_pairs(pairs, texts):
    texts = [t.strip() for t in texts if t and str(t).strip()]
    for a, b in itertools.combinations(set(texts), 2):
        pairs.append((a, b, 1))

# 1) Build pos_pairs and neg_pairs as before
pos_pairs = []
with gzip.open(GENE_FILE, "rt") as fh:
    rdr = csv.DictReader(fh)
    for row in rdr:
        add_pairs(pos_pairs, [
            row["Gene stable ID"],
            row["Gene name"],
            row["Gene description"]
        ])

bracket_re = re.compile(r"\[([^\]]+)\]")
with gzip.open(PATHWAY_FILE, "rt") as fh:
    for line in fh:
        if not line.strip():
            continue
        pathway = re.sub(r"\s+", " ", line.split("\t")[0]).strip()
        for grp in bracket_re.findall(line):
            syns = [g.strip() for g in grp.split(",") if g.strip()]
            add_pairs(pos_pairs, syns)
            for s in syns:
                pos_pairs.append((pathway, s, 1))

all_texts = list({t for a, b, _ in pos_pairs for t in (a, b)})

pos_set = {(a, b) for a, b, _ in pos_pairs}
neg_pairs = set()
while len(neg_pairs) < len(pos_pairs):
    a, b = random.sample(all_texts, 2)
    if (a, b) in pos_set or (b, a) in pos_set: continue
    if (a, b) in neg_pairs or (b, a) in neg_pairs: continue
    neg_pairs.add((a, b))
neg_pairs = [(a, b, 0) for a, b in neg_pairs]

# 2) Merge positives and negatives
all_pairs = pos_pairs + neg_pairs

# 3) Deduplicate unordered:
uniq = {}
for a, b, label in all_pairs:
    key = tuple(sorted((a, b)))
    if key not in uniq:
        uniq[key] = label
# Rebuild a list of unique (a,b,label)
unique_pairs = [(a, b, lbl) for (a, b), lbl in uniq.items()]

# 4) Shuffle and split
random.shuffle(unique_pairs)
N      = len(unique_pairs)
n_test = int(0.1 * N)
n_rem  = N - n_test
n_val  = n_rem // 9
n_train= n_rem - n_val

splits = {
    "test":  unique_pairs[:n_test],
    "val":   unique_pairs[n_test:n_test+n_val],
    "train": unique_pairs[n_test+n_val:]
}

# 5) Write out
for name, path in [("train", TRAIN_JSONL),
                   ("val",   VAL_JSONL),
                   ("test",  TEST_JSONL)]:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for a, b, label in splits[name]:
            json.dump({"text1": a, "text2": b, "label": label}, f, ensure_ascii=False)
            f.write("\n")

print(f"✅ Train={len(splits['train']):,}  Val={len(splits['val']):,}  Test={len(splits['test']):,}")


✅ Train=180,795  Val=22,599  Test=22,599


In [3]:
#!/usr/bin/env python3
import json
from pathlib import Path

def load_pairs(path):
    """
    Load all (text1, text2) pairs from a .jsonl file,
    canonicalizing order so (A,B) == (B,A).
    Returns a set of tuple pairs.
    """
    pairs = set()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            a, b = data["text1"], data["text2"]
            # sort the two so order doesn’t matter
            pair = tuple(sorted((a, b)))
            pairs.add(pair)
    return pairs

def report_overlap(s1, s2, name1, name2):
    overlap = s1 & s2
    if overlap:
        print(f"🔴 Overlap between {name1} and {name2}: {len(overlap)} shared pairs")
        for pair in list(overlap)[:10]:
            print("   ", pair)
    else:
        print(f"✅ No overlap between {name1} and {name2}")

def main():
    base = Path("./output/model/")
    files = {
        "train": base / "train.jsonl",
        "val":   base / "val.jsonl",
        "test":  base / "test.jsonl",
    }

    # Load
    print("Loading splits…")
    splits = {name: load_pairs(path) for name, path in files.items()}
    for name, s in splits.items():
        print(f"  {name}: {len(s)} unique pairs")

    print("\nChecking overlaps:")
    report_overlap(splits["train"], splits["val"],   "train", "val")
    report_overlap(splits["train"], splits["test"],  "train", "test")
    report_overlap(splits["val"],   splits["test"],  "val",   "test")

if __name__ == "__main__":
    main()


Loading splits…
  train: 180795 unique pairs
  val: 22599 unique pairs
  test: 22599 unique pairs

Checking overlaps:
✅ No overlap between train and val
✅ No overlap between train and test
✅ No overlap between val and test


In [4]:
from datasets import Dataset
from torch.utils.data import DataLoader
from sentence_transformers import (
    models,
    SentenceTransformer,
    InputExample,
    losses,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from transformers import EarlyStoppingCallback
import json

# ── 0) Re-create your SBERT model & loss ──
# (This was originally in Cell 3 of your first snippet)
word_model = models.Transformer(BASE_MODEL, max_seq_length=128)
word_model.auto_model.gradient_checkpointing_enable()
pooling = models.Pooling(
    word_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)
model = SentenceTransformer(modules=[word_model, pooling], device=DEVICE)
loss = losses.ContrastiveLoss(model=model, margin=1.0)

# ── 1) Load JSONL splits and build InputExamples ──
train_data = [json.loads(l) for l in TRAIN_JSONL.open("r", encoding="utf-8")]
val_data   = [json.loads(l) for l in VAL_JSONL.open("r", encoding="utf-8")]

train_examples = [
    InputExample(texts=[d["text1"], d["text2"]], label=d["label"])
    for d in train_data
]
val_examples = [
    InputExample(texts=[d["text1"], d["text2"]], label=d["label"])
    for d in val_data
]

# ── 2) Build DataLoaders (for eval_steps/save_steps) ──
train_loader = DataLoader(
    train_examples,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True
)
val_loader = DataLoader(
    val_examples,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False
)

# ── 3) Convert to HF Datasets for Trainer ──
train_ds = Dataset.from_list([
    {"text1": ex.texts[0], "text2": ex.texts[1], "label": ex.label}
    for ex in train_examples
])
val_ds = Dataset.from_list([
    {"text1": ex.texts[0], "text2": ex.texts[1], "label": ex.label}
    for ex in val_examples
])

# ── 4) Evaluator for validation ──
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples, name='val-eval'
)

# ── 5) Training arguments with HF-style eval/save & early stopping ──
training_args = SentenceTransformerTrainingArguments(
    output_dir=str(OUTPUT_FOLDER),
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    eval_strategy="steps",
    eval_steps=len(train_loader),
    save_strategy="steps",
    save_steps=len(train_loader),
    load_best_model_at_end=True,
    metric_for_best_model="eval_val-eval_spearman_cosine",  # ← match one of the returned keys
    greater_is_better=True,
)


# ── 6) Build the Trainer with EarlyStoppingCallback ──
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=loss,
    evaluator=evaluator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# ── 7) Kick off training ──
trainer.train()


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Val-eval Pearson Cosine,Val-eval Spearman Cosine
706,0.1278,0.115915,0.261697,0.274422
1412,0.1092,0.081464,0.594961,0.585891
2118,0.0805,0.076356,0.622305,0.6157
2824,0.0775,0.074519,0.633227,0.624896
3530,0.0739,0.074369,0.6339,0.623555
4236,0.0713,0.072679,0.644653,0.637608
4942,0.0695,0.072892,0.643792,0.636271
5648,0.0658,0.070503,0.658511,0.651232
6354,0.0636,0.070245,0.661976,0.653238
7060,0.0598,0.069852,0.663715,0.655729


TrainOutput(global_step=31064, training_loss=0.033217187819293933, metrics={'train_runtime': 64837.5292, 'train_samples_per_second': 139.422, 'train_steps_per_second': 0.545, 'total_flos': 0.0, 'train_loss': 0.033217187819293933, 'epoch': 43.937765205091935})

test_data = [json.loads(l) for l in TEST_JSONL.open("r", encoding="utf-8")]

t1     = [d['text1'] for d in test_data]
t2     = [d['text2'] for d in test_data]
labels = np.array([d['label'] for d in test_data])

word_model_base = models.Transformer(BASE_MODEL, max_seq_length=128)
word_model_base.auto_model.gradient_checkpointing_enable()
pool_base = models.Pooling(word_model_base.get_word_embedding_dimension(),
                           pooling_mode_mean_tokens=True)
base_model = SentenceTransformer(modules=[word_model_base, pool_base], device=DEVICE)
best_out_dir = OUTPUT_DIR / "best_model"            # <-- your target directory

# (Re-)create the destination directory if needed
if best_out_dir.exists():
    shutil.rmtree(best_out_dir)
best_out_dir.mkdir(parents=True, exist_ok=True)
fine_model = SentenceTransformer(str(best_out_dir), device=DEVICE)

batch_size_encode = 128
emb_b1 = base_model.encode(t1, batch_size=batch_size_encode,
                           convert_to_tensor=True, show_progress_bar=True)
emb_b2 = base_model.encode(t2, batch_size=batch_size_encode,
                           convert_to_tensor=True, show_progress_bar=False)
emb_f1 = fine_model.encode(t1, batch_size=batch_size_encode,
                           convert_to_tensor=True, show_progress_bar=False)
emb_f2 = fine_model.encode(t2, batch_size=batch_size_encode,
                           convert_to_tensor=True, show_progress_bar=False)

sims = {
    'Base':       util.cos_sim(emb_b1, emb_b2).diag().cpu().numpy(),
    'Fine-tuned': util.cos_sim(emb_f1, emb_f2).diag().cpu().numpy()
}

for name, arr in sims.items():
    true_vals  = arr[labels == 1]
    false_vals = arr[labels == 0]
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.violinplot([true_vals, false_vals], showmeans=True, showmedians=True)
    ax.set_title(f"{name} Similarity Distribution (n={len(test_data)})")
    ax.set_xticks([1, 2]); ax.set_xticklabels(["True", "False"])
    ax.set_ylabel("Cosine similarity")
    ax.grid(True, axis="y", linestyle="--", alpha=0.6)
    ax.set_ylim(-0.1, 1.1)
    plt.tight_layout()
    # Save figure using the number of epochs in filename
    save_path = OUTPUT_DIR/ "PNG" / f"{name.lower()}_{EPOCHS}_epochs.png"
    fig.savefig(save_path)
    print(f"Saved plot to {save_path}")
    plt.show()

In [5]:
import matplotlib.pyplot as plt
import json
from pathlib import Path

from sentence_transformers import models, SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# ── 0) Build the BioLinkBERT‐Large base model with mean pooling ──
word_model = models.Transformer(
    "michiyasunaga/BioLinkBERT-large",
    max_seq_length=128,
)
pooling = models.Pooling(
    word_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)
base_model = SentenceTransformer(
    modules=[word_model, pooling],
    device=DEVICE
)

# ── 1) Prepare evaluators ──
store_dir = OUTPUT_FOLDER # / "store"

val_data = [json.loads(l) for l in VAL_JSONL.open("r", encoding="utf-8")]
val_examples = [
    InputExample(texts=[d["text1"], d["text2"]], label=d["label"])
    for d in val_data
]
evaluator_val = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples, name="val-eval"
)

test_data = [json.loads(l) for l in TEST_JSONL.open("r", encoding="utf-8")]
test_examples = [
    InputExample(texts=[d["text1"], d["text2"]], label=d["label"])
    for d in test_data
]
evaluator_test = EmbeddingSimilarityEvaluator.from_input_examples(
    test_examples, name="test-eval"
)

# ── 2) Locate all checkpoint directories ──
ckpts = sorted(
    [d for d in store_dir.iterdir() if d.is_dir() and d.name.startswith("checkpoint-")],
    key=lambda d: int(d.name.split("-")[1])
)

# ── 3) Evaluate base model + each checkpoint ──
epochs = []       # first entry will be your base model
val_scores = []
test_scores = []

# Base model
print("Evaluating base model…")
metrics_val = evaluator_val(base_model, output_path=None)
metrics_test = evaluator_test(base_model, output_path=None)
base_val = metrics_val["val-eval_spearman_cosine"]
base_test = metrics_test["test-eval_spearman_cosine"]
print(f"  [Base Model] Val Spearman = {base_val:.6f}, Test Spearman = {base_test:.6f}")
epochs.append(BASE_MODEL)
val_scores.append(base_val)
test_scores.append(base_test)

# Fine‐tuned checkpoints (labelled 1, 2, 3, …)
for idx, ckpt in enumerate(ckpts, start=1):
    print(f"Evaluating checkpoint at epoch {idx} ({ckpt.name})…")
    model = SentenceTransformer(str(ckpt), device=DEVICE)
    metrics_val = evaluator_val(model, output_path=None)
    metrics_test = evaluator_test(model, output_path=None)
    val_sp = metrics_val["val-eval_spearman_cosine"]
    test_sp = metrics_test["test-eval_spearman_cosine"]
    print(f"  [Epoch {idx}] Val Spearman = {val_sp:.6f}, Test Spearman = {test_sp:.6f}")
    epochs.append(idx)
    val_scores.append(val_sp)
    test_scores.append(test_sp)


# ── 4) Pick the best model on the validation set and save it ──
import shutil

best_val     = max(val_scores)
best_index   = val_scores.index(best_val)          # 0 ⇒ base model, ≥1 ⇒ ckpts[best_index-1]
best_out_dir = OUTPUT_DIR / "best_model"            # <-- your target directory

# (Re-)create the destination directory if needed
if best_out_dir.exists():
    shutil.rmtree(best_out_dir)
best_out_dir.mkdir(parents=True, exist_ok=True)

# Select and save
if best_index == 0:
    print(f"Best model is the unfine-tuned BASE model (val Spearman = {best_val:.6f})")
    base_model.save(str(best_out_dir))
else:
    best_ckpt = ckpts[best_index - 1]
    print(
        f"Best model is checkpoint {best_index} ({best_ckpt.name}) "
        f"(val Spearman = {best_val:.6f})"
    )
    # SentenceTransformer can save itself straight to a directory
    SentenceTransformer(str(best_ckpt)).save(str(best_out_dir))

print(f"✓ Saved best model to {best_out_dir.resolve()}")

Evaluating base model…
  [Base Model] Val Spearman = -0.077636, Test Spearman = -0.081831
Evaluating checkpoint at epoch 1 (checkpoint-11)…
  [Epoch 1] Val Spearman = -0.022254, Test Spearman = -0.030839
Evaluating checkpoint at epoch 2 (checkpoint-12)…
  [Epoch 2] Val Spearman = -0.021409, Test Spearman = -0.030774
Evaluating checkpoint at epoch 3 (checkpoint-22)…
  [Epoch 3] Val Spearman = -0.040527, Test Spearman = -0.048524
Evaluating checkpoint at epoch 4 (checkpoint-23)…
  [Epoch 4] Val Spearman = -0.039751, Test Spearman = -0.047810
Evaluating checkpoint at epoch 5 (checkpoint-706)…
  [Epoch 5] Val Spearman = 0.274422, Test Spearman = 0.266915
Evaluating checkpoint at epoch 6 (checkpoint-1412)…
  [Epoch 6] Val Spearman = 0.585891, Test Spearman = 0.574130
Evaluating checkpoint at epoch 7 (checkpoint-2118)…
  [Epoch 7] Val Spearman = 0.615700, Test Spearman = 0.602231
Evaluating checkpoint at epoch 8 (checkpoint-2824)…
  [Epoch 8] Val Spearman = 0.624896, Test Spearman = 0.612725

In [6]:
from pathlib import Path
import json

# ── Prepare output directory ──
output_dir = Path("./output/text_files/PNG_HTML")
output_dir.mkdir(parents=True, exist_ok=True)

# ── Save raw Spearman values for later reuse ──
values = {
    "epochs":     epochs,      # now matches Cell 1
    "val_scores": val_scores,
    "test_scores": test_scores
}
with open(output_dir / "spearman_values.json", "w") as f:
    json.dump(values, f)


In [7]:
from pathlib import Path
import plotly.graph_objects as go

# ── Prepare output directory ──
output_dir = Path("./output/text_files/PNG_HTML")
output_dir.mkdir(parents=True, exist_ok=True)

# Numeric positions and labels
positions = list(range(len(epochs)))
labels = [str(e) for e in epochs]  # e.g. "michiyasunaga/BioLinkBERT-large", "1", "2", …

# ── 1) Validation-only plot ──
fig_val = go.Figure()
fig_val.add_trace(go.Scatter(
    x=positions, y=val_scores, mode="lines+markers", name="Validation"
))
fig_val.update_layout(
    title="Validation Spearman–Cosine over Base Model + Epochs",
    xaxis=dict(
        title="Model / Epoch",
        tickmode="array",
        tickvals=positions,
        ticktext=labels,
        tickangle=45
    ),
    yaxis=dict(title="Spearman Cosine", range=[min(val_scores) - 0.02, 1.0])
)
val_path = output_dir / "spearman_validation.html"
fig_val.write_html(str(val_path), include_plotlyjs="cdn")

# ── 2) Test-only plot ──
fig_test = go.Figure()
fig_test.add_trace(go.Scatter(
    x=positions, y=test_scores, mode="lines+markers", name="Test"
))
fig_test.update_layout(
    title="Test Spearman–Cosine (in the wild) over Base Model + Epochs",
    xaxis=dict(
        title="Model / Epoch",
        tickmode="array",
        tickvals=positions,
        ticktext=labels,
        tickangle=45
    ),
    yaxis=dict(title="Spearman Cosine", range=[min(test_scores) - 0.02, 1.0])
)
test_path = output_dir / "spearman_test.html"
fig_test.write_html(str(test_path), include_plotlyjs="cdn")

# ── 3) Combined plot ──
fig_comb = go.Figure()
fig_comb.add_trace(go.Scatter(
    x=positions, y=val_scores, mode="lines+markers", name="Validation"
))
fig_comb.add_trace(go.Scatter(
    x=positions, y=test_scores, mode="lines+markers", name="Test"
))
fig_comb.update_layout(
    title="Validation & Test Spearman–Cosine over Base Model + Epochs",
    xaxis=dict(
        title="Model / Epoch",
        tickmode="array",
        tickvals=positions,
        ticktext=labels,
        tickangle=45
    ),
    yaxis=dict(title="Spearman Cosine", range=[min(val_scores + test_scores) - 0.02, 1.0])
)
comb_path = output_dir / "spearman_combined.html"
fig_comb.write_html(str(comb_path), include_plotlyjs="cdn")

print(f"Saved:\n  {val_path}\n  {test_path}\n  {comb_path}")


Saved:
  output\text_files\PNG_HTML\spearman_validation.html
  output\text_files\PNG_HTML\spearman_test.html
  output\text_files\PNG_HTML\spearman_combined.html


In [8]:
from huggingface_hub import HfApi
import os

api = HfApi(token=os.getenv("HF_TOKEN"))
api.upload_folder(
    folder_path="./output/model/best_model",
    repo_id="mghuibregtse/biolinkbert-large-simcse-rat",
    repo_type="model",
)


RepositoryNotFoundError: 401 Client Error. (Request ID: Root=1-68179ccf-44db6cf66dde4e89404d7bdf;e28b082a-5efb-46ef-bdae-f2e33a075aff)

Repository Not Found for url: https://huggingface.co/api/models/mghuibregtse/biolinkbert-large-simcse-rat/preupload/main.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
Invalid username or password.
Note: Creating a commit assumes that the repo already exists on the Huggingface Hub. Please use `create_repo` if it's not the case.