## **ESMC Embedding**

In [None]:
import pandas as pd
# Read the curve-fitted table from disk into a pandas DataFrame
data = pd.read_csv(
    filepath_or_buffer="/projects/amp/asalehi/Dose/data/clean/full_clean_hts_with_fits_no_pooling.csv"
)

In [None]:
import pandas as pd
from itertools import combinations

# Drop the rows named GrowthControl / SterilityControl and keep only the
# columns listed below, in a single label-based selection.
experimental_data = data.loc[
    ~data["Peptide_Name"].isin({"GrowthControl", "SterilityControl"}),
    [
        "Data_ID",
        "Peptide_Name",
        "Organism",
        "Sequence",
        "Amidation",
        "model",
        "i50",
        "slope",
        "Fitted_MIC",
    ],
]

In [None]:
# Step 1: restrict to rows whose Organism is exactly "E. faecalis ATCC 29212"
experimental_data = experimental_data.loc[
    lambda frame: frame["Organism"].eq("E. faecalis ATCC 29212")
]

# Step 2: of those, keep only rows where Amidation compares equal to False
experimental_data = experimental_data.loc[
    lambda frame: frame["Amidation"].eq(False)
]

# Function to keep majority model per peptide
def keep_majority_model(data):
    """Keep, for each peptide, only the rows fitted with its most frequent model.

    Rows are grouped by ``Peptide_Name``. A group whose ``model`` column holds a
    single distinct value is kept unchanged. Otherwise only the rows carrying
    the most frequent non-missing label are kept. When several labels tie for
    the top count, ``"2pl"`` wins if it is among them; if it is not, the label
    that sorts first by its string form is used so the choice is deterministic.

    The vote considers every label present, not just ``"2pl"`` and ``"flat"``,
    so a peptide fitted only with other labels is no longer silently dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Peptide_Name`` and ``model``.

    Returns
    -------
    pandas.DataFrame
        Surviving rows, concatenated group by group in the order produced by
        ``groupby("Peptide_Name")``, with a fresh 0..n-1 index. When no group
        is produced (e.g. empty input) an empty frame with the input's columns
        is returned instead of letting ``pd.concat`` raise on an empty list.
    """
    kept = []
    for _, group in data.groupby("Peptide_Name"):
        models = group["model"]
        if len(models.unique()) == 1:
            # Only one label for this peptide: nothing to vote on.
            kept.append(group)
            continue

        counts = models.value_counts()  # missing labels are not counted
        if counts.empty:
            # Only (distinct kinds of) missing labels: keep the rows untouched
            # rather than filtering on a label that does not exist.
            kept.append(group)
            continue

        tied = counts[counts == counts.max()].index
        majority = "2pl" if "2pl" in tied else sorted(tied, key=str)[0]
        kept.append(group[models == majority])

    if not kept:
        return data.iloc[0:0].reset_index(drop=True)
    return pd.concat(kept, ignore_index=True)

experimental_data = keep_majority_model(experimental_data)

# Map each peptide to the set of model labels left on its rows, then tally
# how many peptides carry only "flat", only "2pl", or both labels.
peptide_models = {
    name: set(labels)
    for name, labels in experimental_data.groupby("Peptide_Name")["model"]
}
flat_only = sum(labels == {"flat"} for labels in peptide_models.values())
pl_only = sum(labels == {"2pl"} for labels in peptide_models.values())
mixed = sum(labels == {"flat", "2pl"} for labels in peptide_models.values())

print(f"Flat only: {flat_only}, 2pl only: {pl_only}, Mixed: {mixed}")

Flat only: 270, 2pl only: 347, Mixed: 0


In [None]:
import os, torch
from tqdm.auto import tqdm
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig

# Organism tag used in the output file names, and the directory they go in
organism = "E_faecalis_ATCC_29212"
save_dir = "/projects/amp/asalehi/Dose/embeddings"
os.makedirs(save_dir, exist_ok=True)

mean_path = f"{save_dir}/esmc_embeddings_{organism}_mean.pt"
full_path = f"{save_dir}/esmc_embeddings_{organism}_full.pt"

# Compute and save embeddings unless BOTH output files are already on disk
if not (os.path.exists(mean_path) and os.path.exists(full_path)):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = ESMC.from_pretrained("esmc_600m")
    model = model.to(device).eval()
    config = LogitsConfig(sequence=True, return_embeddings=True, return_hidden_states=True)

    # One row per peptide name (first occurrence wins)
    df = experimental_data[["Peptide_Name", "Sequence"]].drop_duplicates("Peptide_Name")

    mean_embs, full_embs = {}, {}
    for name, sequence in tqdm(zip(df["Peptide_Name"], df["Sequence"]), total=len(df)):
        protein_tensor = model.encode(ESMProtein(sequence=sequence)).to(device)
        with torch.no_grad():
            output = model.logits(protein_tensor, config)
        # Stack the hidden states after squeezing dim 0 of each entry;
        # expected layout (L, T, D), as annotated in the original cell.
        per_layer = [hidden.squeeze(0) for hidden in output.hidden_states]
        layers = torch.stack(per_layer)
        mean_embs[name] = layers.mean(dim=1).cpu()  # averaged over dim 1 -> (L, D)
        full_embs[name] = layers.cpu()              # unpooled stack -> (L, T, D)

    torch.save(mean_embs, mean_path)
    torch.save(full_embs, full_path)
    print("Saved:", mean_path, "and", full_path)
else:
    print("Mean-pooled and full embeddings already exist.")


Mean-pooled and full embeddings already exist.
