# Roh-Deskriptoren → descriptor_physchem_raw.h5

1. Imports: Wir nutzen peptides für Deskriptoren, wandb zum Download, h5py zum Speichern.

2. get_descriptors: Extrahiert physikochemische Deskriptoren; bei fehlerhaften Sequenzen wird ein leeres Dict ({}) zurückgegeben.

3. W&B: Lädt das neueste beta_allele-Artifact herunter.

4. Daten laden: Train/Validation/Test als TSVs.

5. Filtern: Nur die Spalten TRB_CDR3 (TCR), Epitope und Binding behalten und Zeilen mit fehlenden Werten entfernen.

6. Deskriptoren extrahieren: Pro Sequenz mit Peptide(seq).descriptors().

7. DataFrame bauen: Zwei DataFrames (tcr_, epi_), zusammenführen und Label anhängen.

8. Mapping speichern: Optional, damit Du Zeile→(TCR,Epitope) nachschlagen kannst.

9. Arrays: tcr_arr, epi_arr, labels als NumPy Arrays.

10. HDF5: Unter den Keys "tcr_raw", "epi_raw" und "binding" gespeichert.

11. Abschluss: wandb.finish()

In [2]:
# 1) Imports
import pandas as pd
import numpy as np
import h5py
import wandb
from peptides import Peptide
import os

In [2]:
# 2) Helper that extracts the physicochemical descriptors of one sequence
def get_descriptors(seq):
    """Return the descriptors of ``seq`` as computed by ``Peptide``.

    Args:
        seq: The sequence handed to ``Peptide``.

    Returns:
        Whatever ``Peptide(seq).descriptors()`` returns, or an empty dict
        when that call raises.
    """
    try:
        return Peptide(seq).descriptors()
    except Exception as err:
        # Best effort: one bad sequence must not abort the whole export.
        # The failure is logged instead of being dropped silently, because
        # the caller only sees an empty dict for this sequence.
        import logging  # local import: only needed on the failure path

        logging.getLogger(__name__).warning(
            "No descriptors for sequence %r (%s: %s)",
            seq,
            type(err).__name__,
            err,
        )
        return {}

In [3]:
# 3) Start the W&B run and download the dataset artifact
dataset_name = "beta_allele"

run_settings = {
    "project": "dataset-allele",
    "entity": "ba_cancerimmunotherapy",
    "job_type": "physchem_raw_export",
    "name": "raw_physchem_export",
}
wandb.init(**run_settings)

# Reference the ":latest" alias of the artifact and download its files
artifact_ref = f"ba_cancerimmunotherapy/dataset-allele/{dataset_name}:latest"
artifact = wandb.use_artifact(artifact_ref)
download_root = f"./WnB_Experiments_Datasets/{dataset_name}"
data_dir = artifact.download(download_root)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-frohofer[0m ([33mba_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact beta_allele:latest, 1001.05MB. 46 files... 
[34m[1mwandb[0m:   46 of 46 files downloaded.  
Done. 0:0:1.2


In [4]:
# 4) Load the TSV splits (train / validation / test)
paths = {
    "train":      os.path.join(data_dir, "allele/train.tsv"),
    "validation": os.path.join(data_dir, "allele/validation.tsv"),
    "test":       os.path.join(data_dir, "allele/test.tsv"),
}

# low_memory=False lets pandas infer every column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the recorded cell output echoes the validation read_csv line,
# which looks like a mixed-dtype warning - confirm against a fresh run.
splits = {
    split: pd.read_csv(tsv_path, sep="\t", low_memory=False)
    for split, tsv_path in paths.items()
}
df_train = splits["train"]
df_val = splits["validation"]
df_test = splits["test"]

# Stack all splits into one DataFrame with a fresh 0..N-1 index
df_beta = pd.concat([df_train, df_val, df_test], ignore_index=True)
print(f"[INFO] Gesamt-Samples: {len(df_beta):,}")

  df_val   = pd.read_csv(paths["validation"], sep="\t")


[INFO] Gesamt-Samples: 993,078


In [5]:
# 5) Keep only the required columns and drop rows with missing values
needed_columns = ["TRB_CDR3", "Epitope", "Binding"]
df_physchem = df_beta.loc[:, needed_columns].dropna()
n_kept = len(df_physchem)
print(f"[INFO] Nach Dropna: {n_kept:,} Samples")

[INFO] Nach Dropna: 993,078 Samples


In [None]:
# 6) Extract the raw descriptors
def _descriptors_per_row(sequences):
    """Return one descriptor dict per row of ``sequences``.

    ``get_descriptors`` is called once per distinct sequence and the result
    is reused for every row holding that sequence, so repeated sequences do
    not trigger repeated computation. Rows with the same sequence share the
    same dict object, which is fine as long as the dicts are only read.

    Args:
        sequences: pandas Series of sequences (no missing values expected,
            since the values are used as dict keys).

    Returns:
        A Series with the index and name of ``sequences`` whose values are
        the descriptor dicts.
    """
    by_sequence = {seq: get_descriptors(seq) for seq in sequences.unique()}
    return pd.Series(
        [by_sequence[seq] for seq in sequences],
        index=sequences.index,
        name=sequences.name,
    )


print("[INFO] Extrahiere TCR-Deskriptoren …")
tcr_desc = _descriptors_per_row(df_physchem["TRB_CDR3"])
print("[INFO] Extrahiere Epitope-Deskriptoren …")
epi_desc = _descriptors_per_row(df_physchem["Epitope"])

[INFO] Extrahiere TCR-Deskriptoren …
[INFO] Extrahiere Epitope-Deskriptoren …


In [None]:
# 7) Turn the descriptor dicts into DataFrames and join them column-wise
tcr_records = tcr_desc.tolist()
epi_records = epi_desc.tolist()
tcr_df = pd.DataFrame(tcr_records).add_prefix("tcr_")
epi_df = pd.DataFrame(epi_records).add_prefix("epi_")

desc_df = pd.concat([tcr_df, epi_df], axis=1)

# .values drops the index, so the labels are attached by row position
binding_values = df_physchem["Binding"].astype(np.float32).values
desc_df["binding"] = binding_values

n_rows, n_cols = desc_df.shape
print(f"[INFO] Feature-Matrix: {n_rows}×{n_cols}")

In [None]:
# 8) Save a lookup table (optional): row position -> (TCR, epitope)
mapping_path = "../../../data/physico/ple/physchem_raw_mapping.tsv"

# "idx" is the 0-based row position, independent of the DataFrame index
row_positions = np.arange(len(df_physchem))
mapping = df_physchem[["TRB_CDR3", "Epitope"]].assign(idx=row_positions)

mapping_dir = os.path.dirname(mapping_path)
os.makedirs(mapping_dir, exist_ok=True)

mapping.to_csv(mapping_path, sep="\t", index=False)
print(f"[INFO] Mapping gespeichert nach `{mapping_path}`")

In [None]:
# 9) Build the arrays
# Select feature columns by their prefix. An anchored startswith() is used
# instead of filter(like=...), because `like` matches the text anywhere in a
# column name and could pick up a column that merely contains "tcr_"/"epi_".
is_tcr_column = desc_df.columns.str.startswith("tcr_")
is_epi_column = desc_df.columns.str.startswith("epi_")

tcr_arr  = desc_df.loc[:, is_tcr_column].to_numpy(dtype=np.float32)
epi_arr  = desc_df.loc[:, is_epi_column].to_numpy(dtype=np.float32)
labels   = desc_df["binding"].to_numpy(dtype=np.float32)

print(f"[INFO] tcr_arr shape = {tcr_arr.shape}")
print(f"[INFO] epi_arr shape = {epi_arr.shape}")
print(f"[INFO] labels  shape = {labels.shape}")

In [None]:
# 10) Write the arrays to HDF5
output_path = "../../../data/physico/ple/descriptor_physchem_raw.h5"

output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

# Dataset key -> array; every dataset is stored gzip-compressed
raw_datasets = {
    "tcr_raw": tcr_arr,
    "epi_raw": epi_arr,
    "binding": labels,
}
with h5py.File(output_path, "w") as h5f:
    for key, values in raw_datasets.items():
        h5f.create_dataset(key, data=values, compression="gzip")
print(f"[INFO] Roh-Deskriptoren gespeichert in `{output_path}`")

In [None]:
# 11) Finish the W&B run started in step 3
wandb.finish()

# Autoencoder trainieren → descriptor_physchem_ple.h5

In [8]:
# 1) Setup & HDF5 check
# Load the raw physicochemical descriptors and sanity-check them before training.

import h5py
import numpy as np

# Path to the HDF5 file with the raw descriptors
RAW_H5 = "../../../data/physico/ple/descriptor_physchem_raw.h5"

with h5py.File(RAW_H5, "r") as f:
    tcr_raw = f["tcr_raw"][:]     # (N, D_tcr)
    epi_raw = f["epi_raw"][:]     # (N, D_epi)
    labels  = f["binding"][:]      # (N,)

print(f"tcr_raw shape: {tcr_raw.shape}")
print(f"epi_raw shape: {epi_raw.shape}")
print(f"labels   shape: {labels.shape}")

assert tcr_raw.size>0 and epi_raw.size>0 and labels.size>0, "🐞 mindestens eins Deiner Arrays ist leer!"

# The three datasets are used row by row together, so they must line up.
if not (len(tcr_raw) == len(epi_raw) == len(labels)):
    raise ValueError(
        f"Row counts differ: tcr_raw={len(tcr_raw)}, "
        f"epi_raw={len(epi_raw)}, binding={len(labels)}"
    )

# Non-finite features would turn the MSE reconstruction loss into NaN, so
# report them up front instead of discovering it during training.
row_is_finite = np.isfinite(tcr_raw).all(axis=1) & np.isfinite(epi_raw).all(axis=1)
n_bad_rows = int((~row_is_finite).sum())
if n_bad_rows:
    print(f"[WARN] {n_bad_rows:,} of {len(labels):,} rows contain NaN/inf descriptors")


In [None]:
# 2) Data matrix & loader
# Combine the TCR and epitope descriptors and wrap them in a simple dataset/loader.

import torch
from torch.utils.data import TensorDataset, DataLoader

# Full feature matrix: TCR columns first, epitope columns second -> (N, D_total)
feature_blocks = (tcr_raw, epi_raw)
X = np.hstack(feature_blocks).astype(np.float32)

# Unsupervised setting: the dataset holds only X, no targets
tensor_X = torch.from_numpy(X)
dataset = TensorDataset(tensor_X)

loader_options = {"batch_size": 256, "shuffle": True, "drop_last": True}
loader = DataLoader(dataset, **loader_options)

In [None]:
# 3) Autoencoder definition (PLE)
# A very simple feed-forward autoencoder.
# **latent_dim** can be set to e.g. 64, 128, 256, ...

import torch.nn as nn

latent_dim = 128                 # size of the latent code
input_dim  = X.shape[-1]         # number of feature columns of the 2-D matrix X
hidden_dim = input_dim // 2      # hidden layer is half as wide as the input

class PLEAutoencoder(nn.Module):
    """Feed-forward autoencoder with one hidden layer on each side.

    The encoder is Linear -> ReLU -> Linear -> ReLU, the decoder is
    Linear -> ReLU -> Linear back to the input width.

    Args:
        in_features: Width of the input. Defaults to the module-level
            ``input_dim``.
        hidden_features: Width of the hidden layers. Defaults to the
            module-level ``hidden_dim`` when ``in_features`` is not given,
            otherwise to ``in_features // 2``.
        latent_features: Width of the latent code. Defaults to the
            module-level ``latent_dim``.
    """

    def __init__(self, in_features=None, hidden_features=None, latent_features=None):
        super().__init__()

        # With no arguments the module-level sizes are used, so the
        # argument-free call PLEAutoencoder() keeps its previous behaviour.
        if in_features is None:
            in_features = input_dim
            default_hidden = hidden_dim
        else:
            default_hidden = in_features // 2
        if hidden_features is None:
            hidden_features = default_hidden
        if latent_features is None:
            latent_features = latent_dim

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, latent_features),
            nn.ReLU()
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, in_features)
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        z = self.encoder(x)
        x_rec = self.decoder(z)
        return x_rec

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = PLEAutoencoder()
model = model.to(device)


In [None]:
# 4) Training loop
# Minimise the MSE between the input and its reconstruction.

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

n_epochs = 30
for epoch in range(1, n_epochs+1):
    model.train()
    loss_sum = 0.0   # sum of per-sample losses over this epoch
    n_seen = 0       # samples actually processed in this epoch
    for (batch_X,) in loader:
        batch_X = batch_X.to(device)
        optimizer.zero_grad()
        recon = model(batch_X)
        loss  = criterion(recon, batch_X)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size
        loss_sum += loss.item() * batch_X.size(0)
        n_seen += batch_X.size(0)
    # Average over the samples that were seen, not over len(dataset): the
    # loader may skip samples (e.g. drop_last=True), and dividing by the full
    # dataset size would then under-report the loss. max() avoids a division
    # by zero when the loader yields no batch at all.
    epoch_loss = loss_sum / max(n_seen, 1)
    print(f"Epoch {epoch:02d}/{n_epochs}, MSE = {epoch_loss:.6f}")

In [None]:
# 5) Extract & save the embedding
# Compute the latent code for **all** samples and store it in
# `descriptor_physchem_ple.h5` under `ple`, together with `binding` so that
# rows stay in sync with the labels.

# Re-read the raw file so the row order is exactly the one stored on disk
with h5py.File(RAW_H5, "r") as f:
    tcr_raw = f["tcr_raw"][:]
    epi_raw = f["epi_raw"][:]
    labels  = f["binding"][:]

X_all = np.hstack([tcr_raw, epi_raw]).astype(np.float32)

# Encode in fixed-size batches instead of pushing the whole matrix through
# the encoder at once, which keeps device memory use bounded.
encode_batch_size = 8192
latent_chunks = []
model.eval()
with torch.no_grad():
    # max(..., 1) forces one pass even for an empty matrix, so Z is always
    # built from at least one encoder output.
    for start in range(0, max(len(X_all), 1), encode_batch_size):
        chunk = torch.from_numpy(X_all[start:start + encode_batch_size]).to(device)
        latent_chunks.append(model.encoder(chunk).cpu().numpy())
Z = np.concatenate(latent_chunks, axis=0)  # (N, latent_dim)

# Output HDF5
PLE_H5 = "../../../data/physico/ple/descriptor_physchem_ple.h5"
with h5py.File(PLE_H5, "w") as f:
    f.create_dataset("ple",     data=Z,      compression="gzip")
    f.create_dataset("binding", data=labels, compression="gzip")

print(f" PLE‐Embedding gespeichert in `{PLE_H5}` mit shape {Z.shape}")
