In [None]:
import pandas as pd
import numpy as np
import json
from peptides import Peptide
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import wandb

# --- Logging ---
def log(msg):
    """Write ``msg`` to stdout, prefixed with an ``[INFO]`` tag."""
    print("[INFO] {}".format(msg))

# --- Start the W&B run used for feature selection ---
dataset_name = "beta_allele"
wandb.init(
    name="feature_selection_run",
    job_type="physchem_selection",
    entity="ba_cancerimmunotherapy",
    project="dataset-allele",
)

# --- Fetch the dataset artifact into a local directory ---
artifact = wandb.use_artifact("ba_cancerimmunotherapy/dataset-allele/beta_allele:latest")
data_dir = artifact.download(f"./WnB_Experiments_Datasets/{dataset_name}")

# --- Locate the three split files inside the download ---
path_beta_train = f"{data_dir}/allele/train.tsv"
path_beta_validation = f"{data_dir}/allele/validation.tsv"
path_beta_test = f"{data_dir}/allele/test.tsv"

# --- Read the splits (tab-separated, no column used as index) ---
log("Lade Datensätze...")
df_beta_test, df_beta_validation, df_beta_train = (
    pd.read_csv(split_path, sep="\t", index_col=False)
    for split_path in (path_beta_test, path_beta_validation, path_beta_train)
)

# Stack the splits in the order test, validation, train. The original row
# labels of each split are kept, so the combined index may contain duplicates.
# NOTE(review): the test and validation splits end up in the data that the
# feature selection further down is fitted on - confirm this is intended.
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
log(f"Datensätze geladen: {len(df_beta)} Einträge")

# --- Keep only the modelling columns and drop rows with a missing value ---
df_physchem = df_beta.loc[:, ["TRB_CDR3", "Epitope", "Binding"]].dropna()
log(f"Gefiltert: {len(df_physchem)} Einträge mit TRB_CDR3, Epitope und Binding")

# --- Extract descriptors ---
def get_descriptors(seq):
    """Return the descriptors of a peptide sequence, or ``{}`` on failure.

    Args:
        seq: Sequence handed to ``Peptide``.

    Returns:
        Whatever ``Peptide(seq).descriptors()`` returns. If constructing the
        peptide or computing its descriptors raises an ordinary exception, an
        empty dict is returned instead, so a single bad sequence does not
        abort a bulk ``Series.apply``.
    """
    try:
        return Peptide(seq).descriptors()
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (instead of a bare
        # ``except``) lets KeyboardInterrupt and SystemExit propagate, so a
        # long-running extraction can still be interrupted.
        return {}

log("Extrahiere Deskriptoren für TCR...")
tcr_desc = df_physchem["TRB_CDR3"].apply(get_descriptors)
log("Extrahiere Deskriptoren für Epitope...")
epi_desc = df_physchem["Epitope"].apply(get_descriptors)

# --- Combine both descriptor tables and attach the label ---
# Both frames are built from plain lists, so they share a fresh positional
# index and line up row by row; the label is attached positionally as well.
tcr_df = pd.DataFrame(tcr_desc.tolist()).add_prefix("tcr_")
epi_df = pd.DataFrame(epi_desc.tolist()).add_prefix("epi_")
desc_df = pd.concat([tcr_df, epi_df], axis=1)
desc_df["label"] = df_physchem["Binding"].values

# get_descriptors() returns {} when a sequence cannot be processed, which
# shows up here as a row of NaN features. Remove those rows explicitly instead
# of handing NaNs to the scaler and the forest.
n_rows_before = len(desc_df)
desc_df = desc_df.dropna().reset_index(drop=True)
n_rows_dropped = n_rows_before - len(desc_df)
if n_rows_dropped:
    log(f"Verwerfe {n_rows_dropped} Einträge mit fehlenden Deskriptoren")

# --- Feature matrix & target ---
X = desc_df.drop(columns=["label"])
y = desc_df["label"]

# --- Standardise the features ---
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# --- Compute feature importances ---
# Fixed random_state keeps the ranking reproducible between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_scaled, y)

importances = pd.Series(rf.feature_importances_, index=X.columns)
top_n = 20
top_features = importances.sort_values(ascending=False).head(top_n)
selected_features = list(top_features.index)

# --- Persist the names of the top-N features ---
with open("selected_physchem_features.json", "w") as f:
    json.dump(selected_features, f, indent=2)

print("\nTop Features:")
print(top_features)
print("\n✅ Gespeichert als: selected_physchem_features.json")

# Close the W&B run opened at the top of this cell so it is not left active.
wandb.finish()

In [None]:
import os
import pandas as pd
import numpy as np
import json
import h5py
import wandb
import importlib
import ple_embedding as pe
#importlib.reload(pe)

# --- Logging ---
def log(msg):
    """Write ``msg`` to stdout, prefixed with an ``[INFO]`` tag."""
    print("[INFO] {}".format(msg))

# --- Initialise W&B ---
dataset_name = "beta_allele"
wandb.init(
    project="dataset-allele",
    entity="ba_cancerimmunotherapy",
    job_type="physchem_encoding",
    name="descriptor_feature_extraction"
)

# --- Download the dataset artifact ---
artifact = wandb.use_artifact("ba_cancerimmunotherapy/dataset-allele/beta_allele:latest")
data_dir = artifact.download(f"./WnB_Experiments_Datasets/{dataset_name}")

# --- Define file paths ---
path_beta_test = f"{data_dir}/allele/test.tsv"
path_beta_validation = f"{data_dir}/allele/validation.tsv"
path_beta_train = f"{data_dir}/allele/train.tsv"

# --- Load data ---
log("Lade Datensätze...")
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
log(f"Datensätze geladen: {len(df_beta)} Einträge")

# --- Extract the relevant columns, dropping rows with a missing value ---
df_physchem = df_beta[["TRB_CDR3", "Epitope", "Binding"]].dropna()
log(f"Gefiltert: {len(df_physchem)} Einträge mit TRB_CDR3, Epitope und Binding")

# --- Load the selected features and split them by origin ---
with open("selected_physchem_features.json") as f:
    all_features = json.load(f)

# Strip only the leading "tcr_"/"epi_" tag. str.replace() would also remove
# any later occurrence of the tag inside a descriptor name.
tcr_features = [name[len("tcr_"):] for name in all_features if name.startswith("tcr_")]
epi_features = [name[len("epi_"):] for name in all_features if name.startswith("epi_")]

# --- Feature encoding ---
log("Berechne Deskriptor-Features für TCR...")
df_physchem["tcr_encoded"] = df_physchem["TRB_CDR3"].apply(lambda s: pe.encode_descriptor_sequence(s, tcr_features))

log("Berechne Deskriptor-Features für Epitope...")
df_physchem["epi_encoded"] = df_physchem["Epitope"].apply(lambda s: pe.encode_descriptor_sequence(s, epi_features))

# --- Stack the per-row encodings into arrays ---
# No padding is applied here: np.stack needs every entry to have the same
# shape. Assumes pe.encode_descriptor_sequence returns equally shaped arrays
# for a given feature list - TODO confirm.
tcr_arr = np.stack(df_physchem["tcr_encoded"].to_numpy())
epi_arr = np.stack(df_physchem["epi_encoded"].to_numpy())
binding_arr = df_physchem["Binding"].to_numpy()

# --- Save as HDF5 ---
output_path = "../../data/physico/descriptor_encoded_physchem.h5"
# Create the target directory first; opening the file in "w" mode does not
# create missing parent directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
log(f"Speichere vorbereiteten DataFrame nach: {output_path}")
with h5py.File(output_path, "w") as h5f:
    h5f.create_dataset("tcr_encoded", data=tcr_arr)
    h5f.create_dataset("epi_encoded", data=epi_arr)
    h5f.create_dataset("binding", data=binding_arr)
log(f"Gespeichert als HDF5 unter: {output_path}")

# --- Finish ---
log("✅ Alles abgeschlossen und gespeichert!")
wandb.finish()

In [1]:
import json

# Read the feature names written by the selection step.
with open("selected_physchem_features.json") as fh:
    features = json.load(fh)

# Print them as a numbered list, with the rank right-aligned to width 2.
print("✅ Geladene Features:")
for rank, feature_name in enumerate(features, start=1):
    print("{:2d}. {}".format(rank, feature_name))


✅ Geladene Features:
 1. epi_SVGER2
 2. epi_VHSE5
 3. epi_ProtFP3
 4. epi_T4
 5. epi_BLOSUM9
 6. epi_Z4
 7. epi_MSWHIM3
 8. epi_BLOSUM10
 9. epi_PRIN1
10. epi_ST7
11. epi_KF1
12. epi_ProtFP1
13. epi_F2
14. epi_Z3
15. epi_PP3
16. epi_VSTPV3
17. epi_Z1
18. epi_E4
19. tcr_BLOSUM9
20. epi_MSWHIM2
