In [18]:
#!pip install tensorflow

In [20]:
# Smoke test: PLE-encode a single CDR3 sequence and check the resulting shape
import numpy as np
import importlib
import ple_embedding as pe
importlib.reload(pe)

#bins definieren die "numerischen Buckets" für jedes physikochemische Feature
bins = [
    np.linspace(-4.5, 4.5, 6),    # start, stop, number of bins: hydrophobicity
    np.linspace(75, 200, 6),      # mol weight
    np.linspace(-1, 1, 6),        # charge
    np.linspace(-4.5, 4.5, 6),    # kyte-doolittle
    np.linspace(70, 200, 6),      # (dummy)
    np.linspace(0, 12, 6),        # (dummy)
]

seq = "CASSLGQETQYF"
encoded = pe.encode_sequence_with_full_PLE(seq, bins)
print("Shape:", encoded.shape)

# 6 Werte = 5 Intervalle = 5 PLE-Features pro Attribut, Bei 6 Features → 6 × 5 = 30-Dimensionaler Vektor pro Position

Shape: (12, 30)


In [25]:
import os
import pandas as pd
import numpy as np
import pickle
import ple_embedding as pe
import h5py

# --- Logging ---
def log(msg):
    """Print ``msg`` to stdout, prefixed with the ``[INFO]`` tag.

    Args:
        msg: The message to emit; it is formatted with the default
            format spec, exactly as an f-string placeholder would.
    """
    print("[INFO] {}".format(msg))

# --- Define paths ---
precision = 'allele'
pipeline_data = '../../../data'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'

path_beta_test = f"{pipeline_data_splitted}/{precision}/beta/test.tsv"
path_beta_validation = f"{pipeline_data_splitted}/{precision}/beta/validation.tsv"
path_beta_train = f"{pipeline_data_splitted}/{precision}/beta/train.tsv"

# Every sequence matrix is padded to this many positions before stacking.
max_len = 43

# --- Load the three splits and combine them into one frame ---
log("Lade Datensätze...")
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
# ignore_index=True: each split is read with its own default index, so a
# plain concat would leave duplicate row labels in the combined frame.
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train], ignore_index=True)
log(f"Datensätze geladen: {len(df_beta)} Einträge")

# --- Keep only the relevant columns, dropping rows with missing values ---
df_physchem = df_beta[["TRB_CDR3", "Epitope", "Binding"]].dropna()
log(f"Gefiltert: {len(df_physchem)} Einträge mit TRB_CDR3, Epitope und Binding")

# What pad_encoded_sequences does with inputs longer than max_len is not
# visible from this notebook, so make that situation loud instead of silent.
longest_seq = max(
    df_physchem["TRB_CDR3"].map(len).max(),
    df_physchem["Epitope"].map(len).max(),
)
if longest_seq > max_len:
    log(f"WARNUNG: längste Sequenz ({longest_seq}) überschreitet max_len={max_len}")

# --- Define bins (bucket edges per physicochemical feature) ---
bins = [
    np.linspace(-4.5, 4.5, 6),    # hydrophobicity
    np.linspace(75, 200, 6),      # mol weight
    np.linspace(-1, 1, 6),        # charge
    np.linspace(-4.5, 4.5, 6),    # kyte-doolittle
    np.linspace(70, 200, 6),      # (dummy)
    np.linspace(0, 12, 6),        # (dummy)
]

# --- PLE encoding ---
log("Berechne PLE-Features für TCR...")
df_physchem["tcr_ple"] = df_physchem["TRB_CDR3"].apply(lambda s: pe.encode_sequence_with_full_PLE(s, bins))

log("Berechne PLE-Features für Epitope...")
df_physchem["epi_ple"] = df_physchem["Epitope"].apply(lambda s: pe.encode_sequence_with_full_PLE(s, bins))

# --- Padding ---
log("Wende Padding auf PLE-Features an...")
df_physchem["tcr_ple_pad"] = df_physchem["tcr_ple"].apply(lambda m: pe.pad_encoded_sequences([m], max_len=max_len)[0])
df_physchem["epi_ple_pad"] = df_physchem["epi_ple"].apply(lambda m: pe.pad_encoded_sequences([m], max_len=max_len)[0])

# --- Save ---
output_path = f"{pipeline_data}/physico/ple_encoded_physchem.h5"
# Opening the file in "w" mode fails if the target directory does not exist
# yet, so create it first (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)
log(f"Speichere vorbereiteten DataFrame nach: {output_path}")
with h5py.File(output_path, "w") as h5f:
    # Stack the per-row padded matrices into one array per dataset.
    h5f.create_dataset("tcr_ple", data=np.stack(df_physchem["tcr_ple_pad"].to_numpy()))
    h5f.create_dataset("epi_ple", data=np.stack(df_physchem["epi_ple_pad"].to_numpy()))
    h5f.create_dataset("binding", data=df_physchem["Binding"].to_numpy())
log(f"Gespeichert als HDF5 unter: {output_path}")
log("✅ Alles abgeschlossen und gespeichert!")

[INFO] Lade Datensätze...
[INFO] Datensätze geladen: 543844 Einträge
[INFO] Gefiltert: 543844 Einträge mit TRB_CDR3, Epitope und Binding
[INFO] Berechne PLE-Features für TCR...
[INFO] Berechne PLE-Features für Epitope...
[INFO] Wende Padding auf PLE-Features an...
[INFO] Speichere vorbereiteten DataFrame nach: ../../../data/physico/ple_encoded_physchem.h5
[INFO] Gespeichert als HDF5 unter: ../../../data/physico/ple_encoded_physchem.h5
[INFO] ✅ Alles abgeschlossen und gespeichert!


In [26]:
import h5py

path = "../../../data/physico/ple_encoded_physchem.h5"

# Open the file read-only and report which datasets it holds and their shapes.
with h5py.File(path, "r") as h5f:
    print("Datasets in File:", list(h5f.keys()))

    # Look up all three shapes first; printing happens only afterwards.
    tcr_shape, epi_shape, label_shape = (
        h5f[key].shape for key in ("tcr_ple", "epi_ple", "binding")
    )

    # One line per dataset, emitted with a single print call.
    print(
        f"tcr_ple shape: {tcr_shape}",
        f"epi_ple shape: {epi_shape}",
        f"binding shape: {label_shape}",
        sep="\n",
    )

Datasets in File: ['binding', 'epi_ple', 'tcr_ple']
tcr_ple shape: (543844, 43, 30)
epi_ple shape: (543844, 43, 30)
binding shape: (543844,)
