Bibliotheken

In [1]:
import os
import numpy as np
import subprocess
from sklearn.manifold import Isomap
from sklearn.preprocessing import StandardScaler
import cupy as cp
import torch
import pandas as pd
import h5py

GPU

In [2]:
torch.cuda.empty_cache()
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


Pfad

In [3]:
# Lade die Dateien
beta_all_epi = np.load('../../data/embeddings/beta/allele/Epitope_beta_embeddings.npz')
beta_all_trb = np.load('../../data/embeddings/beta/allele/TRB_beta_embeddings.npz')


***CHECK VOR DIMENSION REDUCTION***

Keys, Shape, Dimension von Embeddings

In [4]:
# Erhalte die Keys als Liste
epi_keys = list(beta_all_epi.keys())
trb_keys = list(beta_all_trb.keys())

# Drucke die ersten 5 und letzten 5 Keys
print(f"🔢 Anzahl der Keys in Epitope_beta_embeddings: {len(epi_keys)}")
print(f"📌 Erste 5 Keys: {epi_keys[:5]}")
print(f"📌 Letzte 5 Keys: {epi_keys[-5:]}")

print(f"🔢 Anzahl der Keys in TRB_beta_embeddings: {len(trb_keys)}")
print(f"📌 Erste 5 Keys: {trb_keys[:5]}")
print(f"📌 Letzte 5 Keys: {trb_keys[-5:]}")

# Prüfe die Form der ersten & letzten 5 Keys für beide Embeddings
print("\n🔹 Shapes der ersten 5 Epitope Keys:")
for key in epi_keys[:5]:
    print(f"  {key}: {beta_all_epi[key].shape}")

print("\n🔹 Shapes der letzten 5 Epitope Keys:")
for key in epi_keys[-5:]:
    print(f"  {key}: {beta_all_epi[key].shape}")

print("\n🔹 Shapes der ersten 5 TRB Keys:")
for key in trb_keys[:5]:
    print(f"  {key}: {beta_all_trb[key].shape}")

print("\n🔹 Shapes der letzten 5 TRB Keys:")
for key in trb_keys[-5:]:
    print(f"  {key}: {beta_all_trb[key].shape}")


🔢 Anzahl der Keys in Epitope_beta_embeddings: 1896
📌 Erste 5 Keys: ['KLVVLGINAV', 'GPGHKARVL', 'FRYMNSQGL', 'NMLSTVLGV', 'DSFKEELDKY']
📌 Letzte 5 Keys: ['ISQWLTNIF', 'SILDAVQRV', 'RLMKHYPGI', 'YLKLTDNVYIK', 'VSALSRAAEK']
🔢 Anzahl der Keys in TRB_beta_embeddings: 208722
📌 Erste 5 Keys: ['CASSQDNNEQFF', 'CASSVISRVGETQYF', 'CASNPTDGGETQYF', 'CASSGSYAPGADTQYF', 'CASSYPGQNNSPLHF']
📌 Letzte 5 Keys: ['CASSLSQGSYSTDTQYF', 'CASSSSGTTKQPQHF', 'CAGRHLLEAFF', 'CASSVDLSSYNEQFF', 'CASSSGLAGLGEQFF']

🔹 Shapes der ersten 5 Epitope Keys:
  KLVVLGINAV: (10, 1024)
  GPGHKARVL: (9, 1024)
  FRYMNSQGL: (9, 1024)
  NMLSTVLGV: (9, 1024)
  DSFKEELDKY: (10, 1024)

🔹 Shapes der letzten 5 Epitope Keys:
  ISQWLTNIF: (9, 1024)
  SILDAVQRV: (9, 1024)
  RLMKHYPGI: (9, 1024)
  YLKLTDNVYIK: (11, 1024)
  VSALSRAAEK: (10, 1024)

🔹 Shapes der ersten 5 TRB Keys:
  CASSQDNNEQFF: (12, 1024)
  CASSVISRVGETQYF: (15, 1024)
  CASNPTDGGETQYF: (14, 1024)
  CASSGSYAPGADTQYF: (16, 1024)
  CASSYPGQNNSPLHF: (15, 1024)

🔹 Shapes der le

***ISOMAP***

sklearn mit GPU-optimierter Berechnung durch CuPy & PyTorch

In [5]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" # Das erlaubt PyTorch, Speicher besser zu verwalten und Fragmentierung zu vermeiden

In [6]:
import torch
torch.cuda.empty_cache()

In [7]:
def apply_isomap_gpu_preprocessed(input_path, output_path, n_components=512, n_neighbors=10):
    """
    Nutzt GPU für Datenvorbereitung (Normalisierung mit Cupy/PyTorch), aber Isomap läuft auf CPU.
    """
    print(f"📂 Lade Datei: {input_path}")
    data = np.load(input_path, allow_pickle=True)

    isomap = Isomap(n_neighbors=n_neighbors, n_components=n_components)
    scaler = StandardScaler()

    all_embeddings = []
    key_list = []

    # 1️⃣ Verarbeitung der Keys auf GPU mit Cupy
    for key in data.files:
        tensor = torch.tensor(data[key], dtype=torch.float32, device="cuda")  # GPU
        all_embeddings.append(tensor)
        key_list.append(key)

    all_embeddings = torch.cat(all_embeddings, dim=0)  # Auf der GPU kombinieren
    print(f"🔢 Gesamtshape vor Isomap: {all_embeddings.shape}")

    # Normalisierung auf der GPU
    all_embeddings = (all_embeddings - all_embeddings.mean()) / all_embeddings.std()

    # Zur CPU zurück, bevor Isomap aufgerufen wird
    all_embeddings_cpu = all_embeddings.cpu().numpy()

    # 2️⃣ CPU-basiertes Isomap Fit & Transform
    reduced_embeddings = isomap.fit_transform(all_embeddings_cpu)

    print(f"✅ Gesamtshape nach Isomap: {reduced_embeddings.shape}")

    # 3️⃣ Zurück auf Keys aufteilen & speichern
    start_idx = 0
    reduced_data = {}

    for i, key in enumerate(key_list):
        num_samples = data[key].shape[0]
        reduced_data[key] = reduced_embeddings[start_idx:start_idx+num_samples]
        start_idx += num_samples

    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    np.savez_compressed(output_path, **reduced_data)
    print(f"💾 Reduzierte Embeddings gespeichert unter: {output_path}")

# === Isomap für Epitope-Embeddings ===
apply_isomap_gpu_preprocessed(
    input_path='../../data/embeddings/beta/allele/Epitope_beta_embeddings.npz',
    output_path='../../data/embeddings/beta/allele/isomap/Epitope_beta_embeddings_reduced.npz',
    n_components=512,
    n_neighbors=10
)

''' ***TCR MUSS IN BATCHES VERARBEITET WERDEN***
# === Isomap für TCR-Embeddings ===
apply_isomap_gpu_preprocessed(
    input_path='../../data/embeddings/beta/allele/TRB_beta_embeddings.npz',
    output_path='../../data/embeddings/beta/allele/isomap/TRB_beta_embeddings_reduced.npz',
    n_components=512,
    n_neighbors=10
)
'''

📂 Lade Datei: ../../data/embeddings/beta/allele/Epitope_beta_embeddings.npz
🔢 Gesamtshape vor Isomap: torch.Size([19260, 1024])
✅ Gesamtshape nach Isomap: (19260, 512)
💾 Reduzierte Embeddings gespeichert unter: ../../data/embeddings/beta/allele/isomap/Epitope_beta_embeddings_reduced.npz


" ***TCR MUSS IN BATCHES VERARBEITET WERDEN***\n# === Isomap für TCR-Embeddings ===\napply_isomap_gpu_preprocessed(\n    input_path='../../data/embeddings/beta/allele/TRB_beta_embeddings.npz',\n    output_path='../../data/embeddings/beta/allele/isomap/TRB_beta_embeddings_reduced.npz',\n    n_components=512,\n    n_neighbors=10\n)\n"

TCRB mit Batches

In [8]:
import numpy as np
import torch
from sklearn.manifold import Isomap
import os

def apply_isomap_gpu_batched(input_path, output_path, n_components=512, n_neighbors=50, batch_size=2000):
    """
    Nutzt GPU für Datenvorbereitung (Normalisierung mit PyTorch), aber verarbeitet TRB in Batches, um Speicherprobleme zu vermeiden.
    """
    print(f"📂 Lade Datei: {input_path}")
    data_npz = np.load(input_path, allow_pickle=True)

    # **Daten in ein veränderbares Dictionary laden & zu float32 konvertieren**
    data = {key: data_npz[key].astype(np.float32) for key in data_npz.files}

    isomap = Isomap(n_neighbors=n_neighbors, n_components=n_components)
    key_list = list(data.keys())

    reduced_embeddings = []

    for i in range(0, len(key_list), batch_size):
        batch_keys = key_list[i:i + batch_size]

        # **1️⃣ Lade Daten zuerst in CPU-Speicher**
        batch = [torch.tensor(data[key], dtype=torch.float32) for key in batch_keys]  # Erst auf CPU
        batch = torch.cat(batch, dim=0).to("cuda")  # Erst dann auf GPU schieben
        print(f"🔢 Batch {i//batch_size + 1}: {batch.shape}")

        # **2️⃣ Normalisierung auf GPU**
        batch = (batch - batch.mean()) / batch.std()

        # **3️⃣ Schiebe den Batch zurück auf die CPU für Isomap**
        batch_cpu = batch.cpu().numpy()
        del batch  # GPU-Speicher leeren
        torch.cuda.empty_cache()

        # **4️⃣ CPU-basiertes Isomap Fit & Transform**
        if i == 0:
            reduced_embeddings = isomap.fit_transform(batch_cpu)
        else:
            reduced_embeddings = np.vstack((reduced_embeddings, isomap.transform(batch_cpu)))

        del batch_cpu
        torch.cuda.empty_cache()

    print(f"✅ Gesamtshape nach Isomap: {reduced_embeddings.shape}")

    # **5️⃣ Zurück auf Keys aufteilen & speichern**
    start_idx = 0
    reduced_data = {}

    for key in key_list:
        num_samples = data[key].shape[0]
        reduced_data[key] = reduced_embeddings[start_idx:start_idx+num_samples]
        start_idx += num_samples

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    np.savez_compressed(output_path, **reduced_data)
    print(f"💾 Reduzierte Embeddings gespeichert unter: {output_path}")

# **Isomap für TCR-Embeddings (TRB ist groß, daher Batches!)**
apply_isomap_gpu_batched(
    input_path='../../data/embeddings/beta/allele/TRB_beta_embeddings.npz',
    output_path='../../data/embeddings/beta/allele/isomap/TRB_beta_embeddings_reduced.npz',
    n_components=512,
    n_neighbors=50,
    batch_size=1000  # Kleinere Batches für besseren Swap-Einsatz
)


📂 Lade Datei: ../../data/embeddings/beta/allele/TRB_beta_embeddings.npz
🔢 Batch 1: torch.Size([14580, 1024])
🔢 Batch 2: torch.Size([14716, 1024])
🔢 Batch 3: torch.Size([14751, 1024])
🔢 Batch 4: torch.Size([14754, 1024])
🔢 Batch 5: torch.Size([14679, 1024])
🔢 Batch 6: torch.Size([14719, 1024])
🔢 Batch 7: torch.Size([14591, 1024])
🔢 Batch 8: torch.Size([14607, 1024])
🔢 Batch 9: torch.Size([14665, 1024])
🔢 Batch 10: torch.Size([14767, 1024])
🔢 Batch 11: torch.Size([14652, 1024])
🔢 Batch 12: torch.Size([14705, 1024])
🔢 Batch 13: torch.Size([14602, 1024])
🔢 Batch 14: torch.Size([14757, 1024])
🔢 Batch 15: torch.Size([14637, 1024])
🔢 Batch 16: torch.Size([14634, 1024])
🔢 Batch 17: torch.Size([14692, 1024])
🔢 Batch 18: torch.Size([14615, 1024])
🔢 Batch 19: torch.Size([14678, 1024])
🔢 Batch 20: torch.Size([14754, 1024])
🔢 Batch 21: torch.Size([14638, 1024])
🔢 Batch 22: torch.Size([14723, 1024])
🔢 Batch 23: torch.Size([14628, 1024])
🔢 Batch 24: torch.Size([14535, 1024])
🔢 Batch 25: torch.Size([1

***CHECK NACH DIMENSION REDUCTION***

Keys, Shape, Dimension von Embeddings (reduced mit ISOMAP)

In [9]:
# Lade die reduzierten Dateien nach Isomap
beta_all_epi_reduced = np.load('../../data/embeddings/beta/allele/isomap/Epitope_beta_embeddings_reduced.npz')
beta_all_trb_reduced = np.load('../../data/embeddings/beta/allele/isomap/TRB_beta_embeddings_reduced.npz')

# Erhalte die Keys als Liste
epi_keys_reduced = list(beta_all_epi_reduced.keys())
trb_keys_reduced = list(beta_all_trb_reduced.keys())

# Drucke die ersten 5 und letzten 5 Keys
print(f"🔢 Anzahl der Keys in Epitope_beta_embeddings_reduced: {len(epi_keys_reduced)}")
print(f"📌 Erste 5 Keys: {epi_keys_reduced[:5]}")
print(f"📌 Letzte 5 Keys: {epi_keys_reduced[-5:]}")


print(f"🔢 Anzahl der Keys in TRB_beta_embeddings_reduced: {len(trb_keys_reduced)}")
print(f"📌 Erste 5 Keys: {trb_keys_reduced[:5]}")
print(f"📌 Letzte 5 Keys: {trb_keys_reduced[-5:]}")

# Prüfe die Form der ersten & letzten 5 Keys für beide Embeddings
print("\n🔹 Shapes der ersten 5 Epitope Keys nach Isomap:")
for key in epi_keys_reduced[:5]:
    print(f"  {key}: {beta_all_epi_reduced[key].shape}")

print("\n🔹 Shapes der letzten 5 Epitope Keys nach Isomap:")
for key in epi_keys_reduced[-5:]:
    print(f"  {key}: {beta_all_epi_reduced[key].shape}")

print("\n🔹 Shapes der ersten 5 TRB Keys nach Isomap:")
for key in trb_keys_reduced[:5]:
    print(f"  {key}: {beta_all_trb_reduced[key].shape}")

print("\n🔹 Shapes der letzten 5 TRB Keys nach Isomap:")
for key in trb_keys_reduced[-5:]:
    print(f"  {key}: {beta_all_trb_reduced[key].shape}")



🔢 Anzahl der Keys in Epitope_beta_embeddings_reduced: 1896
📌 Erste 5 Keys: ['KLVVLGINAV', 'GPGHKARVL', 'FRYMNSQGL', 'NMLSTVLGV', 'DSFKEELDKY']
📌 Letzte 5 Keys: ['ISQWLTNIF', 'SILDAVQRV', 'RLMKHYPGI', 'YLKLTDNVYIK', 'VSALSRAAEK']
🔢 Anzahl der Keys in TRB_beta_embeddings_reduced: 208722
📌 Erste 5 Keys: ['CASSQDNNEQFF', 'CASSVISRVGETQYF', 'CASNPTDGGETQYF', 'CASSGSYAPGADTQYF', 'CASSYPGQNNSPLHF']
📌 Letzte 5 Keys: ['CASSLSQGSYSTDTQYF', 'CASSSSGTTKQPQHF', 'CAGRHLLEAFF', 'CASSVDLSSYNEQFF', 'CASSSGLAGLGEQFF']

🔹 Shapes der ersten 5 Epitope Keys nach Isomap:
  KLVVLGINAV: (10, 512)
  GPGHKARVL: (9, 512)
  FRYMNSQGL: (9, 512)
  NMLSTVLGV: (9, 512)
  DSFKEELDKY: (10, 512)

🔹 Shapes der letzten 5 Epitope Keys nach Isomap:
  ISQWLTNIF: (9, 512)
  SILDAVQRV: (9, 512)
  RLMKHYPGI: (9, 512)
  YLKLTDNVYIK: (11, 512)
  VSALSRAAEK: (10, 512)

🔹 Shapes der ersten 5 TRB Keys nach Isomap:
  CASSQDNNEQFF: (12, 512)
  CASSVISRVGETQYF: (15, 512)
  CASNPTDGGETQYF: (14, 512)
  CASSGSYAPGADTQYF: (16, 512)
  CASSYP

***Padding auf ISOMAP***

TRAIN

In [10]:
# === Lade den Trainingsdatensatz ===
train_path = '../../data/splitted_datasets/allele/beta/train.tsv'
train_data = pd.read_csv(train_path, sep='\t', low_memory=False)

# === Lade die reduzierten Isomap-Embeddings ===
tcr_embeddings_path = '../../data/embeddings/beta/allele/isomap/TRB_beta_embeddings_reduced.npz'
epitope_embeddings_path = '../../data/embeddings/beta/allele/isomap/Epitope_beta_embeddings_reduced.npz'

tcr_data = np.load(tcr_embeddings_path, allow_pickle=True)
epitope_data = np.load(epitope_embeddings_path, allow_pickle=True)

# === Extrahiere die Keys aus dem Trainingsdatensatz ===
tcr_keys = train_data['TRB_CDR3'].dropna().tolist()
epitope_keys = train_data['Epitope'].dropna().tolist()

# === Nur Keys behalten, die in den Embeddings existieren ===
'''
wurde erstetzt mit === Schnelle Version mit set() ===
tcr_keys = [key for key in tcr_keys if key in tcr_data]
epitope_keys = [key for key in epitope_keys if key in epitope_data]
'''
# === Schnelle Version mit set() ===
tcr_keys_set = set(tcr_data.files)  # Mache eine schnelle Hash-Map (Set) für Keys
epitope_keys_set = set(epitope_data.files)

tcr_keys = [key for key in tcr_keys if key in tcr_keys_set]
epitope_keys = [key for key in epitope_keys if key in epitope_keys_set]


# === Dictionaries mit den Trainings-Embeddings erstellen ===
tcr_train_dict = {key: tcr_data[key] for key in tcr_keys}
epitope_train_dict = {key: epitope_data[key] for key in epitope_keys}

# === Maximaler Padding-Wert bestimmen ===
max_tcr_length = max(embedding.shape[0] for embedding in tcr_train_dict.values())
max_epitope_length = max(embedding.shape[0] for embedding in epitope_train_dict.values())

max_length = max(max_tcr_length, max_epitope_length)  # Einheitliche Länge für Transformer

print(f"📌 Max Length: {max_length} (TCR: {max_tcr_length}, Epitope: {max_epitope_length})")

# === Padding-Funktion ===
def pad_embedding(embedding, max_length):
    """
    Padded ein einzelnes Embedding mit Nullen auf max_length.
    """
    padded = np.zeros((max_length, embedding.shape[1]), dtype=embedding.dtype)
    padded[:embedding.shape[0], :] = embedding  # Originalwerte behalten, Rest mit 0 füllen
    return padded

# === Speicherpfade für Trainingsdaten setzen ===
train_tcr_padded_path = '../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches'
train_epitope_padded_path = '../../data/embeddings/beta/allele/padded_isomap/train_epitope_padded_batches'

os.makedirs(train_tcr_padded_path, exist_ok=True)
os.makedirs(train_epitope_padded_path, exist_ok=True)

# === Speicher-Funktion mit Batch-Mechanismus ===
def save_padded_embeddings_in_batches(embeddings_dict, save_dir, batch_size=5000):
    keys = list(embeddings_dict.keys())
    num_batches = (len(keys) + batch_size - 1) // batch_size  # Anzahl der Batches berechnen

    for i in range(num_batches):
        batch_keys = keys[i * batch_size: (i + 1) * batch_size]
        padded_batch = {key: pad_embedding(embeddings_dict[key], max_length) for key in batch_keys}
        
        batch_save_path = os.path.join(save_dir, f"batch_{i}.npz")
        np.savez_compressed(batch_save_path, **padded_batch)
        print(f"✅ Saved batch {i + 1}/{num_batches} to {batch_save_path}")

    print("✅ All batches saved successfully!")

# === Train-Embeddings padden und speichern ===
save_padded_embeddings_in_batches(tcr_train_dict, train_tcr_padded_path, batch_size=5000)
save_padded_embeddings_in_batches(epitope_train_dict, train_epitope_padded_path, batch_size=5000)

# === Finale HDF5-Dateien aus gepaddeten Batches erstellen ===
def combine_selected_batches_to_hdf5(batch_files, output_path):
    """
    Kombiniert eine spezifische Liste von Batch-Dateien zu einer einzigen HDF5-Datei.
    """
    if not batch_files:
        print(f"❌ Keine Batch-Dateien in der Liste gefunden.")
        return

    with h5py.File(output_path, 'w') as hdf5_file:
        for i, batch_file in enumerate(batch_files):
            batch = np.load(batch_file, allow_pickle=True)

            for key in batch.files:
                if key not in hdf5_file:
                    hdf5_file.create_dataset(key, data=batch[key], compression="gzip")
                else:
                    print(f"⚠️ Duplikat-Key übersprungen: {key}")

            print(f"🔄 Batch {i+1}/{len(batch_files)} verarbeitet: {batch_file}")

    print(f"✅ Finale gepaddete Embeddings gespeichert unter: {output_path}")

# === TCR & Epitope Batches für Train zusammenführen ===
train_tcr_batches = [f"../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_{i}.npz" for i in range(34)]
train_epitope_batches = ["../../data/embeddings/beta/allele/padded_isomap/train_epitope_padded_batches/batch_0.npz"]

combine_selected_batches_to_hdf5(
    batch_files=train_tcr_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_train_tcr_embeddings_final.h5'
)

combine_selected_batches_to_hdf5(
    batch_files=train_epitope_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_train_epitope_embeddings_final.h5'
)

# === Überprüfe die HDF5-Dateien ===
def check_hdf5_file(file_path):
    with h5py.File(file_path, 'r') as hdf5_file:
        keys = list(hdf5_file.keys())
        print(f"✅ HDF5-Datei geladen: {file_path}")
        print(f"Anzahl Keys: {len(keys)}")
        print(f"Beispiel-Keys: {keys[:5]}")

# Überprüfe Train-TCR
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_train_tcr_embeddings_final.h5')

# Überprüfe Train-Epitope
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_train_epitope_embeddings_final.h5')


📌 Max Length: 43 (TCR: 38, Epitope: 43)
✅ Saved batch 1/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_0.npz
✅ Saved batch 2/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_1.npz
✅ Saved batch 3/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_2.npz
✅ Saved batch 4/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_3.npz
✅ Saved batch 5/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_4.npz
✅ Saved batch 6/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_5.npz
✅ Saved batch 7/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_6.npz
✅ Saved batch 8/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_7.npz
✅ Saved batch 9/34 to ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_8.n

FileNotFoundError: [Errno 2] No such file or directory: '../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_34.npz'

In [11]:
# === Finale HDF5-Dateien aus gepaddeten Batches erstellen ===
def combine_selected_batches_to_hdf5(batch_files, output_path):
    """
    Kombiniert eine spezifische Liste von Batch-Dateien zu einer einzigen HDF5-Datei.
    """
    if not batch_files:
        print(f"❌ Keine Batch-Dateien in der Liste gefunden.")
        return

    with h5py.File(output_path, 'w') as hdf5_file:
        for i, batch_file in enumerate(batch_files):
            batch = np.load(batch_file, allow_pickle=True)

            for key in batch.files:
                if key not in hdf5_file:
                    hdf5_file.create_dataset(key, data=batch[key], compression="gzip")
                else:
                    print(f"⚠️ Duplikat-Key übersprungen: {key}")

            print(f"🔄 Batch {i+1}/{len(batch_files)} verarbeitet: {batch_file}")

    print(f"✅ Finale gepaddete Embeddings gespeichert unter: {output_path}")

# === TCR & Epitope Batches für Train zusammenführen ===
train_tcr_batches = [f"../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_{i}.npz" for i in range(34)]
train_epitope_batches = ["../../data/embeddings/beta/allele/padded_isomap/train_epitope_padded_batches/batch_0.npz"]

combine_selected_batches_to_hdf5(
    batch_files=train_tcr_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_train_tcr_embeddings_final.h5'
)

combine_selected_batches_to_hdf5(
    batch_files=train_epitope_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_train_epitope_embeddings_final.h5'
)

# === Überprüfe die HDF5-Dateien ===
def check_hdf5_file(file_path):
    with h5py.File(file_path, 'r') as hdf5_file:
        keys = list(hdf5_file.keys())
        print(f"✅ HDF5-Datei geladen: {file_path}")
        print(f"Anzahl Keys: {len(keys)}")
        print(f"Beispiel-Keys: {keys[:5]}")

# Überprüfe Train-TCR
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_train_tcr_embeddings_final.h5')

# Überprüfe Train-Epitope
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_train_epitope_embeddings_final.h5')

🔄 Batch 1/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_0.npz
🔄 Batch 2/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_1.npz
🔄 Batch 3/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_2.npz
🔄 Batch 4/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_3.npz
🔄 Batch 5/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_4.npz
🔄 Batch 6/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_5.npz
🔄 Batch 7/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_6.npz
🔄 Batch 8/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_7.npz
🔄 Batch 9/34 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/train_tcr_padded_batches/batch_8.npz
🔄

VALIDATION

In [4]:
max_length = 43

In [5]:
# === Speicher-Funktion mit Batch-Mechanismus ===
def save_padded_embeddings_in_batches(embeddings_dict, save_dir, batch_size=5000):
    keys = list(embeddings_dict.keys())
    num_batches = (len(keys) + batch_size - 1) // batch_size  # Anzahl der Batches berechnen

    for i in range(num_batches):
        batch_keys = keys[i * batch_size: (i + 1) * batch_size]
        padded_batch = {key: pad_embedding(embeddings_dict[key], max_length) for key in batch_keys}
        
        batch_save_path = os.path.join(save_dir, f"batch_{i}.npz")
        np.savez_compressed(batch_save_path, **padded_batch)
        print(f"✅ Saved batch {i + 1}/{num_batches} to {batch_save_path}")

    print("✅ All batches saved successfully!")

# === Finale HDF5-Dateien aus gepaddeten Batches erstellen ===
def combine_selected_batches_to_hdf5(batch_files, output_path):
    """
    Kombiniert eine spezifische Liste von Batch-Dateien zu einer einzigen HDF5-Datei.
    """
    if not batch_files:
        print(f"❌ Keine Batch-Dateien in der Liste gefunden.")
        return

    with h5py.File(output_path, 'w') as hdf5_file:
        for i, batch_file in enumerate(batch_files):
            batch = np.load(batch_file, allow_pickle=True)

            for key in batch.files:
                if key not in hdf5_file:
                    hdf5_file.create_dataset(key, data=batch[key], compression="gzip")
                else:
                    print(f"⚠️ Duplikat-Key übersprungen: {key}")

            print(f"🔄 Batch {i+1}/{len(batch_files)} verarbeitet: {batch_file}")

    print(f"✅ Finale gepaddete Embeddings gespeichert unter: {output_path}")

# === Überprüfe die HDF5-Dateien ===
def check_hdf5_file(file_path):
    with h5py.File(file_path, 'r') as hdf5_file:
        keys = list(hdf5_file.keys())
        print(f"✅ HDF5-Datei geladen: {file_path}")
        print(f"Anzahl Keys: {len(keys)}")
        print(f"Beispiel-Keys: {keys[:5]}")

# === Padding-Funktion ===
def pad_embedding(embedding, max_length):
    """
    Padded ein einzelnes Embedding mit Nullen auf max_length.
    """
    padded = np.zeros((max_length, embedding.shape[1]), dtype=embedding.dtype)
    padded[:embedding.shape[0], :] = embedding  # Originalwerte behalten, Rest mit 0 füllen
    return padded

In [13]:
# === Lade den Validierungsdatensatz ===
validation_path = '../../data/splitted_datasets/allele/beta/validation.tsv'
valid_data = pd.read_csv(validation_path, sep='\t', low_memory=False)

# === Lade die Keys aus dem Validierungsdatensatz ===
valid_tcr_keys = valid_data['TRB_CDR3'].dropna().tolist()
valid_epitope_keys = valid_data['Epitope'].dropna().tolist()

# === Lade die reduzierten Isomap-Embeddings ===
tcr_embeddings_path = '../../data/embeddings/beta/allele/isomap/TRB_beta_embeddings_reduced.npz'
epitope_embeddings_path = '../../data/embeddings/beta/allele/isomap/Epitope_beta_embeddings_reduced.npz'

tcr_data = np.load(tcr_embeddings_path, allow_pickle=True)
epitope_data = np.load(epitope_embeddings_path, allow_pickle=True)

# === Schnelle Version mit set() ===
tcr_keys_set = set(tcr_data.files)  # Mache eine schnelle Hash-Map (Set) für Keys
epitope_keys_set = set(epitope_data.files)

valid_tcr_keys = [key for key in valid_tcr_keys if key in tcr_keys_set]
valid_epitope_keys = [key for key in valid_epitope_keys if key in epitope_keys_set]


In [14]:
# === Dictionaries für Validierungs-Embeddings erstellen ===
valid_tcr_embeddings_dict = {key: tcr_data[key] for key in valid_tcr_keys}
valid_epitope_embeddings_dict = {key: epitope_data[key] for key in valid_epitope_keys}

In [15]:
# === Speicherpfade für Validierungsdaten setzen ===
valid_tcr_padded_path = '../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches'
valid_epitope_padded_path = '../../data/embeddings/beta/allele/padded_isomap/valid_epitope_padded_batches'

os.makedirs(valid_tcr_padded_path, exist_ok=True)
os.makedirs(valid_epitope_padded_path, exist_ok=True)

In [16]:
# === Validierungsdaten padden und speichern ===
save_padded_embeddings_in_batches(valid_tcr_embeddings_dict, valid_tcr_padded_path, batch_size=5000)
save_padded_embeddings_in_batches(valid_epitope_embeddings_dict, valid_epitope_padded_path, batch_size=5000)

✅ Saved batch 1/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_0.npz
✅ Saved batch 2/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_1.npz
✅ Saved batch 3/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_2.npz
✅ Saved batch 4/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_3.npz
✅ Saved batch 5/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_4.npz
✅ Saved batch 6/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_5.npz
✅ Saved batch 7/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_6.npz
✅ Saved batch 8/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_7.npz
✅ Saved batch 9/17 to ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_8.npz
✅ Saved batch 10/17 to ../../data/emb

In [6]:
# === Validierungs-Batches zusammenführen ===
valid_tcr_batches = [f"../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_{i}.npz" for i in range(17)]
valid_epitope_batches = ["../../data/embeddings/beta/allele/padded_isomap/valid_epitope_padded_batches/batch_0.npz"]

combine_selected_batches_to_hdf5(
    batch_files=valid_tcr_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_valid_tcr_embeddings_final.h5'
)

combine_selected_batches_to_hdf5(
    batch_files=valid_epitope_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_valid_epitope_embeddings_final.h5'
)

🔄 Batch 1/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_0.npz
🔄 Batch 2/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_1.npz
🔄 Batch 3/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_2.npz
🔄 Batch 4/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_3.npz
🔄 Batch 5/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_4.npz
🔄 Batch 6/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_5.npz
🔄 Batch 7/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_6.npz
🔄 Batch 8/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_7.npz
🔄 Batch 9/17 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/valid_tcr_padded_batches/batch_8.npz
🔄

In [7]:
# Überprüfe Valid-TCR
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_valid_tcr_embeddings_final.h5')

# Überprüfe Valid-Epitope
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_valid_epitope_embeddings_final.h5')

✅ HDF5-Datei geladen: ../../data/embeddings/beta/allele/padded_isomap/padded_valid_tcr_embeddings_final.h5
Anzahl Keys: 80751
Beispiel-Keys: ['C*EFVGLAGGCTDTQYF', 'C*GTRPSISVPAA**RETGELFF', 'C*GTRPSISVPAACTRGPSYEQYF', 'C*GVGAG*DEQYF', 'C*HRVPTNYGYTF']
✅ HDF5-Datei geladen: ../../data/embeddings/beta/allele/padded_isomap/padded_valid_epitope_embeddings_final.h5
Anzahl Keys: 1515
Beispiel-Keys: ['AAASATLAL', 'AAGIGILTV', 'AALPILFQV', 'AARAVFLAL', 'AARGPHGGAASGL']


TEST

In [8]:
# === Lade den Testdatensatz ===
test_path = '../../data/splitted_datasets/allele/beta/test.tsv'
test_data = pd.read_csv(test_path, sep='\t', low_memory=False)

# === Lade die Keys aus dem Testdatensatz ===
test_tcr_keys = test_data['TRB_CDR3'].dropna().tolist()
test_epitope_keys = test_data['Epitope'].dropna().tolist()

# === Lade die reduzierten Isomap-Embeddings ===
tcr_embeddings_path = '../../data/embeddings/beta/allele/isomap/TRB_beta_embeddings_reduced.npz'
epitope_embeddings_path = '../../data/embeddings/beta/allele/isomap/Epitope_beta_embeddings_reduced.npz'

tcr_data = np.load(tcr_embeddings_path, allow_pickle=True)
epitope_data = np.load(epitope_embeddings_path, allow_pickle=True)

# === Schnelle Version mit set() ===
tcr_keys_set = set(tcr_data.files)  # Mache eine schnelle Hash-Map (Set) für Keys
epitope_keys_set = set(epitope_data.files)

test_tcr_keys = [key for key in test_tcr_keys if key in tcr_keys_set]
test_epitope_keys = [key for key in test_epitope_keys if key in epitope_keys_set]

# === Dictionaries für Test-Embeddings erstellen ===
test_tcr_embeddings_dict = {key: tcr_data[key] for key in test_tcr_keys}
test_epitope_embeddings_dict = {key: epitope_data[key] for key in test_epitope_keys}

# === Speicherpfade für Testdaten setzen ===
test_tcr_padded_path = '../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches'
test_epitope_padded_path = '../../data/embeddings/beta/allele/padded_isomap/test_epitope_padded_batches'

os.makedirs(test_tcr_padded_path, exist_ok=True)
os.makedirs(test_epitope_padded_path, exist_ok=True)

# === Testdaten padden und speichern ===
save_padded_embeddings_in_batches(test_tcr_embeddings_dict, test_tcr_padded_path, batch_size=5000)
save_padded_embeddings_in_batches(test_epitope_embeddings_dict, test_epitope_padded_path, batch_size=5000)

✅ Saved batch 1/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_0.npz
✅ Saved batch 2/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_1.npz
✅ Saved batch 3/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_2.npz
✅ Saved batch 4/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_3.npz
✅ Saved batch 5/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_4.npz
✅ Saved batch 6/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_5.npz
✅ Saved batch 7/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_6.npz
✅ Saved batch 8/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_7.npz
✅ Saved batch 9/9 to ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_8.npz
✅ All batches saved successfully!
✅ Saved batch 1/1 to 

In [10]:
# === Test-Batches zusammenführen ===
test_tcr_batches = [f"../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_{i}.npz" for i in range(9)]
test_epitope_batches = ["../../data/embeddings/beta/allele/padded_isomap/test_epitope_padded_batches/batch_0.npz"]

combine_selected_batches_to_hdf5(
    batch_files=test_tcr_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_test_tcr_embeddings_final.h5'
)

combine_selected_batches_to_hdf5(
    batch_files=test_epitope_batches,
    output_path='../../data/embeddings/beta/allele/padded_isomap/padded_test_epitope_embeddings_final.h5'
)

# === Überprüfe Test-HDF5-Dateien ===
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_test_tcr_embeddings_final.h5')
check_hdf5_file('../../data/embeddings/beta/allele/padded_isomap/padded_test_epitope_embeddings_final.h5')

🔄 Batch 1/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_0.npz
🔄 Batch 2/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_1.npz
🔄 Batch 3/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_2.npz
🔄 Batch 4/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_3.npz
🔄 Batch 5/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_4.npz
🔄 Batch 6/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_5.npz
🔄 Batch 7/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_6.npz
🔄 Batch 8/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_7.npz
🔄 Batch 9/9 verarbeitet: ../../data/embeddings/beta/allele/padded_isomap/test_tcr_padded_batches/batch_8.npz
✅ Finale gepaddete 

***CHECK NACH PADDING UND ALLEM***