In [None]:
import pandas as pd
import os

In [None]:
# Maps a display label to the path of the dataset file that is analysed under
# that label. Insertion order is the order in which the files are processed.
# NOTE(review): the *_read_path variables are not defined in the visible
# cells - presumably they come from an earlier configuration cell; confirm
# that this cell still runs after "Restart Kernel -> Run All".
file_paths = dict(
    # files labelled "beta"
    VDJdb_beta=vdjdb_beta_read_path,
    McPAS_beta=mcpastcr_beta_read_path,
    IEDB_beta=iedb_beta_read_path,
    pMTnet_beta=pmtnet_beta_read_path,
    # files labelled "paired"
    VDJdb_paired=vdjdb_paired_read_path,
    McPAS_paired=mcpastcr_paired_read_path,
    IEDB_paired=iedb_paired_read_path,
)

In [None]:
# Funktion zur Berechnung der TPP-Kategorien pro Datei (vom classification File übernommen: data_scripts/data_preparation/classification.ipynb)
def calculate_task(row, known_epitopes, known_tcrs, paired):
    """Assign one of the categories TPP1-TPP4 to a single record.

    The category depends only on whether the record's epitope and its TCR key
    are contained in the given collections:

    ========  =============  =========
    category  epitope known  TCR known
    ========  =============  =========
    TPP1      yes            yes
    TPP2      yes            no
    TPP3      no             no
    TPP4      no             yes
    ========  =============  =========

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'Epitope' and 'TRB_CDR3', plus 'TRA_CDR3' when ``paired`` is true.
        known_epitopes: Container of epitopes that count as known.
        known_tcrs: Container of TCR keys that count as known. In paired mode
            the keys must have the form '<TRA_CDR3>_<TRB_CDR3>'.
        paired: If true, the TCR key is built from both chains; otherwise the
            raw 'TRB_CDR3' value is used as the key.

    Returns:
        'TPP1', 'TPP2', 'TPP3' or 'TPP4'.
    """
    if paired:
        # A missing chain contributes an empty string, so the key always has
        # the shape '<alpha>_<beta>' (e.g. '_CASSL' when only beta is present).
        tra_cdr3 = str(row['TRA_CDR3']) if pd.notna(row['TRA_CDR3']) else ''
        trb_cdr3 = str(row['TRB_CDR3']) if pd.notna(row['TRB_CDR3']) else ''
        tcr = tra_cdr3 + '_' + trb_cdr3
    else:
        tcr = row['TRB_CDR3']

    epitope_exists = row['Epitope'] in known_epitopes
    cdr3_exists = tcr in known_tcrs

    # Two booleans give exactly four combinations, so no fallback category is
    # needed (the former 'Unknown' branch could never be reached).
    if epitope_exists:
        return 'TPP1' if cdr3_exists else 'TPP2'
    return 'TPP4' if cdr3_exists else 'TPP3'

# Set to True to additionally classify every row with calculate_task() and
# print the TPP1-TPP4 counts per file. Off by default: the row-wise
# classification is slow on large files and its output was disabled anyway.
SHOW_TPP_COUNTS = False


def _cdr3_as_text(cdr3_column):
    """Return a CDR3 column as strings, mapping missing values to ''.

    This mirrors how calculate_task() builds paired TCR keys, so keys computed
    here and keys computed per row always agree.
    """
    as_text = cdr3_column.map(lambda value: str(value) if pd.notna(value) else '')
    # astype(str) is a no-op for the values but guarantees a string-capable
    # dtype even for an empty column, so the '+' concatenation below works.
    return as_text.astype(str)


def _count_singletons(values):
    """Count the non-null values that occur exactly once in a Series."""
    return int(values.value_counts().eq(1).sum())


# Iterate over all configured files and print per-file TCR/epitope statistics.
for file_name, path in file_paths.items():
    if not os.path.exists(path):
        print(f"Datei nicht gefunden: {file_name}")
        continue

    try:
        # sep=None makes the python engine detect the delimiter itself.
        df = pd.read_csv(path, sep=None, engine="python")

        missing_columns = {"TRB_CDR3", "Epitope"} - set(df.columns)
        if missing_columns:
            print(f"Fehlende Spalten in {file_name}: {missing_columns}")
            continue

        # A file counts as paired as soon as it has an alpha-chain column.
        paired = "TRA_CDR3" in df.columns
        if paired:
            # Missing chains become '' (not the string 'nan'), exactly like
            # the key that calculate_task() builds for a row.
            tcr_keys = _cdr3_as_text(df["TRA_CDR3"]) + '_' + _cdr3_as_text(df["TRB_CDR3"])
        else:
            tcr_keys = df["TRB_CDR3"]

        # "Distinct" = number of different values; "unique" = number of values
        # that appear in exactly one row. Missing values are ignored by both.
        distinct_tcrs = tcr_keys.nunique()
        unique_tcrs = _count_singletons(tcr_keys)
        distinct_epitopes = df["Epitope"].nunique()
        unique_epitopes = _count_singletons(df["Epitope"])

        if SHOW_TPP_COUNTS:
            # The "known" sets are taken from the very same file.
            seen_epitopes = set(df["Epitope"].dropna())
            seen_tcrs = set(tcr_keys.dropna())
            # Built as a list so that an empty file simply yields zero counts.
            tasks = [
                calculate_task(row, seen_epitopes, seen_tcrs, paired)
                for _, row in df.iterrows()
            ]
            tpp_counts = pd.Series(tasks, dtype=object).value_counts().to_dict()

        print(f"**{file_name}**")
        print(f"  - Distinct TCRs: {distinct_tcrs}")
        print(f"  - Unique TCRs: {unique_tcrs}")
        print(f"  - Distinct Epitopes: {distinct_epitopes}")
        print(f"  - Unique Epitopes: {unique_epitopes}")
        if SHOW_TPP_COUNTS:
            print(f"  - TPP1: {tpp_counts.get('TPP1', 0)}")
            print(f"  - TPP2: {tpp_counts.get('TPP2', 0)}")
            print(f"  - TPP3: {tpp_counts.get('TPP3', 0)}")
            print(f"  - TPP4: {tpp_counts.get('TPP4', 0)}\n")

    except Exception as e:
        # Best effort: report the problem and carry on with the next file.
        print(f"Fehler beim Verarbeiten der Datei {file_name}: {e}")