In [12]:
import pandas as pd
import os

In [13]:
# Definierte Pfade für alle vier Kategorien
datasets = {
    "beta_allele": {
        "train": f'../../../../data/splitted_datasets/allele/beta/new/beta/train.tsv',
        "test": f'../../../../data/splitted_datasets/allele/beta/new/beta/test.tsv',
        "validation": f'../../../../data/splitted_datasets/allele/beta/new/beta/validation.tsv'
    }
}

1. Alle TPP-Teildateien (*_tpp2.tsv, *_tpp3.tsv…) einzulesen

2. Zeilen- und Binding-Verteilungen je Datei anzuzeigen

3. alle TPP-Chunks zu einem DataFrame zusammenzufassen und mit der Master-Datei (test_neg.tsv bzw. val_neg.tsv, train_neg.tsv) abzugleichen


In [14]:
import os
import glob
import pandas as pd

# Pfad zum Negatives-Ordner
neg_dir = "../../../../data/splitted_datasets/allele/beta/new/negatives"

def summarize_file(path):
    df = pd.read_csv(path, sep="\t")
    name = os.path.basename(path)
    print(f"\n--- {name} ---")
    print(f"Rows: {len(df)}")
    print("Binding value counts:")
    print(df["Binding"].value_counts(dropna=False))
    print("Unique (TCR, Epitope) pairs:", 
          df[["TRB_CDR3","Epitope"]].drop_duplicates().shape[0])
    return df

# 1) alle TPP-Teildateien einlesen und kurz summarizen
tpp_files = sorted(glob.glob(os.path.join(neg_dir,"*_tpp*.tsv")))
tpp_dfs = [summarize_file(f) for f in tpp_files]

# 2) zu einem DataFrame zusammenführen
all_tpp = pd.concat(tpp_dfs, ignore_index=True)
print("\n=== Zusammengefasst alle TPP-Teile ===")
print("Total rows:", len(all_tpp))
print("Binding value counts:")
print(all_tpp["Binding"].value_counts())
print("Unique (TCR, Epitope) pairs:", 
      all_tpp[["TRB_CDR3","Epitope"]].drop_duplicates().shape[0])

# 3) mit der Master-Datei vergleichen (z.B. test_neg.tsv)
master = pd.read_csv(os.path.join(neg_dir, "test_neg.tsv"), sep="\t")
print("\n=== Master test_neg.tsv ===")
print("Total rows:", len(master))
print("Binding value counts:")
print(master["Binding"].value_counts())
print("Unique pairs:", master[["TRB_CDR3","Epitope"]].drop_duplicates().shape[0])

# 4) Stimmen Zeilenanzahl und Paar-Mengen überein?
print("\nCheck row-count match:", len(master)==len(all_tpp))
print("Check pair-set equality:", 
      set(zip(master["TRB_CDR3"],master["Epitope"])) 
    == set(zip(all_tpp["TRB_CDR3"],all_tpp["Epitope"])))



--- test_neg_tpp2.tsv ---
Rows: 27398
Binding value counts:
Binding
0    27398
Name: count, dtype: int64
Unique (TCR, Epitope) pairs: 25741

--- test_neg_tpp3.tsv ---
Rows: 43689
Binding value counts:
Binding
0    43689
Name: count, dtype: int64
Unique (TCR, Epitope) pairs: 40909

--- test_neg_tpp4.tsv ---
Rows: 165
Binding value counts:
Binding
0    165
Name: count, dtype: int64
Unique (TCR, Epitope) pairs: 165

--- train_neg_tpp3.tsv ---
Rows: 0
Binding value counts:
Series([], Name: count, dtype: int64)
Unique (TCR, Epitope) pairs: 0

--- val_neg_tpp2.tsv ---
Rows: 58344
Binding value counts:
Binding
0    58344
Name: count, dtype: int64
Unique (TCR, Epitope) pairs: 55888

--- val_neg_tpp3.tsv ---
Rows: 58280
Binding value counts:
Binding
0    58280
Name: count, dtype: int64
Unique (TCR, Epitope) pairs: 55820

--- val_neg_tpp4.tsv ---
Rows: 450
Binding value counts:
Binding
0    450
Name: count, dtype: int64
Unique (TCR, Epitope) pairs: 450

=== Zusammengefasst alle TPP-Teile ===
To

1. Neg-Pairs in validation/test: Wie viele generierten Negatives tatsächlich in den finalen Splits gelandet sind.

2. Neg-Pairs als POS: Falls hier etwas > 0 rauskommt, wurden Negative versehentlich als Positive gelabelt.

In [15]:
import os, glob, pandas as pd

# 1) Pfade anpassen
base_dir = "../../../../data/splitted_datasets/allele/beta/new"
neg_dir  = os.path.join(base_dir, "negatives")
val_path = os.path.join(base_dir, "validation.tsv")
test_path= os.path.join(base_dir, "test.tsv")

# Alle TPP-neg Dateien einlesen und zusammenführen
neg_files = sorted(glob.glob(os.path.join(neg_dir, "*_neg_tpp*.tsv")))
negs = pd.concat([pd.read_csv(f, sep="\t", dtype={"TRB_CDR3": str, "Epitope": str}) 
                  for f in neg_files],
                 ignore_index=True)

# validation & test laden (Binding sicher als int lesen)
val  = pd.read_csv(val_path, sep="\t", dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int})
test = pd.read_csv(test_path, sep="\t", dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int})

# Sets basteln
neg_pairs       = set(zip(negs["TRB_CDR3"], negs["Epitope"]))
val_pairs       = set(zip(val["TRB_CDR3"],  val["Epitope"]))
test_pairs      = set(zip(test["TRB_CDR3"], test["Epitope"]))

# 1) Wie viele Neg-Pairs landen überhaupt in den Splits?
print("Neg-Pairs in validation.tsv:", len(neg_pairs & val_pairs))
print("Neg-Pairs in test.tsv:      ", len(neg_pairs & test_pairs))

# 2) Gibt’s Neg-Pairs, die fälschlich als Positives gelabelt wurden?
pos_val_pairs  = set(zip(val[val["Binding"] == 1]["TRB_CDR3"],  val[val["Binding"] == 1]["Epitope"]))
pos_test_pairs = set(zip(test[test["Binding"] == 1]["TRB_CDR3"], test[test["Binding"] == 1]["Epitope"]))

false_pos_in_val  = pos_val_pairs  & neg_pairs
false_pos_in_test = pos_test_pairs & neg_pairs

print("Neg-Pairs als POS in validation.tsv:", len(false_pos_in_val))
print("Neg-Pairs als POS in test.tsv:      ", len(false_pos_in_test))


  val  = pd.read_csv(val_path, sep="\t", dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int})


Neg-Pairs in validation.tsv: 62824
Neg-Pairs in test.tsv:       31500
Neg-Pairs als POS in validation.tsv: 8110
Neg-Pairs als POS in test.tsv:       2737


GENAUER TEST: 
1. Neg-Pairs in validation/test: Wie viele generierten Negatives tatsächlich in den finalen Splits gelandet sind.

2. Neg-Pairs als POS: Falls hier etwas > 0 rauskommt, wurden Negative versehentlich als Positive gelabelt.

In [16]:
import os
import glob
import pandas as pd

# 1) Pfade definieren
base_dir = "../../../../data/splitted_datasets/allele/beta/new"
neg_dir  = os.path.join(base_dir, "negatives")
val_path = os.path.join(base_dir, "validation.tsv")
test_path= os.path.join(base_dir, "test.tsv")

# 2) TPP-2, -3 und -4 Negatives laden
patterns = ["*_neg_tpp2.tsv", "*_neg_tpp3.tsv", "*_neg_tpp4.tsv"]
all_negs = []
for pat in patterns:
    files = sorted(glob.glob(os.path.join(neg_dir, pat)))
    print(f"Found {len(files)} files for pattern '{pat}':", files)
    for fn in files:
        df = pd.read_csv(fn, sep="\t",
                         dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int},
                         low_memory=False)
        all_negs.append(df)

# Falls keine Dateien, leeres DataFrame
if all_negs:
    negs = pd.concat(all_negs, ignore_index=True)
else:
    negs = pd.DataFrame(columns=["TRB_CDR3","Epitope","Binding"])

# 3) Unique-Paare als Set
neg_pairs = set(zip(negs["TRB_CDR3"], negs["Epitope"]))
print(f"\n Generierte TPP2+3+4 Negative-Paare (unique): {len(neg_pairs)}")

# 4) validation.tsv und test.tsv laden
val  = pd.read_csv(val_path,  sep="\t",
                   dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int},
                   low_memory=False)
test = pd.read_csv(test_path, sep="\t",
                   dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int},
                   low_memory=False)

# 5) Sets aus den Splits
val_pairs  = set(zip(val["TRB_CDR3"],  val["Epitope"]))
test_pairs = set(zip(test["TRB_CDR3"], test["Epitope"]))

# 6) Schnittmengen zählen
in_val  = neg_pairs & val_pairs
in_test = neg_pairs & test_pairs

print(f"\n TPP2+3+4 Negatives in validation.tsv: {len(in_val)}")
print(f" TPP2+3+4 Negatives in test.tsv:       {len(in_test)}")

# 7) Fälschlich als positiv gelabelt?
pos_val_pairs  = set(zip(val[val["Binding"] == 1]["TRB_CDR3"],  val[val["Binding"] == 1]["Epitope"]))
pos_test_pairs = set(zip(test[test["Binding"] == 1]["TRB_CDR3"], test[test["Binding"] == 1]["Epitope"]))

false_pos_val  = neg_pairs & pos_val_pairs
false_pos_test = neg_pairs & pos_test_pairs

print(f"\n  TPP2+3+4 Neg-Paare als POS in validation.tsv: {len(false_pos_val)}")
print(f"  TPP2+3+4 Neg-Paare als POS in test.tsv:       {len(false_pos_test)}")


Found 2 files for pattern '*_neg_tpp2.tsv': ['../../../../data/splitted_datasets/allele/beta/new/negatives/test_neg_tpp2.tsv', '../../../../data/splitted_datasets/allele/beta/new/negatives/val_neg_tpp2.tsv']
Found 3 files for pattern '*_neg_tpp3.tsv': ['../../../../data/splitted_datasets/allele/beta/new/negatives/test_neg_tpp3.tsv', '../../../../data/splitted_datasets/allele/beta/new/negatives/train_neg_tpp3.tsv', '../../../../data/splitted_datasets/allele/beta/new/negatives/val_neg_tpp3.tsv']
Found 2 files for pattern '*_neg_tpp4.tsv': ['../../../../data/splitted_datasets/allele/beta/new/negatives/test_neg_tpp4.tsv', '../../../../data/splitted_datasets/allele/beta/new/negatives/val_neg_tpp4.tsv']

 Generierte TPP2+3+4 Negative-Paare (unique): 159389

 TPP2+3+4 Negatives in validation.tsv: 62824
 TPP2+3+4 Negatives in test.tsv:       31500

  TPP2+3+4 Neg-Paare als POS in validation.tsv: 8110
  TPP2+3+4 Neg-Paare als POS in test.tsv:       2737


1. die TPP-2, -3 und -4 Negativ-Files jeweils mit einem TPP-Label einliest

2. die finalen validation.tsv und test.tsv läd

3. aus den Splits die falsch als positiv (Binding == 1) gelabelten Paare ermittelt,

4. diese mit den TPP-Labels verbindet

5. und dir pro TPP zeigt, wie viele False-Positives aus der jeweiligen Kategorie stammen.

In [24]:
import os, glob, pandas as pd

# 1) Basis-Pfade
base_dir = "../../../../data/splitted_datasets/allele/beta/new"
neg_dir  = os.path.join(base_dir, "negatives")
val_path = os.path.join(base_dir, "validation.tsv")
test_path= os.path.join(base_dir, "test.tsv")

# 2) Alle TPP-Negatives laden und mit TPP-Spalte markieren
def load_tpp_negs(tpp):
    files = sorted(glob.glob(os.path.join(neg_dir, f"* _neg_tpp{tpp}.tsv".replace(" ",""))))
    dfs = []
    for fn in files:
        df = pd.read_csv(fn, sep="\t",
                         dtype={"TRB_CDR3": str, "Epitope": str},
                         usecols=["TRB_CDR3","Epitope"])
        df["TPP"] = tpp
        df["Binding_neg"] = 0
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(columns=["TRB_CDR3","Epitope","TPP","Binding_neg"])

negs = pd.concat([load_tpp_negs(2), load_tpp_negs(3), load_tpp_negs(4)], ignore_index=True)
neg_pairs = set(zip(negs["TRB_CDR3"], negs["Epitope"]))

# 3) validation & test laden (mit dem Split-Label)
val  = pd.read_csv(val_path,  sep="\t", dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int}, low_memory=False)
test = pd.read_csv(test_path, sep="\t", dtype={"TRB_CDR3": str, "Epitope": str, "Binding": int}, low_memory=False)

# 4) False-Positives herausfiltern: join Split-Positives (Binding==1) mit negs
key = ["TRB_CDR3","Epitope"]
fp_val = pd.merge(
    val[val["Binding"]==1][key+["Binding"]].rename(columns={"Binding":"Binding_split"}),
    negs,
    on=key,
    how="inner"
)
fp_test = pd.merge(
    test[test["Binding"]==1][key+["Binding"]].rename(columns={"Binding":"Binding_split"}),
    negs,
    on=key,
    how="inner"
)

# 5) Zusammenfassung je TPP
summary_val  = fp_val.groupby("TPP").size().reset_index(name="FalsePos_in_Val")
summary_test = fp_test.groupby("TPP").size().reset_index(name="FalsePos_in_Test")

print("\n=== False-Positives in validation.tsv nach TPP ===")
print(summary_val.to_string(index=False))

print("\n=== False-Positives in test.tsv nach TPP ===")
print(summary_test.to_string(index=False))

# 6) Beispiel-Tabelle ausgeben, um zu sehen, welche Labels stehen
print("\nBeispiel False-Positives (TPP=2) in validation.tsv:")
print(fp_val[fp_val["TPP"]==3].head(10).to_string(index=False))



=== False-Positives in validation.tsv nach TPP ===
 TPP  FalsePos_in_Val
   2             5617
   3             5574

=== False-Positives in test.tsv nach TPP ===
 TPP  FalsePos_in_Test
   2              1423
   3              2047

Beispiel False-Positives (TPP=2) in validation.tsv:
          TRB_CDR3   Epitope  Binding_split  TPP  Binding_neg
  CATSELGGGLTDEQFF KLGGALQAK              1    3            0
  CASSSRGTENTGELFF KLGGALQAK              1    3            0
  CASSLGGFSSYNEQFF KLGGALQAK              1    3            0
  CASSPPRQGANTEAFF KLGGALQAK              1    3            0
CATSIRLRPWQGGDEQFF KLGGALQAK              1    3            0
    CASSLSGGVTEAFF KLGGALQAK              1    3            0
  CASSLGVGDGQETQYF KLGGALQAK              1    3            0
   CASSQLGLAGDEQFF KLGGALQAK              1    3            0
   CATSRLASSYNEQFF KLGGALQAK              1    3            0
 CASSFGQGPLYVDGYTF KLGGALQAK              1    3            0


In [23]:
import os, glob
import pandas as pd

# 1) Basis-Pfade
base_dir = "../../../../data/splitted_datasets/allele/beta/new"
neg_dir  = os.path.join(base_dir, "negatives")
val_path = os.path.join(base_dir, "validation.tsv")
test_path= os.path.join(base_dir, "test.tsv")

# 2) Alle TPP-Negatives einlesen und mapping pair -> TPP bauen
all_negs = []
for tpp in [2,3,4]:
    files = sorted(glob.glob(os.path.join(neg_dir, f"* _neg_tpp{tpp}.tsv".replace(" ",""))))
    for fn in files:
        df = pd.read_csv(fn, sep="\t", dtype=str, usecols=["TRB_CDR3","Epitope"])
        df["TPP"] = tpp
        all_negs.append(df)
negs = pd.concat(all_negs, ignore_index=True)

# Mapping (TCR,Epitope) -> TPP (bei Duplikaten wird der erste TPP behalten)
pair_to_tpp = {
    (r.TRB_CDR3, r.Epitope): r.TPP
    for r in negs.itertuples()
}

# 3) validation & test laden
val  = pd.read_csv(val_path,  sep="\t", dtype={"TRB_CDR3":str,"Epitope":str,"Binding":int})
test = pd.read_csv(test_path, sep="\t", dtype={"TRB_CDR3":str,"Epitope":str,"Binding":int})

# 4) Maske für False-Positives in den Original-DFs
mask_val = val["Binding"].eq(1) & val.apply(
    lambda r: (r["TRB_CDR3"], r["Epitope"]) in pair_to_tpp, axis=1
)
mask_test = test["Binding"].eq(1) & test.apply(
    lambda r: (r["TRB_CDR3"], r["Epitope"]) in pair_to_tpp, axis=1
)

# 5) Subsets ziehen und TPP-Spalte hinzufügen
fp_val_orig = val.loc[mask_val].copy()
fp_val_orig["TPP"]  = fp_val_orig.apply(
    lambda r: pair_to_tpp[(r["TRB_CDR3"], r["Epitope"])], axis=1
)

fp_test_orig = test.loc[mask_test].copy()
fp_test_orig["TPP"] = fp_test_orig.apply(
    lambda r: pair_to_tpp[(r["TRB_CDR3"], r["Epitope"])], axis=1
)

# 6) Ausgabe – im Original-Format, nur die False-Positives
print("\nErste 5 False-Positives in validation.tsv:")
print(
    fp_val_orig
    .head(5)[["TRB_CDR3","Epitope","Binding","TPP"]]
    .to_string(index=False)
)

print("\nErste 5 False-Positives in test.tsv:")
print(
    fp_test_orig
    .head(5)[["TRB_CDR3","Epitope","Binding","TPP"]]
    .to_string(index=False)
)


  val  = pd.read_csv(val_path,  sep="\t", dtype={"TRB_CDR3":str,"Epitope":str,"Binding":int})



Erste 5 False-Positives in validation.tsv:
          TRB_CDR3   Epitope  Binding  TPP
  CATSELGGGLTDEQFF KLGGALQAK        1    3
  CASSSRGTENTGELFF KLGGALQAK        1    3
  CASSLGGFSSYNEQFF KLGGALQAK        1    3
  CASSPPRQGANTEAFF KLGGALQAK        1    3
CATSIRLRPWQGGDEQFF KLGGALQAK        1    3

Erste 5 False-Positives in test.tsv:
        TRB_CDR3    Epitope  Binding  TPP
CASSQDGGSSYNEQFF  GLCTLVAML        1    3
  CASSRTGSDYGYTF ELAGIGILTV        1    2
CASSSPQGVSNTEAFF  GILGFVFTL        1    3
 CASSIMALGRSEAFF  KLGGALQAK        1    3
 CASSLDRGVNTEAFF  GILGFVFTL        1    2


# CHECK COS_Siminlarity von generierten negativen zu positiven

In [2]:
# prüfe, welche Source-Kategorien es gibt
print("Validation source categories:", val_df["source"].unique())
print("Test source categories:      ", test_df["source"].unique())


Validation source categories: ['datasets' 'generated' '10X']
Test source categories:       ['datasets' 'generated' '10X']


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# du hast val_df und test_df schon geladen, also überspring das Einlesen hier —
# ansonsten hier anpassen:
val_df  = pd.read_csv("../../../../data/splitted_datasets/allele/beta/new/validation.tsv", sep="\t", dtype=str)
test_df = pd.read_csv("../../../../data/splitted_datasets/allele/beta/new/test.tsv",       sep="\t", dtype=str)

# Binding als int casten
val_df ["Binding"] = val_df["Binding"].astype(int)
test_df["Binding"] = test_df["Binding"].astype(int)

# --- Alle positiven Epitope sammeln (source=='datasets') ---
pos_val_epitopes  = val_df [ val_df ["source"]=="datasets" ]["Epitope"]
pos_test_epitopes = test_df[ test_df["source"]=="datasets" ]["Epitope"]
pos_epitopes = pd.concat([pos_val_epitopes, pos_test_epitopes]).unique().tolist()
print(f"→ Einzigartige positive Epitopes (datasets): {len(pos_epitopes)}")

# --- Die generierten Negatives herausfiltern ---
neg_val   = val_df [ (val_df ["source"]=="generated") & (val_df ["Binding"]==0) ].copy()
neg_test  = test_df[ (test_df["source"]=="generated") & (test_df["Binding"]==0) ].copy()
print(f"→ Generierte Negatives (Validation): {len(neg_val)}")
print(f"→ Generierte Negatives (Test):       {len(neg_test)}")

# --- Levenshtein-Funktionen ---
def levenshtein_distance(s1, s2):
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    prev = list(range(len(s2)+1))
    for i,c1 in enumerate(s1,1):
        curr = [i]
        for j,c2 in enumerate(s2,1):
            ins = prev[j] + 1
            dele= curr[j-1] + 1
            sub = prev[j-1] + (c1 != c2)
            curr.append(min(ins, dele, sub))
        prev = curr
    return prev[-1]

def similarity_ratio(s1, s2):
    d = levenshtein_distance(s1, s2)
    return (len(s1) + len(s2) - d) / (len(s1) + len(s2))

# Max-Similarity gegen alle positiven Epitopes
def max_sim_to_positives(seq):
    # Achtung: das ist O(len(pos_epitopes)), kann langsam sein
    return max(similarity_ratio(seq, pos) for pos in pos_epitopes)

# wende an
neg_val  ["max_pos_sim"] = neg_val ["Epitope"].apply(max_sim_to_positives)
neg_test ["max_pos_sim"] = neg_test["Epitope"].apply(max_sim_to_positives)

# Kurze Zusammenfassung + Histogramm
for df, name in [(neg_val, "Validation"), (neg_test, "Test")]:
    print(f"\n{name} – Ähnlichkeits-Statistik generierte Negatives → Positives")
    print(df["max_pos_sim"].describe())
    plt.figure(figsize=(6,4))
    plt.hist(df["max_pos_sim"], bins=30, alpha=0.7)
    plt.axvline(0.75, color="red", linestyle="--", label="Beispiel-Threshold 0.75")
    plt.title(f"Max. Levenshtein-Ratio generated Negatives → Positives ({name})")
    plt.xlabel("Similarity-Ratio")
    plt.ylabel("Count")
    plt.legend()
    plt.show()

# Auf Wunsch: pro TPP aufschlüsseln
if "task_predicted" in neg_val.columns:
    for tpp in sorted(neg_val["task_predicted"].unique()):
        sub = neg_val[neg_val["task_predicted"]==tpp]
        print(f"\n{name} – TPP={tpp}: n={len(sub)}, max_pos_sim median={sub['max_pos_sim'].median():.3f}")




Validation source categories: ['datasets' 'generated' '10X']
Test       source categories: ['datasets' 'generated' '10X']
→ Einzigartige positive Epitopes (datasets): 1598
→ Generierte Negatives (Validation): 124790
→ Generierte Negatives (Test):       45277
