In [1]:
from pathlib import Path
from typing import Optional, Tuple

import editdistance
import numpy as np
import pandas as pd

**This notebook loads the dataset from TULIP and applies filters to remove redundant pairs, overly long sequences, samples with missing sequences, and samples that are too close between the train and test sets. Negative pairs are then sampled by randomly pairing CDR3b sequences with peptides.**  
<sub>Note: The negative sampling uses multi-level stratified sampling across peptide groups,
    so a single fixed seed is not sufficient to obtain consistent results across datasets
    of varying composition. The data split used in the study is provided for reproducibility.</sub>

In [2]:
# Output directory for the sampled dataset; created up front (including any
# missing parents) and left untouched if it already exists.
folder_dataset_out = Path('./dataset_sampled')
folder_dataset_out.mkdir(exist_ok=True, parents=True)

In [5]:
!wget https://github.com/barthelemymp/TULIP-TCR/raw/main/data/UnseenPeptides.zip
!unzip UnseenPeptides.zip
!rm UnseenPeptides.zip

--2025-04-03 12:54:12--  https://github.com/barthelemymp/TULIP-TCR/raw/main/data/UnseenPeptides.zip
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/barthelemymp/TULIP-TCR/main/data/UnseenPeptides.zip [following]
--2025-04-03 12:54:12--  https://raw.githubusercontent.com/barthelemymp/TULIP-TCR/main/data/UnseenPeptides.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44992958 (43M) [application/zip]
Saving to: ‘UnseenPeptides.zip’


2025-04-03 12:54:13 (188 MB/s) - ‘UnseenPeptides.zip’ saved [44992958/44992958]

Archive:  UnseenPeptides.zip
   creating: UnseenPeptides/
  inflating: UnseenPeptid

In [6]:
# Read the raw train and test tables from the extracted archive.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "UnseenPeptides/final_below20_train_pnas.csv",
        "UnseenPeptides/test_unseen.csv",
    )
)

In [7]:
def random_pairing(df: pd.DataFrame, random_state: Optional[int] = None) -> pd.DataFrame:
    """Random pairing of CDR3b to peptides to sample negative pairs.

    For a given peptide, the function first derives the set of non-positive pairs.
    This set consists of all rows whose CDR3b is never paired with the given peptide.
    The candidates are then drawn in two stages: the same number of rows is first
    drawn (with replacement) from every other peptide group, and the final negatives
    are drawn from that balanced pool, so that each peptide in the non-positive set
    has equal probability. One negative is produced per positive row of the peptide;
    the negatives inherit the peptide and MHC of those positive rows and get
    ``binder = 0.0``.

    Args:
        df (pd.DataFrame): DataFrame containing the positive pairs. The columns
            ``cdr3b``, ``peptide`` and ``mhc`` are read.
        random_state (Optional[int]): Seed making the draws reproducible for a
            given input. With the default ``None`` the draws use pandas' default
            random source, as before.

    Returns:
        df_all (pd.DataFrame): DataFrame with all pairs (positive and negatives),
            positives first, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``df`` holds a single peptide, or if every CDR3b in ``df``
            is a positive for some peptide so that no negative can be drawn for it.
    """
    all_peptides = df.peptide.unique()

    if len(all_peptides) == 1:
        raise ValueError("There has to be more than a single peptide per fold!")

    # A single generator is shared by every draw so that one seed fixes the whole
    # output; None is forwarded unchanged to keep the unseeded behaviour.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Frames are collected and concatenated once instead of growing a DataFrame
    # inside the loop.
    frames = [df.copy()]
    for peptide in all_peptides:
        # New 0..n-1 index so the peptide/mhc assignment below aligns row by row.
        subset = df.loc[df.peptide == peptide].reset_index(drop=True)
        pos_pairs = subset.cdr3b.unique()
        non_posset = df.loc[~df.cdr3b.isin(pos_pairs)]
        if non_posset.empty:
            raise ValueError(
                f"No negative CDR3b candidates for peptide {peptide!r}: "
                "every CDR3b in the fold is paired with it."
            )

        n_negatives = subset.shape[0]
        subset_sampled = (
            non_posset.groupby('peptide')
            .sample(n_negatives, replace=True, random_state=rng)
            .sample(n_negatives, replace=True, random_state=rng)
            .reset_index(drop=True)
        )

        subset_sampled[['peptide', 'mhc']] = subset[['peptide', 'mhc']]
        subset_sampled['binder'] = 0.0
        frames.append(subset_sampled)

    return pd.concat(frames, ignore_index=True)


In [8]:
def filter_dataset(
    df: pd.DataFrame,
    peptide_len_limit: int = 16,
    cdr3b_len_limit: int = 34,
) -> pd.DataFrame:
    """Filter the dataset.

    Keep only rows that satisfy all of the following:
      * the MHC starts with "HLA-A", "HLA-B" or "HLA-C", or equals "HLA class I";
      * neither the CDR3b nor the peptide is the "<MIS>" placeholder;
      * the peptide is shorter than ``peptide_len_limit`` residues;
      * the CDR3b is shorter than ``cdr3b_len_limit`` residues.

    Non-string entries (e.g. NaN) in ``mhc``, ``cdr3b`` or ``peptide`` are
    dropped instead of raising.

    Args:
        df (pd.DataFrame): DataFrame to be filtered. The columns ``mhc``,
            ``cdr3b`` and ``peptide`` are read.
        peptide_len_limit (int): Exclusive upper bound on the peptide length.
        cdr3b_len_limit (int): Exclusive upper bound on the CDR3b length.

    Returns:
        df (pd.DataFrame): DataFrame filtered (original index labels are kept).
    """
    def is_string_shorter_than(column: pd.Series, limit: int) -> pd.Series:
        # isinstance guard: NaN is a float, so len() would raise on it.
        return column.map(lambda s: isinstance(s, str) and len(s) < limit).astype(bool)

    is_mhc_class_one = df.mhc.map(
        lambda s: isinstance(s, str)
        and (s[:5] in ("HLA-A", "HLA-B", "HLA-C") or s == "HLA class I")
    ).astype(bool)
    has_both_sequences = (df.cdr3b != '<MIS>') & (df.peptide != "<MIS>")

    # The conditions are row-wise independent, so one combined mask is
    # equivalent to applying the filters one after the other.
    keep = (
        is_mhc_class_one
        & has_both_sequences
        & is_string_shorter_than(df.peptide, peptide_len_limit)
        & is_string_shorter_than(df.cdr3b, cdr3b_len_limit)
    )
    return df.loc[keep]

In [9]:
def remove_close_samples(train: pd.DataFrame, test: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Drop CDR3b sequences that link near-identical train and test peptides.

    Every (test peptide, train peptide) combination is compared. When the two
    peptides are at edit distance exactly 1, the CDR3b sequences paired with
    both of them are collected, and every row carrying one of those CDR3b is
    removed from both sets (not only the rows of the two peptides involved).
    Both frames get a fresh index each time such a peptide pair is found.

    Args:
        train (pd.DataFrame): Train set dataframe (columns ``cdr3b`` and ``peptide`` are read).
        test (pd.DataFrame): Test set dataframe (columns ``cdr3b`` and ``peptide`` are read).

    Returns:
        train (pd.DataFrame): Train set dataframe without the shared CDR3b.
        test (pd.DataFrame): Test set dataframe without the shared CDR3b.
    """
    for test_peptide in test.peptide.unique():
        for train_peptide in train.peptide.unique():
            if editdistance.eval(test_peptide, train_peptide) != 1:
                continue

            # CDR3b seen with both of the two near-identical peptides.
            test_tcrs = set(test.loc[test.peptide == test_peptide, 'cdr3b'])
            train_tcrs = set(train.loc[train.peptide == train_peptide, 'cdr3b'])
            shared_tcrs = list(test_tcrs & train_tcrs)

            train = train.loc[~train['cdr3b'].isin(shared_tcrs)].reset_index(drop=True)
            test = test.loc[~test['cdr3b'].isin(shared_tcrs)].reset_index(drop=True)

            # Sanity check: none of the shared CDR3b may survive in either set.
            assert not train['cdr3b'].isin(shared_tcrs).any()
            assert not test['cdr3b'].isin(shared_tcrs).any()
    return train, test

In [10]:
# NOTE: this cell rebinds `train` and `test`. Re-running it without first
# re-running the loading cell fails, because the original column names
# ('CDR3b', 'MHC') no longer exist after the rename below.

# Keep positive pairs only, restrict to the four columns of interest, rename
# them to lower case, drop exact duplicates and apply the dataset filters.
train = (
    train.loc[train['binder'] == 1.0, ['CDR3b', 'peptide', 'MHC', 'binder']]
    .rename(columns={'CDR3b': 'cdr3b', 'MHC': 'mhc'})
    .drop_duplicates()
    .pipe(filter_dataset)
)
test = (
    test.loc[test['binder'] == 1.0, ['CDR3b', 'peptide', 'MHC', 'binder']]
    .rename(columns={'CDR3b': 'cdr3b', 'MHC': 'mhc'})
    .drop_duplicates()
    .pipe(filter_dataset)
)

# Assert no common peptides between train and test sets.
assert set(train['peptide']).isdisjoint(test['peptide'])

# Keep peptides with at least 20 samples for the train set and fewer than 20
# samples for the test set.
train = train.loc[train['peptide'].map(train['peptide'].value_counts()).ge(20)]
test = test.loc[test['peptide'].map(test['peptide'].value_counts()).lt(20)]

# Remove close samples.
train, test = remove_close_samples(train, test)

# Random pairing to obtain the negative samples (train first, then test).
# Train rows are tagged fold / peptide_cluster 2, test rows 1.
train_tmp = train.copy()
test_tmp = test.copy()
train_sampled = random_pairing(train_tmp).assign(fold=2, peptide_cluster=2)
test_sampled = random_pairing(test_tmp).assign(fold=1, peptide_cluster=1)

# Concatenate to obtain the final dataset.
df = pd.concat([train_sampled, test_sampled], ignore_index=True)
df.head()

Unnamed: 0,cdr3b,peptide,mhc,binder,fold,peptide_cluster
0,CASSALASLNEQFF,FLKEKGGL,HLA-B*08,1.0,2,2
1,CASSFTPYNEQFF,ELAGIGILTV,HLA-A*02,1.0,2,2
2,CASSPQGLGTEAFF,ELAGIGILTV,HLA-A*02,1.0,2,2
3,CAEGQGFVGQPQHF,ELAGIGILTV,HLA-A*02,1.0,2,2
4,CASLRSAVWADTQYF,ELAGIGILTV,HLA-A*02,1.0,2,2


In [11]:
# Write the combined train/test table (positives and sampled negatives) to the
# output folder, without the row index.
df.to_csv(folder_dataset_out.joinpath('models_benchmark_dataset.csv'), index=False)