# Look at data

In [1]:
import numpy as np
import pandas as pd
import sys
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pyrepseq as prs
import tidytcells as tt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#move to correct directory
os.chdir('/Users/isabellasodi/Documents/UCL/PhD/CRUK_datathon_2025')

In [3]:
# load data
#tcr_data = pd.read_csv('input/tcrictionary_tabular.csv' )
tcr_data_raw = pd.read_csv('/Volumes/ritd-ag-project-rd0017-bmcha43/CRUK_datathon_2025/raw_data/tcrictionary_tabular.csv' )


## Pre-process

In [4]:
# expand out so that each study is a row
tcr_data = tcr_data_raw.assign(Studies=tcr_data_raw['Studies'].str.split(',')).explode('Studies')
tcr_data = tcr_data.reset_index(drop=True)
tcr_data['Studies'] = tcr_data['Studies'].str.strip()

In [5]:
# remove anything after a + for some epitopes with PTM info
tcr_data['epitope'] = tcr_data['epitope'].str.split('+').str[0].str.strip()

In [6]:
# remove *,  and X (based on amino acids) in alpha chain 
# use tidy t cells, go through and remove cells with not allowed
tcr_data['CDR3A'] = tcr_data['CDR3A'].apply(lambda x: tt.aa.standardize(x, log_failures=False) if pd.notna(x) else x)
tcr_data['CDR3B'] = tcr_data['CDR3B'].apply(lambda x: tt.aa.standardize(x, log_failures=False) if pd.notna(x) else x)

# Filtering

In [7]:
# only human
tcr_data = tcr_data[ tcr_data['TCR species'] == 'HomoSapiens'].copy()

# only with epitopes
tcr_data = tcr_data[ ~tcr_data['epitope'].isna()].copy()

# only betas
tcr_data = tcr_data[ ~tcr_data['CDR3B'].isna()].copy()

# only class 1
tcr_data = tcr_data[tcr_data['MHC class'] == 1]
tcr_data

print('Unique Left')
print('epitopes:', tcr_data['epitope'].nunique())
print('beta:', tcr_data['CDR3B'].nunique(), '\n')


Unique Left
epitopes: 2069
beta: 156662 



## Length

In [8]:
# get stats
tcr_data['CDR3B_length'] = tcr_data['CDR3B'].str.len().astype('Int64')
tcr_data['epitope_length'] = tcr_data['epitope'].str.len().astype('Int64')

mode_CDR3B = tcr_data['CDR3B_length'].value_counts().index[0]
mode_epitope = tcr_data['epitope_length'].value_counts().index[0]

print('Mode: ')
print('beta:', mode_CDR3B)
print('epitope:', mode_epitope)

Mode: 
beta: 15
epitope: 9


In [9]:
tcr_data_unfiltered = tcr_data.copy() # save unfiltered just in case

tcr_data = tcr_data[tcr_data['CDR3B_length'] == mode_CDR3B]
tcr_data = tcr_data[tcr_data['epitope_length'] == mode_epitope]

print('Unique Left')
print('epitopes:', tcr_data['epitope'].nunique())
print('beta:', tcr_data['CDR3B'].nunique(), '\n')


Unique Left
epitopes: 757
beta: 18205 



# Split

## Cancer

In [10]:
# remove cancer studies
cancer_studies = ['PMID:38039963', 'PMID:27959684', 'PMID:32461371']
#study_pattern = '|'.join(cancer_studies)

tcr_data_cancer = tcr_data[ tcr_data['Studies'].isin(cancer_studies)] # need to do str match for multiple studies
#tcr_data_cancer = tcr_data[tcr_data['Studies'].str.contains(study_pattern, na=False)] # no need as now expanded

tcr_data_train = tcr_data[ ~tcr_data['Studies'].isin(cancer_studies)]
#tcr_data_train = tcr_data[~tcr_data['Studies'].str.contains(study_pattern, na=False)]

print('Training')
print('epitopes:', tcr_data_train['epitope'].nunique())
print('beta:', tcr_data_train['CDR3B'].nunique(), '\n')

print('Cancer')
print('epitopes:', tcr_data_cancer['epitope'].nunique())
print('beta:', tcr_data_cancer['CDR3B'].nunique())


Training
epitopes: 754
beta: 18196 

Cancer
epitopes: 3
beta: 10


In [11]:
# check if cancer epitopes are in train

cancer_epitopes = tcr_data_cancer['epitope'].unique()
print(cancer_epitopes)
print( len(cancer_epitopes))

epitope_pattern = '|'.join(cancer_epitopes)
leaking_epitopes = tcr_data_train[tcr_data_train['epitope'].str.contains(epitope_pattern, na=False)]

print('leaking epitopes:', len(leaking_epitopes))
print( leaking_epitopes['epitope'], '\n')

# remove leaking epitopes in train
tcr_data_train = tcr_data_train[~tcr_data_train['epitope'].str.contains(epitope_pattern, na=False)]
tcr_data_train

# add leaking epitopes to validate
tcr_data_cancer = pd.concat([tcr_data_cancer, leaking_epitopes])

print('Training')
print('epitopes:', tcr_data_train['epitope'].nunique())
print('beta:', tcr_data_train['CDR3B'].nunique(), '\n')

print('Cancer (w/ leaking)')
print('epitopes:', tcr_data_cancer['epitope'].nunique())
print('beta:', tcr_data_cancer['CDR3B'].nunique())

['NENLDLKEL' 'NENLDLQEL' 'GADGVGKSA']
3
leaking epitopes: 0
Series([], Name: epitope, dtype: object) 

Training
epitopes: 754
beta: 18196 

Cancer (w/ leaking)
epitopes: 3
beta: 10


## Test (10%)

In [12]:
# remove additional 10% of epitopes
all_epitopes = tcr_data_train[['epitope', 'epitope_length']].drop_duplicates()
print('Unique epitopes in train/test:', tcr_data_train['epitope'].nunique(), '\n')

test_epitopes = all_epitopes.sample( round(len(all_epitopes)*0.1 ), random_state=27)
print('Unique epitopes in test:', test_epitopes['epitope'].nunique(), '\n')
# validation_random_epitopes['epitope'].nunique() # not sure why has 193

tcr_data_test = tcr_data_train[ tcr_data_train['epitope'].isin(test_epitopes['epitope'])]

assert test_epitopes['epitope'].nunique() == 75

tcr_data_train = tcr_data_train[ ~tcr_data_train['epitope'].isin(test_epitopes['epitope'])]

print('Training (unique)')
print('epitopes:', tcr_data_train['epitope'].nunique())
print('beta:', tcr_data_train['CDR3B'].nunique())
print('total:', tcr_data_train.size, '\n')

print('Test (unique)')
print('epitopes:', tcr_data_test['epitope'].nunique())
print('beta:', tcr_data_test['CDR3B'].nunique())
print('total:', tcr_data_test.size)


Unique epitopes in train/test: 754 

Unique epitopes in test: 75 

Training (unique)
epitopes: 679
beta: 17149
total: 402753 

Test (unique)
epitopes: 75
beta: 1105
total: 16887


In [13]:
# check if test epitopes are in train (leaking)

test_epitopes = tcr_data_test['epitope'].unique()
print('test epitopes:', len(test_epitopes))
print(test_epitopes[1:10])

epitope_pattern = '|'.join(test_epitopes)
leaking_epitopes = tcr_data_train[tcr_data_train['epitope'].str.contains(epitope_pattern, na=False)]

print('leaking epitopes:', leaking_epitopes['epitope'].nunique)
print( leaking_epitopes['epitope'].unique, '\n')

# remove leaking epitopes in train
tcr_data_train = tcr_data_train[~tcr_data_train['epitope'].str.contains(epitope_pattern, na=False)]
tcr_data_train

# add leaking epitopes to validate
tcr_data_test = pd.concat([tcr_data_test, leaking_epitopes])

print('Training')
print('epitopes:', tcr_data_train['epitope'].nunique())
print('beta:', tcr_data_train['CDR3B'].nunique(), '\n')

print('Test (w/ leaking)')
print('epitopes:', tcr_data_test['epitope'].nunique())
print('beta:', tcr_data_test['CDR3B'].nunique())

test epitopes: 75
['FVDGVPFVV' 'FLRGRAYGL' 'FPRPWLHGL' 'YFPLQSYGF' 'NLIDSYFVV' 'CTELKLSDY'
 'LLLGIGILV' 'VVLSWAPPV' 'LLYDANYFL']
leaking epitopes: <bound method IndexOpsMixin.nunique of Series([], Name: epitope, dtype: object)>
<bound method Series.unique of Series([], Name: epitope, dtype: object)> 

Training
epitopes: 679
beta: 17149 

Test (w/ leaking)
epitopes: 75
beta: 1105


# Save

In [None]:
print('Training')
print('epitopes:', tcr_data_train['epitope'].nunique())
print('beta:', tcr_data_train['CDR3B'].nunique(), '\n')
print('total:', tcr_data_train.size)

print('Test')
print('epitopes:', tcr_data_test['epitope'].nunique())
print('beta:', tcr_data_test['CDR3B'].nunique(), '\n')
print('total:', tcr_data_test.size)

print('Cancer Test')
print('epitopes:', tcr_data_cancer['epitope'].nunique())
print('beta:', tcr_data_cancer['CDR3B'].nunique())
print('total:', tcr_data_test.size)

Training
epitopes: 679
alpha: 5455
beta: 17149 

Test
epitopes: 75
alpha: 145
beta: 1105 

Cancer Test
epitopes: 3
alpha: 10
beta: 10


In [15]:
tcr_data_train.to_csv(f'/Volumes/ritd-ag-project-rd0017-bmcha43/CRUK_datathon_2025/processed_data/train_fixed_lengths.csv')
tcr_data_test.to_csv(f'/Volumes/ritd-ag-project-rd0017-bmcha43/CRUK_datathon_2025/processed_data/test_fixed_lengths.csv')
tcr_data_cancer.to_csv(f'/Volumes/ritd-ag-project-rd0017-bmcha43/CRUK_datathon_2025/processed_data/cancer_fixed_lengths.csv')