In [1]:
import os
import sys
import uuid

import random

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Dataset preprocessing

In [2]:
vdjdb_raw = pd.read_csv('data/raw/vdjdb_paired_010923.tsv', sep='\t')
iedb_raw = pd.read_csv('data/raw/iedb_010923.csv')
mcpas_raw = pd.read_csv('data/raw/mcpas-tcr_010923.csv', encoding='latin1')

  iedb_raw = pd.read_csv('data/raw/iedb_010923.csv')
  mcpas_raw = pd.read_csv('data/raw/mcpas-tcr_010923.csv', encoding='latin1')


## VDJdb

In [3]:
vdj_cols = ['complex.id', 'CDR3', 'V', 'J', 'Species', 'MHC A','MHC class', 'Epitope']
print('VDJdb raw:', vdjdb_raw.shape)

vdjdb = vdjdb_raw.dropna()
vdjdb_a = vdjdb[vdjdb['Gene']=='TRA'][vdj_cols].copy()
vdjdb_b = vdjdb[vdjdb['Gene']=='TRB'][vdj_cols].copy()

vdjdb = pd.merge(vdjdb_a, vdjdb_b, on='complex.id')
print('VDJdb paired:', vdjdb.shape)

assert (vdjdb['Species_x'] == vdjdb['Species_y']).all()
assert (vdjdb['Epitope_x'] == vdjdb['Epitope_y']).all()
assert (vdjdb['MHC A_x'] == vdjdb['MHC A_y']).all()

vdj_cols = ['complex.id', 'CDR3_x', 'V_x', 'J_x', 'CDR3_y', 'V_y', 'J_y', 'Species_y', 'MHC A_y', 'MHC class_y', 'Epitope_y']
vdjdb = vdjdb[vdj_cols].copy()
vdj_cols = ['complex.id', 'cdr3a', 'va', 'ja', 'cdr3b', 'vb', 'jb', 'species', 'mhc', 'mhc_class', 'epitope']
vdjdb.columns = vdj_cols

# final data formating
try:
    vdjdb = vdjdb.drop('complex.id', axis=1)
except:
    pass
vdjdb = vdjdb.replace('MHCI', 1) # name formatting
vdjdb = vdjdb.replace('HomoSapiens', 'human') # name formatting
vdjdb = vdjdb[vdjdb['species']=='human'].copy() # keep only humans
vdjdb = vdjdb.drop_duplicates()
print("preprocessed vdjdb:", vdjdb.shape)
vdjdb.head()


VDJdb raw: (3050, 17)
VDJdb paired: (1397, 15)
preprocessed vdjdb: (1102, 10)


Unnamed: 0,cdr3a,va,ja,cdr3b,vb,jb,species,mhc,mhc_class,epitope
0,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,human,HLA-B*08,1,FLKEKGGL
1,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,human,HLA-B*08,1,FLKEKGGL
2,CAYRPPGTYKYIF,TRAV38-2/DV8*01,TRAJ40*01,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01,human,HLA-B*08,1,FLKEKGGL
3,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,human,HLA-B*08,1,FLKEQGGL
4,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,human,HLA-B*08,1,FLKEQGGL


## IEDB

IEDB queried for MHC 1 and human only

In [4]:
subset = ['Chain 1 CDR3 Curated', 'Chain 2 CDR3 Curated']
iedb = iedb_raw.dropna(subset=subset)
iedb = iedb.astype('str')
print("Full IEDB:", iedb.shape)

iedb_base_cols = ['Group Receptor ID', 'Receptor ID', 'Description', 'Organism', 'MHC Allele Names']

iedb_cols = [col for col in iedb.columns if "Curated" in col and ("CDR3" in col or "V" in col or "J" in col)]
iedb_cols = ['Description', 'Organism', 'MHC Allele Names', 'Curated Chain 1 V Gene', 'Curated Chain 1 J Gene', 'Chain 1 CDR3 Curated',  
            'Curated Chain 2 V Gene','Curated Chain 2 J Gene', 'Chain 2 CDR3 Curated']
iedb = iedb[iedb_cols]
print("IEDB Selected Columns:", iedb.shape)

iedb = iedb.replace('nan', np.nan)
iedb = iedb.dropna()
iedb.columns = ['epitope', 'species', 'mhc', 'va', 'ja', 'cdr3a', 'vb', 'jb', 'cdr3b']
iedb['mhc_class'] = 1
iedb['species'] = 'human' # IEDB filtered for only human

iedb = iedb[vdjdb.columns]
print('IEDB remove nan:', iedb.shape)
iedb = iedb.drop_duplicates()
print('IEDB remove duplicates:', iedb.shape)

# we need the CDR3 ends for 3D generation, one way to get this is to check if the sequence starts with C
iedb = iedb[iedb['cdr3a'].str.startswith('C')]
iedb = iedb[iedb['cdr3b'].str.startswith('C')]
print('IEDB with cdr3 ends only:', iedb.shape)

# VJ name formating for cases where V, J gene starts with 'TCR' instead of 'TR'
iedb = iedb.replace('TCR', 'TR', regex=True)
iedb

Full IEDB: (24032, 71)
IEDB Selected Columns: (24032, 9)
IEDB remove nan: (6093, 10)
IEDB remove duplicates: (6002, 10)
IEDB with cdr3 ends only: (5483, 10)


Unnamed: 0,cdr3a,va,ja,cdr3b,vb,jb,species,mhc,mhc_class,epitope
81,CAVRPTSGGSYIPTF,TRAV21*01,TRAJ6*01,CASSYVGNTGELFF,TRBV6-5*01,TRBJ2-2*01,human,HLA-A*02:01,1,SLLMWITQC
173,CAGGTGNQFYF,TRAV35*02,TRAJ49*01,CAISEVGVGQPQHF,TRBV10-3,TRBJ1-5*01,human,HLA-A*02:01,1,AAGIGILTV
617,CALSEAGTGGSYIPTF,TRAV19,TRAJ6,CASSMFVGQPQHF,TRBV19,TRBJ1-5,human,HLA-A*02:01,1,GILGFVFTL
618,CAVSVEETSGSRLTF,TRAV41,TRAJ58,CASSFFHNNEQFF,TRBV19,TRBJ2-1,human,HLA-A*02:01,1,GILGFVFTL
619,CAYRSARDSSYKLIF,TRAV38-2/DV8*01,TRAJ12*01,CASSDHSVTGISSPLHF,TRBV7-9*03,TRBJ1-6*02,human,HLA-B7,1,TPRVTGGGAM
...,...,...,...,...,...,...,...,...,...,...
24299,CGAGETSGSRLTF,TRAV21,TRAJ58,CSVNLGGPTDTQYF,TRBV29-1,TRBJ2-3,human,HLA-B*07:02,1,KPVETSNSF
24300,CALEGSQGNLIF,TRAV9-2,TRAJ42,CSVPDGAEPYGYTF,TRBV20-1,TRBJ1-2,human,HLA-A*01:01,1,TTDPSFLGRY
24301,CLVGNTGGFKTIF,TRAV4,TRAJ9,CSVPDRGNTEAFF,TRBV29-1,TRBJ1-1,human,HLA-B*07:02,1,SPRWYFYYL
24303,CAPSRHAGNNRKLIW,TRAV9-2,TRAJ38,CSVQGGTNEKLFF,TRBV29-1,TRBJ1-4,human,HLA-A*01:01,1,VSDGGPNLY


## McPAS-TCR

In [5]:
mcpas_cols = ['CDR3.alpha.aa', 'CDR3.beta.aa', 'Species', 'Epitope.peptide', 'MHC', 'TRAV', 'TRAJ', 'TRBV', 'TRBJ']
mcpas = mcpas_raw[mcpas_cols].copy()
print('McPAS-TCR raw:', mcpas.shape)
mcpas = mcpas.dropna()
print('McPAS-TCR drop na:', mcpas.shape)
mcpas = mcpas.drop_duplicates()
print('McPAS-TCR drop na:', mcpas.shape)

# formatting
mcpas['Species'] = mcpas['Species'].str.lower()
mcpas.columns = ['cdr3a', 'cdr3b', 'species', 'epitope', 'mhc', 'va', 'ja', 'vb', 'jb']
mcpas['mhc_class'] = 1
mcpas = mcpas[vdjdb.columns].copy()
mcpas 

McPAS-TCR raw: (39985, 9)
McPAS-TCR drop na: (3124, 9)
McPAS-TCR drop na: (2914, 9)


Unnamed: 0,cdr3a,va,ja,cdr3b,vb,jb,species,mhc,mhc_class,epitope
73,CALGLMSNYNVLYF,TRAV4,TRAJ14-4,CASSSGLGGTLYF,TRBV10,TRBJ2-4,mouse,H-2Db,1,SSGVENPGGYCLTKW
80,CAAETTASLGKLQF,TRAV11,TRAJ9,CASGDHGLSYEQYF,TRBV13-3,TRBJ2-6,mouse,H-2b,1,DEPLTSLTPRCNTAWNRLKL
84,CALGDRGSGGSNYK,TRAV4,TRAJ3DT,CAWSRTGGNSDYTF,TRBV31,TRBJ1-2,mouse,H-2b,1,DEPLTSLTPRCNTAWNRLKL
120,CAAEASSSFSKLVF,TRAV11-1,TRAJ42,CASAPDRGGERLF,TRBV8-2,TRBJ1-4,mouse,H-2q,1,GPEGAQGPRGEPGTP
121,CAAEASSSFSKLVF,TRAV11-1,TRAJ42,CASAPDRGGERLF,TRBV13-2,TRBJ1-4,mouse,H-2q,1,GPEGAQGPRGEPGTP
...,...,...,...,...,...,...,...,...,...,...
39028,CALDGPSNTGKLIF,TRAV16,TRAJ37,CATSESSGQTYEQYF,TRBV15,TRBJ2-2,human,HLA-A2:01,1,FLCMKALLL
39029,CATDAEGNNRLAF,TRAV17,TRAJ7,CASSIFGGGLGEQFF,TRBV19,TRBJ2-7,human,HLA-A2:01,1,FLCMKALLL
39030,CGAVGYQKVTF,TRAV34,TRAJ13,CALNGEISYNEQFF,TRBV2,TRBJ2-2,human,HLA-A2:01,1,FLCMKALLL
39031,CAVIWYNNNDMRF,TRAV8-1,TRAJ43,CASSQGVNTGELFF,TRBV4-2,TRBJ2-1,human,HLA-A2:01,1,FLCMKALLL


## Combine

In [6]:
positives = pd.concat((iedb, vdjdb, mcpas), axis=0)
print("concate db:", positives.shape)
positives = positives.drop_duplicates(subset=['cdr3a', 'cdr3b', 'epitope'], keep='last', ignore_index=True)
print("concate db remove duplicates:", positives.shape)
positives['uuid'] = [uuid.uuid4() for _ in range(len(positives.index))]

positives

concate db: (9499, 10)
concate db remove duplicates: (8433, 10)


Unnamed: 0,cdr3a,va,ja,cdr3b,vb,jb,species,mhc,mhc_class,epitope,uuid
0,CAVSVEETSGSRLTF,TRAV41,TRAJ58,CASSFFHNNEQFF,TRBV19,TRBJ2-1,human,HLA-A*02:01,1,GILGFVFTL,83715c0e-fc2c-406b-8b9e-7729ca9610c4
1,CAVNKGYGQNFVF,TRAV12-2*02,TRAJ26*01,CASSPAGISYNSPLHF,TRBV7-9*03,TRBJ1-6*01,human,HLA-B8,1,ELRRKMMYM,52c8108b-efe8-4b2b-8c97-0c8254b87d10
2,CAVRDSSYSGAGSYQLTF,TRAV3*01,TRAJ28*01,CASSRLAGASTDTQYF,TRBV7-3*01,TRBJ2-3*01,human,HLA-B8,1,QIKVRVDMV,534ca8a8-273e-4335-85d1-9087b9219002
3,CAVSDYGQNFVF,TRAV21*01,TRAJ26*01,CASSRLSSNTDTQYF,TRBV7-3*01,TRBJ2-3*01,human,HLA-B8,1,QIKVRVDMV,77d066be-28b8-4831-ab68-1a36ab7640ce
4,CATAQVYSGGGADGLTF,TRAV17*01,TRAJ45*01,CASSRLAGNTDTQYF,TRBV7-3*01,TRBJ2-3*01,human,HLA-B8,1,QIKVRVDMV,57b98cbb-8016-4af7-bdd9-cc40277d944a
...,...,...,...,...,...,...,...,...,...,...,...
8428,CALDGPSNTGKLIF,TRAV16,TRAJ37,CATSESSGQTYEQYF,TRBV15,TRBJ2-2,human,HLA-A2:01,1,FLCMKALLL,569898b6-fd50-442b-9909-d6b947a41efa
8429,CATDAEGNNRLAF,TRAV17,TRAJ7,CASSIFGGGLGEQFF,TRBV19,TRBJ2-7,human,HLA-A2:01,1,FLCMKALLL,ced3abaa-ea80-4c13-88d4-7b39a20b6ff7
8430,CGAVGYQKVTF,TRAV34,TRAJ13,CALNGEISYNEQFF,TRBV2,TRBJ2-2,human,HLA-A2:01,1,FLCMKALLL,1fe897dd-350a-4cb1-882c-c4236c1b7d00
8431,CAVIWYNNNDMRF,TRAV8-1,TRAJ43,CASSQGVNTGELFF,TRBV4-2,TRBJ2-1,human,HLA-A2:01,1,FLCMKALLL,6f169eb9-55d9-484b-9ab2-5670f4b018bd


In [7]:
complex = []
for i in range(len(positives)):
    complex.append((positives['cdr3a'].iloc[i], positives['cdr3b'].iloc[i], positives['epitope'].iloc[i]))
positives['complex'] = complex
positives_dict = positives.drop_duplicates(subset='complex').set_index('complex', drop=True).to_dict(orient='index')
try:
    positives = positives.drop('complex', axis=1)
except:
    pass
print('unique tcr-peptide pairs:', len(positives_dict.keys()))

unique tcr-peptide pairs: 8433


# Negative sample generation

In [8]:
# create a lookup dictionary of all positive binding samples
def lookup_dict(df: pd.DataFrame, cdr3a: str, cdr3b: str, epitope: str) -> dict:
    complex = []
    for i in range(len(df.index)):
        complex.append((df[cdr3a].iloc[i], df[cdr3b].iloc[i], df[epitope].iloc[i]))
    df['complex'] = complex
    df_dict = df.drop_duplicates(subset='complex').set_index('complex', drop=True).to_dict(orient='index')
    return df_dict

# vdjdb
vdjdb_lc_raw = pd.read_csv('data/raw/vdjdb_low_confidence.tsv', sep='\t')
vdjdb_lc_a = vdjdb_lc_raw[vdjdb_lc_raw['Gene']=='TRA'].copy()
vdjdb_lc_b = vdjdb_lc_raw[vdjdb_lc_raw['Gene']=='TRB'].copy()
vdjdb_lc = pd.merge(vdjdb_lc_a, vdjdb_lc_b, on='complex.id')

vdjdb_lc_dict = lookup_dict(vdjdb_lc, 'CDR3_x', 'CDR3_y', 'Epitope_y')

#iedb
subset = ['Chain 1 CDR3 Curated', 'Chain 2 CDR3 Curated']
iedb_lc = iedb_raw.dropna(subset=subset)
iedb_lc = iedb_lc.astype('str')

iedb_lc_dict = lookup_dict(iedb_lc, 'Chain 1 CDR3 Curated', 'Chain 2 CDR3 Curated', 'Description')

#mcpas-tcr
mcpas_lc = mcpas_raw.dropna(subset=['CDR3.alpha.aa', 'CDR3.beta.aa', 'Epitope.peptide'])

mcpas_lc_dict = lookup_dict(mcpas_lc, 'CDR3.alpha.aa', 'CDR3.beta.aa', 'Epitope.peptide')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['complex'] = complex


In [11]:
i = 0
negatives = []
while i<len(positives.index):
    sample_idx = random.choice([k for k in range(len(positives)) if k!=i])
    neg_complex = (positives['cdr3a'].iloc[sample_idx], positives['cdr3b'].iloc[sample_idx], positives['epitope'].iloc[i])
    try:
        # sample already exists in either vdjdb/iedb/mcpas-tcr and it's positive --> we do not want this
        vdjdb_lc_dict[neg_complex]
        iedb_lc_dict[neg_complex]
        mcpas_lc_dict[neg_complex]
        
    except:
        # negative sample is not positive
        sample = positives.iloc[sample_idx].copy()
        sample['epitope'] =  positives['epitope'].iloc[i]
        sample['mhc'] = positives['mhc'].iloc[i]
        if tuple(sample.values) not in negatives: # check if generated negative already exists
            negatives.append(tuple(sample.values))
            i+=1
negatives = pd.DataFrame(negatives, columns=positives.columns)
print('generated negatives:', negatives.shape)
negatives = negatives.drop_duplicates() # this confirms there are no duplicates
print('generated negatives drop duplicates:', negatives.shape)

negatives['uuid'] = [uuid.uuid4() for _ in range(len(negatives.index))]
print('negative samples:', negatives.shape)
negatives

generated negatives: (8433, 11)
generated negatives drop duplicates: (8433, 11)
negative samples: (8433, 11)


Unnamed: 0,cdr3a,va,ja,cdr3b,vb,jb,species,mhc,mhc_class,epitope,uuid
0,CALSGTTDSWGKLQF,TRAV9-2,TRAJ24,CASTERGPADTQYF,TRBV27,TRBJ2-3,human,HLA-A*02:01,1,GILGFVFTL,d5b46332-8afc-4b52-99dc-798ff7106c9e
1,CALGDLSGGNSGGYQKVTF,TRAV1*01,TRAJ13*01,CASSQDTGAGGSHEQYF,TRBV4-1*01,TRBJ2-7*01,human,HLA-B8,1,ELRRKMMYM,e542e3a8-6217-4ec3-8591-dd1681cf0cad
2,CAAPLMNSGYSTLTF,TRAV1-1,TRAJ11,CASTLAHGTTMGYTF,TRBV13,TRBJ1-2,human,HLA-B8,1,QIKVRVDMV,bf5a5a27-97d3-40a6-9423-15925fa8c0b9
3,CAVNVDTGFQKLVF,TRAV12-2,TRAJ8,CASSLSYRGNTEAFF,TRBV27,TRBJ1-1,human,HLA-B8,1,QIKVRVDMV,d144c68e-0488-4d2d-804e-c880e981b9fb
4,CAVDDLYSNYQLIW,TRAV41-01,TRAJ3-01,CASSLSGGINEQFF,TRBV19:02,TRBJ2-1:01,human,HLA-B8,1,QIKVRVDMV,84e4f7fb-c6cc-4f0b-a1e5-b5e3669d1e0f
...,...,...,...,...,...,...,...,...,...,...,...
8428,CAALNTDKLIF,TRAV25:01,TRAJ54:01,CASSLWGGTEAFF,TRBV27:01,TRBJ1-1:01,human,HLA-A2:01,1,FLCMKALLL,e538cd0d-b618-4529-b1e1-4f894e8ccafc
8429,CAMREGRSARLMF,TRAV14/DV4*01,TRAJ31*01,CSARGGFRSGGGTDEQFF,TRBV20-1*01,TRBJ2-1*01,human,HLA-A2:01,1,FLCMKALLL,9905703c-01c0-4c07-bef5-e71c1a341c24
8430,CAIGFQKLVF,TRAV13-1,TRAJ8,CASSLPKTINSEQYF,TRBV28,TRBJ2-7,human,HLA-A2:01,1,FLCMKALLL,a1ac6096-e0a6-4aa3-a206-55834aa56219
8431,CAVSSNNRIFF,TRAV3-3:01,TRAJ3-01,CACGGADSAETLYF,TRBV1-01,TRBJ2-3:01,mouse,HLA-A2:01,1,FLCMKALLL,ffac975e-2d49-4a12-89c2-d8451495652d


# Saving Data

In [12]:
save_path = 'data/preprocessed'
positives.to_csv(os.path.join(save_path, 'positives.tsv'), sep='\t')
negatives.to_csv(os.path.join(save_path, 'negatives.tsv'), sep='\t')