In [1]:
import pandas as pd
import os
import random
import uuid

from functools import partial
from src.processing.graph import read_pdb_to_dataframe, seperate_tcr_pmhc, build_residue_graph, compute_residue_embedding
from graphein.protein.visualisation import plotly_protein_structure_graph
from graphein.protein.features.sequence.embeddings import compute_esm_embedding

To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
To use the Graphein submodule graphein.protein.visualisation, you need to install: pytorch3d 
To do so, use the following command: conda install -c pytorch3d pytorch3d


# IEDB 3D

Ponomarenko J, Papangelopoulos N, Zajonc DM, Peters B, Sette A, Bourne PE. IEDB-3D: structural data within the immune epitope database. Nucleic Acids Res. 2011 Jan;39(Database issue):D1164-70. doi: 10.1093/nar/gkq888. Epub 2010 Oct 28. PMID: 21030437; PMCID: PMC3013771.

In [39]:
iedb_3d_raw = pd.read_csv('../data/raw/iedb_3d_resolved.csv')
iedb_3d_raw.head()

Unnamed: 0,tcell_id,pubmed_id,Epitope_ID,3DViewer_link,receptor_id,pdb_id,tcr_c1_pdb_chain,tcr_c2_pdb_chain,mhc_c1_pdb_chain,mhc_c2_pdb_chain,...,chain2_cdr2_end_calculated.1,chain2_cdr3_start_calculated.1,chain2_cdr3_end_calculated.1,chain2_cdr1_pdb_pos,chain2_cdr2_pdb_pos,chain2_cdr3_pdb_pos,cdr_missing_res,R_Free,ep_len,Cluster
0,1511834,17644531,59278,http://www.iedb.org/3dViewer.php?complex=1353792,94,2P5E,D,E,A,B,...,51,89,100,"'25', '26', '27', '28', '29'","'47', '48', '49', '50', '51', '52'","'90', '91', '92', '93', '94', '95', '96', '97'...",0,0.17777,9.0,1.A2_B2.a2
1,1511835,17644531,59278,http://www.iedb.org/3dViewer.php?complex=1353793,94,2P5W,D,E,A,B,...,52,90,101,"'25', '26', '27', '28', '29'","'47', '48', '49', '50', '51', '52'","'90', '91', '92', '93', '94', '95', '96', '97'...",0,0.16532,9.0,1.A2_B2.a2
2,1511833,17644531,59278,http://www.iedb.org/3dViewer.php?complex=1353791,94,2PYE,D,E,A,B,...,52,90,101,"'25', '26', '27', '28', '29'","'47', '48', '49', '50', '51', '52'","'90', '91', '92', '93', '94', '95', '96', '97'...",0,0.19172,9.0,1.A2_B2.a2
3,2827288,27036003,538549,http://www.iedb.org/3dViewer.php?complex=1357063,820,5HHO,D,E,A,B,...,52,90,100,"'29', '30', '31', '32', '33'","'51', '52', '53', '54', '55', '56'","'94', '95', '96', '97', '98', '99', '100', '10...",0,0.2359,9.0,2.A5_B5.a2
4,2833312,27238970,12941,http://www.iedb.org/3dViewer.php?complex=1357165,1161,5E9D,D,E,A,B,...,52,90,103,"'27', '28', '29', '30', '31'","'49', '50', '51', '52', '53', '54'","'92', '93', '94', '95', '96', '97', '98', '99'...",0,0.18,10.0,3.A6_B6.a2


### Process data

Script to download files from RCSB http file download services.

The input file requires a comma-separated list of PDB ids

In [40]:
iedb_cols = ['pdb_id', 'epitope_seq', 'H_ORGANISM_NAME', 'mhc_allele_name',
            'chain1_v_gene_calculated', 'chain1_j_gene_calculated', 'chain1_cdr3_seq_calculated',  
            'chain2_v_gene_calculated','chain2_j_gene_calculated', 'chain2_cdr3_seq_calculated']
iedb_3d = iedb_3d_raw[iedb_cols].copy()
print("IEDB Selected Columns:", iedb_3d.shape)

# ignore = ['2P5W', '2P5E']
ignore = []
iedb_3d = iedb_3d[~iedb_3d['pdb_id'].isin(ignore)]

iedb_3d[['pdb_id']].to_csv('../data/utils/iedb_3d_pdb_codes.txt', header=None, index=None)

iedb_3d.columns = ['tcr_id', 'epitope', 'species', 'mhc', 'va', 'ja', 'cdr3a', 'vb', 'jb', 'cdr3b']
iedb_3d.insert(1, 'pmhc_id', iedb_3d['tcr_id'])
iedb_3d['mhc_class'] = 1
iedb_3d['binding'] = 1 # these are all binding

iedb_3d.head()

IEDB Selected Columns: (60, 10)


Unnamed: 0,tcr_id,pmhc_id,epitope,species,mhc,va,ja,cdr3a,vb,jb,cdr3b,mhc_class,binding
0,2P5E,2P5E,SLLMWITQC,Homo sapiens (human),HLA-A*02:01,TRAV21*01,TRAJ6*01,AVRPLLDGTYIPT,TRBV6-5*01,TRBJ2-2*01,ASSYLGNTGELF,1,1
1,2P5W,2P5W,SLLMWITQC,Homo sapiens (human),HLA-A*02:01,TRAV21*01,TRAJ6*01,AVRPLLDGTYIPT,TRBV6-5*01,TRBJ2-2*01,ASSYLGNTGELF,1,1
2,2PYE,2PYE,SLLMWITQC,Homo sapiens (human),HLA-A*02:01,TRAV21*01,TRAJ6*01,AVRPLLDGTYIPT,TRBV6-5*01,TRBJ2-2*01,ASSYLGNTGELF,1,1
3,5HHO,5HHO,GILEFVFTL,Homo sapiens (human),HLA-A*02:01,TRAV27*01,TRAJ42*01,AGAGSQGNLI,TRBV19*01,TRBJ2-7*01,ASSIRSSYEQY,1,1
4,5E9D,5E9D,ELAGIGILTV,Homo sapiens (human),HLA-A*02:01,TRAV12-2*01,TRAJ24*02,AVTKYSWGKLQ,TRBV6-5*01,TRBJ2-7*01,ASRPGWMAGGVELY,1,1


### Generating negative samples

In [41]:
def load_tc_hard(dir: str) -> pd.DataFrame:
    full = pd.DataFrame()
    for path in os.listdir(dir):
        split = pd.read_csv(os.path.join(dir, path), dtype='str')
    full = pd.concat((full, split))
    full = full.drop_duplicates(subset=['cdr3.alpha','cdr3.beta', 'antigen.epitope'])
    return full


assay_ = load_tc_hard('../data/preprocessed/tc-hard/ds.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-neg-assays.full')
random_ = load_tc_hard('../data/preprocessed/tc-hard/ds.hard-splits/pep+cdr3b+cdr3a+MHC/train/only-sampled-negs.full')

tc_hard = pd.concat((assay_, random_)).drop_duplicates(subset=['cdr3.alpha','cdr3.beta', 'antigen.epitope'])
print(tc_hard.shape)
tc_hard_pos = tc_hard[tc_hard['label']=='1'].copy()
print(tc_hard_pos.shape)
tc_hard_neg = tc_hard[tc_hard['label']=='0'].copy()
print(tc_hard_neg.shape)


(107626, 14)
(25687, 14)
(81939, 14)


In [49]:
vdjdb_raw = pd.read_csv('../data/raw/vdjdb_paired_010923.tsv', sep='\t')
iedb_raw = pd.read_csv('../data/raw/iedb_010923.csv')
mcpas_raw = pd.read_csv('../data/raw/mcpas-tcr_010923.csv', encoding='latin1')

# create a lookup dictionary of all positive binding samples
def lookup_dict(df: pd.DataFrame, cdr3a: str, cdr3b: str, epitope: str) -> dict:
    complex = []
    for i in range(len(df.index)):
        if str(df[cdr3a].iloc[i]).startswith('C') or str(df[cdr3b].iloc[i]).startswith('C'):
            complex.append((df[cdr3a].iloc[i][1:-1], df[cdr3b].iloc[i][1:-1], df[epitope].iloc[i]))
        else:
            complex.append((df[cdr3a].iloc[i], df[cdr3b].iloc[i], df[epitope].iloc[i]))
    df['complex'] = complex
    df_dict = df.drop_duplicates(subset='complex').set_index('complex', drop=True).to_dict(orient='index')
    return df_dict

# vdjdb
vdjdb_lc_raw = pd.read_csv('../data/raw/vdjdb_low_confidence.tsv', sep='\t')
vdjdb_lc_a = vdjdb_lc_raw[vdjdb_lc_raw['Gene']=='TRA'].copy()
vdjdb_lc_b = vdjdb_lc_raw[vdjdb_lc_raw['Gene']=='TRB'].copy()
vdjdb_lc = pd.merge(vdjdb_lc_a, vdjdb_lc_b, on='complex.id')

vdjdb_lc_dict = lookup_dict(vdjdb_lc, 'CDR3_x', 'CDR3_y', 'Epitope_y')

#iedb
subset = ['Chain 1 CDR3 Curated', 'Chain 2 CDR3 Curated']
iedb_lc = iedb_raw.dropna(subset=subset)
iedb_lc = iedb_lc.astype('str')

iedb_lc_dict = lookup_dict(iedb_lc, 'Chain 1 CDR3 Curated', 'Chain 2 CDR3 Curated', 'Description')

#mcpas-tcr
mcpas_lc = mcpas_raw.dropna(subset=['CDR3.alpha.aa', 'CDR3.beta.aa', 'Epitope.peptide'])

mcpas_lc_dict = lookup_dict(mcpas_lc, 'CDR3.alpha.aa', 'CDR3.beta.aa', 'Epitope.peptide')

# iedb_3d
iedb_3d_dict = lookup_dict(iedb_3d, 'cdr3a', 'cdr3b', 'epitope')

tc_hard_dict = lookup_dict(tc_hard_pos, 'cdr3.alpha','cdr3.beta', 'antigen.epitope')
tc_hard_neg_dict = lookup_dict(tc_hard_neg, 'cdr3.alpha','cdr3.beta', 'antigen.epitope')



Columns (8,10,11,15,16,17,18,20,21,22,23,25,30,31,36,37,44,45,46,47,48,49,50,51,52,59,60,65,66) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (13,17,24,26,28) have mixed types. Specify dtype option on import or set low_memory=False.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [56]:
i = 0
negatives = []
neg_complex_track = []
while i<len(iedb_3d.index):
    # sample a cdr3a/cdr3b pair and a another random peptide
    sample_idx = random.choice([k for k in range(len(iedb_3d))if k!=i])
    neg_complex = (iedb_3d['cdr3a'].iloc[sample_idx], iedb_3d['cdr3b'].iloc[sample_idx], iedb_3d['epitope'].iloc[i])
    # check if cdr3a-cdr3a-peptide combination not in either iedb_3d/vdjdb/iedb/mcpas
    # .get() will return None if not in dict
    in_iedb_3d_dict = iedb_3d_dict.get(neg_complex)
    in_vdjdb_lc_dict = vdjdb_lc_dict.get(neg_complex)
    in_iedb_lc_dict = iedb_lc_dict.get(neg_complex)
    in_mcpas_lc_dict = mcpas_lc_dict.get(neg_complex)
    in_tc_hard_dict = tc_hard_dict.get(neg_complex)
    in_tc_hard_neg_dict = tc_hard_neg_dict.get(neg_complex)

    if (in_iedb_3d_dict is None) and (in_vdjdb_lc_dict is None) and (in_iedb_lc_dict is None) \
        and (in_mcpas_lc_dict is None) and (in_tc_hard_dict is None):

        # not in either db --> TCR-pMHC complex is negative binding
        sample = iedb_3d.iloc[sample_idx].copy()
        sample['epitope'] =  iedb_3d['epitope'].iloc[i]
        sample['mhc'] = iedb_3d['mhc'].iloc[i]
        sample['pmhc_id'] = iedb_3d['pmhc_id'].iloc[i]
        if tuple(sample.values) not in negatives: # check if generated negative already exists
            negatives.append(tuple(sample.values))
            i+=1
        

print('nb of iterations:', i)
                            
negatives = pd.DataFrame(negatives, columns=iedb_3d.columns)
print('generated negatives:', negatives.shape)
negatives = negatives.drop_duplicates() # this confirms there are no duplicates
print('generated negatives drop duplicates:', negatives.shape)

negatives['binding'] = 0
negatives['uuid'] = [uuid.uuid4() for _ in range(len(negatives.index))]
iedb_3d['uuid'] = [uuid.uuid4() for _ in range(len(iedb_3d.index))]

print('negative samples:', negatives.shape)
negatives.head()

nb of iterations: 60
generated negatives: (60, 15)
generated negatives drop duplicates: (60, 15)
negative samples: (60, 15)


Unnamed: 0,tcr_id,pmhc_id,epitope,species,mhc,va,ja,cdr3a,vb,jb,cdr3b,mhc_class,binding,complex,uuid
0,3VXS,2P5E,SLLMWITQC,Homo sapiens (human),HLA-A*02:01,TRAV21*01,TRAJ12*01,AVRMDSSYKLI,TRBV7-9*01,TRBJ2-2*01,ASSSWDTGELF,1,0,"(AVRMDSSYKLI, ASSSWDTGELF, RYPLTLGWCF)",676a7dae-12a3-4dda-9ab7-468672123421
1,3HG1,2P5W,SLLMWITQC,Homo sapiens (human),HLA-A*02:01,TRAV12-2*01,TRAJ27*01,AVNVAGKST,TRBV30*02,TRBJ2-2*01,AWSETGLGTGELF,1,0,"(AVNVAGKST, AWSETGLGTGELF, ELAGIGILTV)",83702b48-b859-4837-84fc-08dcde59f90d
2,3VXR,2PYE,SLLMWITQC,Homo sapiens (human),HLA-A*02:01,TRAV21*01,TRAJ12*01,AVRMDSSYKLI,TRBV7-9*01,TRBJ2-2*01,ASSSWDTGELF,1,0,"(AVRMDSSYKLI, ASSSWDTGELF, RYPLTFGWCF)",7874c31b-c92a-4966-9ac7-971de23487fc
3,2P5E,5HHO,GILEFVFTL,Homo sapiens (human),HLA-A*02:01,TRAV21*01,TRAJ6*01,AVRPLLDGTYIPT,TRBV6-5*01,TRBJ2-2*01,ASSYLGNTGELF,1,0,"(AVRPLLDGTYIPT, ASSYLGNTGELF, SLLMWITQC)",f75281ea-f2f4-4392-bd8c-3b47eda3b118
4,5WKF,5E9D,ELAGIGILTV,Homo sapiens (human),HLA-A*02:01,TRAV30*01,TRAJ39*01,GLGDAGNMLT,TRBV11-2*01,TRBJ1-2*01,ASSLGQGLLYGYT,1,0,"(GLGDAGNMLT, ASSLGQGLLYGYT, GTSGSPIVNR)",6ab23c86-f017-41ea-b395-da5fe717ca99


In [48]:
# check if any overlap between negatives and positives
pd.merge(iedb_3d, negatives, on=['cdr3a', 'cdr3b', 'epitope'])

Unnamed: 0,tcr_id_x,pmhc_id_x,epitope,species_x,mhc_x,va_x,ja_x,cdr3a,vb_x,jb_x,...,species_y,mhc_y,va_y,ja_y,vb_y,jb_y,mhc_class_y,binding_y,complex_y,uuid_y


In [16]:
### DO NOT RUN TO PREVENT OVERWRITE
# concatenate positive and negative binding samples

# binding_df = pd.concat((iedb_3d, negatives), ignore_index=True)

# print('Total pos/neg binding df:', binding_df.shape)

# save_path = '../data/preprocessed'
# binding_df.to_csv(os.path.join(save_path, 'iedb_3d_binding.tsv'), sep='\t')

Total pos/neg binding df: (120, 15)


# Generate residue graphs

In [2]:
from functools import partial
from src.processing.graph import read_pdb_to_dataframe, seperate_tcr_pmhc, build_residue_graph, compute_residue_embedding
from graphein.protein.visualisation import plotly_protein_structure_graph
from graphein.protein.features.sequence.embeddings import compute_esm_embedding


tsv_path = '../data/preprocessed/iedb_3d_binding.tsv'
pdb_dir = '../data/pdb/iedb_3d_resolved'
pt_save_dir = '../data/graphs/iedb_3d_resolved'

pdb_codes = pd.read_csv('../data/utils/iedb_3d_pdb_codes.txt', header=None)[0].values.tolist()
pdb_id = pdb_codes[1]

egde_dist_threshold = 6.
embedding_function = partial(compute_esm_embedding, representation='residue',\
                 model_name="esm1b_t33_650M_UR50S", output_layer=33)


raw_df, header = read_pdb_to_dataframe(pdb_path=os.path.join(pdb_dir, pdb_id+'.pdb'))
tcr_raw_df, pmhc_raw_df = seperate_tcr_pmhc(raw_df, header['chain_key_dict'], include_b2m=False)

# TCR graph
tcr_g = build_residue_graph(tcr_raw_df, pdb_id, egde_dist_threshold=egde_dist_threshold)
tcr_g = compute_residue_embedding(tcr_g, embedding_function)

# pMHC graph
pmhc_g = build_residue_graph(pmhc_raw_df, pdb_id,  egde_dist_threshold=egde_dist_threshold)
pmhc_g = compute_residue_embedding(pmhc_g, embedding_function)

p1 = plotly_protein_structure_graph(
    tcr_g,
    colour_edges_by="kind",
    colour_nodes_by="seq_position",
    label_node_ids=False,
    plot_title="{} TCR alpha/beta chain Residue Graph".format(pdb_id),
    node_size_multiplier=1
    )

p2 = plotly_protein_structure_graph(
    pmhc_g,
    colour_edges_by="kind",
    colour_nodes_by="seq_position",
    label_node_ids=False,
    plot_title="{} pMHC chain Residue Graph".format(pdb_id),
    node_size_multiplier=1
    )

To use the Graphein submodule graphein.protein.features.sequence.embeddings, you need to install: biovec 
biovec cannot be installed via conda
To use the Graphein submodule graphein.protein.visualisation, you need to install: pytorch3d 
To do so, use the following command: conda install -c pytorch3d pytorch3d


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["node_id"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["residue_id"] = df["node_id"]
Using cache found in /Users/jgbrasier/.cache/torch/hub/facebookresearch_esm_main
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["node_id"] = (
A value is trying to be set on a copy of a slice from a

In [3]:
p1.show()
p2.show()

In [23]:
for n, d in pmhc_g.nodes(data=True):
    print(len(d['embedding']))
    break

1280


# TCR 3D

TCR3d: The T cell receptor structural repertoire database
Ragul Gowthaman, Brian G Pierce, TCR3d: The T cell receptor structural repertoire database, Bioinformatics, , btz517, https://doi.org/10.1093/bioinformatics/btz517

In [2]:
tcr3d_raw = pd.read_csv('../data/raw/tcr3d.csv')
print(tcr3d_raw.shape)

(225, 12)


In [6]:
# filter out class I
tcr3d_cols = ['pdb_id', 'tcr_name', 'mhc_name', 'species', 'epitope',
       'va', 'vb', 'docking_angle',
       'incident_angle',
       'buried_surface', 'shape_complimentarity',
       'affinity']

tcr3d_raw.columns = tcr3d_cols
tcr3d_raw.head()

Unnamed: 0,pdb_id,tcr_name,mhc_name,species,epitope,va,vb,docking_angle,incident_angle,buried_surface,shape_complimentarity,affinity
0,1AO7,A6,HLA-A*02:01,Human,LLFGYPVYV,TRAV12-2,TRBV6-5,33.5,11.0,1964.0,0.64,199
1,1BD2,B7,HLA-A*02:01,Human,LLFGYPVYV,TRAV29/DV5,TRBV6-5,48.5,11.8,1871.4,0.64,22
2,1FO0,BM3.3,H2-Kb,Mouse,INFDFNTI,TRAV16,TRBV1,41.3,12.4,1375.0,0.6,2
3,1G6R,2C,H2-Kb,Mouse,SIYRYYGL,TRAV9-4,TRBV13-2,22.5,8.5,1847.1,0.49,4
4,1KJ2,KB5-C20,H2-Kb,Mouse,KVITFIDL,TRAV14-1,TRBV1,30.6,6.1,1821.1,0.56,0


In [17]:
iedb_full_seq = pd.read_csv('../data/raw/tcell_receptor_table_export_1675206379.csv')
iedb_full_seq[['Chain 1 ID','Chain 1 Chain']] = iedb_full_seq['Chain 1 Accession'].str.split("_", expand=True)
iedb_full_seq[['Chain 2 ID','Chain 2 Chain']] = iedb_full_seq['Chain 2 Accession'].str.split("_", expand=True)

iedb_full_seq.head()



Columns (8,10,11,13,15,16,17,18,19,20,21,22,23,24,25,30,31,36,37,42,44,45,46,47,48,49,50,51,52,53,54,59,60,65,66) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,Group Receptor ID,Receptor ID,Reference ID,Epitope ID,Description,Antigen,Organism,Response Type,Assay IDs,MHC Allele Names,...,Chain 2 CDR2 Curated,Chain 2 CDR2 Calculated,Chain 2 CDR2 Start Curated,Chain 2 CDR2 End Curated,Chain 2 CDR2 Start Calculated,Chain 2 CDR2 End Calculated,Chain 1 ID,Chain 1 Chain,Chain 2 ID,Chain 2 Chain
0,47,57,1004539,69921,VMAPRTLIL,"HLA class I histocompatibility antigen, Cw-3 a...",Homo sapiens (human),T cell,"1548960, 1583178","HLA-E*01:01, HLA-E*01:03",...,,FVKESK,,,47.0,52.0,2ESV,D,2ESV,E
1,47,57,1004539,69921,VMAPRTLIL,"HLA class I histocompatibility antigen, Cw-3 a...",Homo sapiens (human),T cell,1583178,HLA-E*01:03,...,,FVKESK,,,47.0,52.0,2ESV,D,2ESV,E
2,8493,58,1004580,16878,FLRGRAYGL,nuclear antigen EBNA-3,Human herpesvirus 4 (Epstein Barr virus),T cell,"1814845, 1814846, 1814847",HLA-B8,...,,FQNEAQ,,,47.0,52.0,1MI5,D,1MI5,E
3,8493,58,1004580,144889,FLRGRFYGL,,,T cell,1831737,HLA-B8,...,,FQNEAQ,,,47.0,52.0,1MI5,D,1MI5,E
4,8493,58,1017865,142137,EEYLQAFTY,ATP-binding cassette sub-family D member 3,Homo sapiens (human),T cell,1778798,HLA-B*44:05,...,,FQNEAQ,,,47.0,52.0,,,,


In [23]:
pdb_dir = '../data/pdb/tcr3d'
egde_dist_threshold = 6.
embedding_function = partial(compute_esm_embedding, representation='residue',\
                model_name="esm1b_t33_650M_UR50S", output_layer=33)

chain_key_dict = {'mhc': ['A', 'B'], 'tra':['D'], 'trb': ['E'], 'epitope': ['C']}

for i in range(len(tcr3d_raw.index)):
    pdb_id = tcr3d_raw['pdb_id'].iloc[i]
    print(tcr3d_raw.iloc[i])
    possible_tra = iedb_full_seq.loc[iedb_full_seq['Chain 1 ID']==pdb_id, 'Chain 1 CDR3 Calculated'].drop_duplicates()
    for tra in possible_tra:
        if tra in tcr_g.graph['sequence_'+chain_key_dict['tra'][0]]:
            print(tra)
    possible_trb = iedb_full_seq.loc[iedb_full_seq['Chain 2 ID']==pdb_id, 'Chain 2 CDR3 Calculated'].drop_duplicates()
    for trb in possible_trb:
        if trb in tcr_g.graph['sequence_'+chain_key_dict['trb'][0]]:
            print(trb)
    # raw_df, header = read_pdb_to_dataframe(pdb_path=os.path.join(pdb_dir, str(pdb_id).lower()+'.trunc.fit.pdb'), parse_header=False)
    # tcr_raw_df, pmhc_raw_df = seperate_tcr_pmhc(raw_df, chain_key_dict, include_b2m=False)

    # # TCR graph
    # tcr_g = build_residue_graph(tcr_raw_df, pdb_id, egde_dist_threshold=egde_dist_threshold)
    # tcr_g = compute_residue_embedding(tcr_g, embedding_function)

    # # pMHC graph
    # pmhc_g = build_residue_graph(pmhc_raw_df, pdb_id,  egde_dist_threshold=egde_dist_threshold)
    # pmhc_g = compute_residue_embedding(pmhc_g, embedding_function)

    break

pdb_id                          1AO7
tcr_name                          A6
mhc_name                 HLA-A*02:01
species                        Human
epitope                    LLFGYPVYV
va                          TRAV12-2
vb                           TRBV6-5
docking_angle                   33.5
incident_angle                  11.0
buried_surface                1964.0
shape_complimentarity           0.64
affinity                         199
Name: 0, dtype: object
AVTTDSWGKLQ
ASRPGLAGGRPEQY


In [14]:

tra = tcr_g.graph['sequence_'+chain_key_dict['tra'][0]]
tra

'EVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELIMSIYSNGDKEDGRFTAQLNKASQYVSLLIRDSQPSDSATYLCAVTTDSWGKLQFGAGTQVVVTP'

In [11]:
p1 = plotly_protein_structure_graph(
    tcr_g,
    colour_edges_by="kind",
    colour_nodes_by="seq_position",
    label_node_ids=False,
    plot_title="{} TCR alpha/beta chain Residue Graph".format(pdb_id),
    node_size_multiplier=1
    )

p2 = plotly_protein_structure_graph(
    pmhc_g,
    colour_edges_by="kind",
    colour_nodes_by="seq_position",
    label_node_ids=False,
    plot_title="{} pMHC chain Residue Graph".format(pdb_id),
    node_size_multiplier=1
    )

p1.show()
p2.show()