In [2]:
import os
import re
import string
import warnings
from typing import Callable, Dict, Generator, List, Optional

import pandas as pd
from graphein.protein import plotly_protein_structure_graph
from graphein.protein.features.sequence.embeddings import esm_residue_embedding, compute_esm_embedding

from src.dataset import TCRpMHCGraphDataset
from src.processing.pdb_graph import read_pdb_to_dataframe, seperate_tcr_pmhc, build_residue_graph



In [5]:
# Build one sequence-level ESM embedding per entry of the cdr3a, cdr3b and
# epitope columns of the binding test table.
data = pd.read_csv('../data/preprocessed/iedb_3d_binding_test.tsv', sep='\t')

# The recorded output shows identical embeddings on several rows, i.e. the same
# sequence strings occur more than once. Each distinct string is therefore
# embedded only once and the result is reused for its duplicates.
_esm_sequence_cache = {}


def _embed_sequence(sequence):
    """Return the ESM embedding of ``sequence``, computing it at most once.

    Args:
        sequence: Sequence text passed unchanged to ``compute_esm_embedding``.

    Returns:
        Whatever ``compute_esm_embedding(sequence, representation='sequence')``
        returns. Repeated calls with the same string return the *same* object,
        so copy it before modifying it in place.
    """
    if sequence not in _esm_sequence_cache:
        _esm_sequence_cache[sequence] = compute_esm_embedding(sequence, representation='sequence')
    return _esm_sequence_cache[sequence]


# NOTE(review): str() turns a missing value into the literal text 'nan', which
# is then embedded like any other sequence -- confirm the table has no gaps.
embedding_dict = {
    column: [_embed_sequence(str(value)) for value in data[column]]
    for column in ('cdr3a', 'cdr3b', 'epitope')
}

# One row per TSV record, one column per embedded field.
embeddings = pd.DataFrame.from_dict(embedding_dict)



Using cache found in /Users/jgbrasier/.cache/torch/hub/facebookresearch_esm_main


In [6]:
# Show the embeddings table assembled above: one row per TSV record, with the
# cdr3a, cdr3b and epitope embeddings in separate columns.
embeddings

Unnamed: 0,cdr3a,cdr3b,epitope
0,"[0.13055126, 0.09437767, 0.009727462, 0.178527...","[0.07276406, 0.07469042, -0.040368844, 0.20737...","[0.112655014, -0.05528457, 0.13390647, 0.22884..."
1,"[0.13055126, 0.09437767, 0.009727462, 0.178527...","[0.07276406, 0.07469042, -0.040368844, 0.20737...","[0.112655014, -0.05528457, 0.13390647, 0.22884..."
2,"[0.13055126, 0.09437767, 0.009727462, 0.178527...","[0.07276406, 0.07469042, -0.040368844, 0.20737...","[0.112655014, -0.05528457, 0.13390647, 0.22884..."
3,"[0.052681178, 0.005951345, 0.041014038, 0.3284...","[-0.0044022035, 0.036872055, -0.03888404, 0.20...","[0.15952334, 0.057686474, 0.07444611, 0.130991..."
4,"[0.07079705, 0.035959765, 0.001471044, 0.21581...","[0.20185709, -0.053916536, 0.1284139, 0.185899...","[0.055020463, 0.054593235, 0.07720377, 0.24940..."
5,"[0.06979371, 0.06306952, 0.0101341205, 0.11267...","[0.22946438, -0.10129268, 0.12075286, 0.197249...","[0.0792308, 0.040893335, 0.103786156, 0.194984..."
6,"[0.06979371, 0.06306952, 0.0101341205, 0.11267...","[0.22946438, -0.10129268, 0.12075286, 0.197249...","[0.055020463, 0.054593235, 0.07720377, 0.24940..."
7,"[0.06979371, 0.06306952, 0.0101341205, 0.11267...","[0.22946438, -0.10129268, 0.12075286, 0.197249...","[0.014188701, -0.054996543, 0.10896827, 0.2391..."
8,"[0.073560014, 0.051811155, -0.10557967, 0.2081...","[0.11693764, 0.001072069, 0.13353771, 0.187752...","[0.11727484, 0.08372926, 0.033094916, 0.240599..."
9,"[0.01866818, 0.08724352, -0.029719347, 0.18073...","[0.0564062, 0.04437639, -0.012701153, 0.202727...","[0.11727484, 0.08372926, 0.033094916, 0.240599..."


In [74]:
# Process the bound TCR-pMHC structures listed in the TSV with ESM residue
# embeddings as node features.
# NOTE(review): presumably one graph file per structure is written under
# `pt_save_dir` -- confirm against TCRpMHCGraphDataset.process_bound_pdb.
ignore = []  # forwarded as the `ignore` argument; nothing is excluded here

tsv_path = '../data/preprocessed/iedb_3d_binding_test.tsv'
pdb_dir = '../data/pdb/iedb_3d_resolved'
pt_save_dir = '../data/graphs/iedb_binding_test'

# Silence warnings only while the dataset is built and processed. A bare
# module-level filterwarnings("ignore") would stay active for the rest of the
# kernel session and hide warnings raised by every later cell.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    tcrpmhc_dataset = TCRpMHCGraphDataset(pdb_dir=pdb_dir, tsv_path=tsv_path)
    tcrpmhc_dataset.process_bound_pdb(
        out_path=pt_save_dir,
        node_embedding_function=esm_residue_embedding,
        ignore=ignore,
    )

100%|██████████| 60/60 [01:22<00:00,  1.37s/it]


In [56]:
# Inspect a single structure: parse its PDB file, split it into TCR and pMHC
# parts, build a residue graph for each, and plot the TCR graph.
# NOTE(review): the recorded output of this cell is
# "ValueError: Header parsing error for key: tra in protein 3MV8"; presumably it
# is raised while the PDB header is parsed -- confirm in read_pdb_to_dataframe.
pdb_code = '3MV8'
datadir = '../data/pdb/iedb_3d_resolved'
pdb_path = os.path.join(datadir, pdb_code+'.pdb')

# Lower-cased epitope value(s) stored for this PDB id in the binding table.
data = pd.read_csv('../data/preprocessed/iedb_3d_binding_test.tsv', sep='\t')
epitope = data.loc[data['id'] == pdb_code, 'epitope'].str.lower().values

# The header carries the chain-key mapping used to split the atom table.
raw_df, header = read_pdb_to_dataframe(pdb_path=pdb_path)
print(header['chain_key_dict'])
tcr_raw_df, pmhc_raw_df = seperate_tcr_pmhc(raw_df, header['chain_key_dict'])

# Residue graphs for both halves; only the TCR graph is plotted below.
tcr_g, pmhc_g = (
    build_residue_graph(tcr_raw_df, pdb_code),
    build_residue_graph(pmhc_raw_df, pdb_code),
)

p = plotly_protein_structure_graph(
    tcr_g,
    colour_nodes_by="seq_position",
    colour_edges_by="kind",
    node_size_multiplier=1,
    label_node_ids=False,
    plot_title="{} TCR alpha/beta chain Residue Graph".format(pdb_code),
)
p.show()


ValueError: Header parsing error for key: tra in protein 3MV8