In [1]:
from pypdb import *
import json
import pandas as pd

#### Run `pip install pypdb` before trying this notebook. For more information, see the PyPDB repository: https://github.com/williamgilpin/pypdb
#### This notebook demonstrates how to query the RCSB Protein Data Bank (PDB) with a protein sequence. I use PyPDB, a Python API for performing basic searches on the PDB. Searches using a protein/DNA/RNA sequence (FASTA) employ the MMseqs2 software to perform fast sequence-matching searches (similar to BLAST). Only one sequence is accepted per query. There are two possible return types for a sequence query:
- return_type = entry: This will return only PDB IDs.
- return_type = polymer_entity: This will return the PDB IDs of the first 10 results along with parameters such as percentage identity, E-values, bitscores, sequence lengths, etc.

#### Note: The result is based on sequence similarity, NOT structure similarity. Although Pfam also uses sequence similarity, Pfam and PDB use different databases, so results from PDB can be compared with Pfam results.

In [3]:
# Sequence search with return_type="entry": the search yields only PDB IDs.
# The author cross-checked the IDs by searching the same sequence on the PDB site.
return_entry = Query(
    "MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNHLYTHIDPAEQQDDLTAEVEVTDGEAFVLHPGEFVLASTLEVITLGDQLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHVTLELSNVANLPIKLWPGMKIGQLCIFRLSSPAEHPYGSAVYGSRYQGQRGPTPSRSAQNFRLWPTS",
    query_type="sequence",
    return_type="entry",
)
print(return_entry.search())

['2QXX', '4A6A', '2QLP', '1XS1', '2V9X', '1XS4', '1XS6', '2J4Q', '2J4H', '4XJC', '1OGH', '1PKH', '1PKJ', '1PKK', '2HXB', '3GF0', '2HXD', '3KM3', '4DHK', '1DUC', '1DUN', '4GK6']


In [14]:
# Sequence search with return_type="polymer_entity": yields the first 10 results
# as a JSON object (similar to a Python dict). It carries more useful
# information than the "entry" form, so the rest of the notebook uses it.
return_polymer_entity = Query(
    "MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNHLYTHIDPAEQQDDLTAEVEVTDGEAFVLHPGEFVLASTLEVITLGDQLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHVTLELSNVANLPIKLWPGMKIGQLCIFRLSSPAEHPYGSAVYGSRYQGQRGPTPSRSAQNFRLWPTS",
    query_type="sequence",
    return_type="polymer_entity",
)
print(return_polymer_entity.search())

{'query_id': '99444fc7-ac07-43f8-bae0-224e613fd972', 'result_type': 'polymer_entity', 'total_count': 22, 'result_set': [{'identifier': '2QXX_1', 'score': 1.0, 'services': [{'service_type': 'sequence', 'nodes': [{'node_id': 18501, 'original_score': 304.0, 'norm_score': 1.0, 'match_context': [{'sequence_identity': 0.79, 'evalue': 9.01e-94, 'bitscore': 304, 'alignment_length': 186, 'mismatches': 39, 'gaps_opened': 0, 'query_beg': 1, 'query_end': 186, 'subject_beg': 1, 'subject_end': 186, 'query_length': 192, 'subject_length': 190, 'query_aligned_seq': 'MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNHLYTHIDPAEQQDDLTAEVEVTDGEAFVLHPGEFVLASTLEVITLGDQLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHVTLELSNVANLPIKLWPGMKIGQLCIFRLSSPAEHPYGSAVYGSRYQGQRGPTPSRSAQNF', 'subject_aligned_seq': 'MLLSDRDLRAEISSGRLGIDPFDDTLVQPSSIDVRLDCLFRVFNNTRYTHIDPAKQQDELTSLVQPVDGEPFVLHPGEFVLGSTLELFTLPDNLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHITLELSNVANLPITLWPGMKIGQLCMLRLTSPSEHPYGSSRAGSKYQGQRGPTPSRSYQNF'}]}]}]}, {'identifier': '4A6A_1', 'score': 0

In [28]:
# Keep only the 'result_set' list and discard the surrounding response fields
# (e.g. result_type, total_count) so the hits can be loaded into a DataFrame.
search_response = return_polymer_entity.search()
result_set = search_response['result_set']
print(result_set)

[{'identifier': '2QXX_1', 'score': 1.0, 'services': [{'service_type': 'sequence', 'nodes': [{'node_id': 23592, 'original_score': 304.0, 'norm_score': 1.0, 'match_context': [{'sequence_identity': 0.79, 'evalue': 9.01e-94, 'bitscore': 304, 'alignment_length': 186, 'mismatches': 39, 'gaps_opened': 0, 'query_beg': 1, 'query_end': 186, 'subject_beg': 1, 'subject_end': 186, 'query_length': 192, 'subject_length': 190, 'query_aligned_seq': 'MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNHLYTHIDPAEQQDDLTAEVEVTDGEAFVLHPGEFVLASTLEVITLGDQLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHVTLELSNVANLPIKLWPGMKIGQLCIFRLSSPAEHPYGSAVYGSRYQGQRGPTPSRSAQNF', 'subject_aligned_seq': 'MLLSDRDLRAEISSGRLGIDPFDDTLVQPSSIDVRLDCLFRVFNNTRYTHIDPAKQQDELTSLVQPVDGEPFVLHPGEFVLGSTLELFTLPDNLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHITLELSNVANLPITLWPGMKIGQLCMLRLTSPSEHPYGSSRAGSKYQGQRGPTPSRSYQNF'}]}]}]}, {'identifier': '4A6A_1', 'score': 0.9923076923076923, 'services': [{'service_type': 'sequence', 'nodes': [{'node_id': 23592, 'original_score': 302.0, 'nor

In [9]:
# Convert a single hit into a DataFrame as a first experiment. The remaining
# details stay nested under `services.nodes` and can be exploded later.
data = [
    {
        'identifier': '2QXX_1',
        'score': 1.0,
        'services': [
            {
                'service_type': 'sequence',
                'nodes': [
                    {
                        'node_id': 32424,
                        'original_score': 304.0,
                        'norm_score': 1.0,
                        'match_context': [
                            {
                                'sequence_identity': 0.79,
                                'evalue': 8.995e-94,
                                'bitscore': 304,
                                'alignment_length': 186,
                                'mismatches': 39,
                                'gaps_opened': 0,
                                'query_beg': 1,
                                'query_end': 186,
                                'subject_beg': 1,
                                'subject_end': 186,
                                'query_length': 192,
                                'subject_length': 190,
                                'query_aligned_seq': 'MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNHLYTHIDPAEQQDDLTAEVEVTDGEAFVLHPGEFVLASTLEVITLGDQLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHVTLELSNVANLPIKLWPGMKIGQLCIFRLSSPAEHPYGSAVYGSRYQGQRGPTPSRSAQNF',
                                'subject_aligned_seq': 'MLLSDRDLRAEISSGRLGIDPFDDTLVQPSSIDVRLDCLFRVFNNTRYTHIDPAKQQDELTSLVQPVDGEPFVLHPGEFVLGSTLELFTLPDNLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHITLELSNVANLPITLWPGMKIGQLCMLRLTSPSEHPYGSSRAGSKYQGQRGPTPSRSYQNF',
                            },
                        ],
                    },
                ],
            },
        ],
    },
]
# One row per 'services' record; 'identifier' and 'score' are carried along as
# metadata, and the record columns get a 'services.' prefix.
df = pd.json_normalize(
    data,
    record_path='services',
    meta=['identifier', 'score'],
    record_prefix='services.',
)
df

Unnamed: 0,services.service_type,services.nodes,identifier,score
0,sequence,"[{'node_id': 32424, 'original_score': 304.0, '...",2QXX_1,1.0


In [11]:
# Check that two hits are laid out correctly, and pull the fields of the
# nested 'match_context' records up into their own columns.
data2 = [
    {
        'identifier': '2QXX_1',
        'score': 1.0,
        'services': [
            {
                'service_type': 'sequence',
                'nodes': [
                    {
                        'node_id': 32424,
                        'original_score': 304.0,
                        'norm_score': 1.0,
                        'match_context': [
                            {
                                'sequence_identity': 0.79,
                                'evalue': 8.995e-94,
                                'bitscore': 304,
                                'alignment_length': 186,
                                'mismatches': 39,
                                'gaps_opened': 0,
                                'query_beg': 1,
                                'query_end': 186,
                                'subject_beg': 1,
                                'subject_end': 186,
                                'query_length': 192,
                                'subject_length': 190,
                                'query_aligned_seq': 'MLLSDRDLVSEIKSGDLSLEPF',
                                'subject_aligned_seq': 'MLLSDRDLRAEISSGRLGIDPFDDTL',
                            },
                        ],
                    },
                ],
            },
        ],
    },
    {
        'identifier': '4A6A_1',
        'score': 0.9923076923076923,
        'services': [
            {
                'service_type': 'sequence',
                'nodes': [
                    {
                        'node_id': 32424,
                        'original_score': 302.0,
                        'norm_score': 0.9923076923076923,
                        'match_context': [
                            {
                                'sequence_identity': 0.784,
                                'evalue': 3.176e-93,
                                'bitscore': 302,
                                'alignment_length': 186,
                                'mismatches': 40,
                                'gaps_opened': 0,
                                'query_beg': 1,
                                'query_end': 186,
                                'subject_beg': 1,
                                'subject_end': 186,
                                'query_length': 192,
                                'subject_length': 190,
                                'query_aligned_seq': 'MLLSDRDLVSEIKSGDLSLEPFEP',
                                'subject_aligned_seq': 'MLLSDRDLRAEISSGRLGIDPFDDTLVQ',
                            },
                        ],
                    },
                ],
            },
        ],
    },
]

# One row per match_context record; the hit-level 'identifier' and 'score' and
# the service-level 'service_type' are attached to each row as metadata.
df2 = pd.json_normalize(
    data2,
    record_path=['services', 'nodes', 'match_context'],
    meta=['identifier', 'score', ['services', 'service_type']],
)
df2


Unnamed: 0,sequence_identity,evalue,bitscore,alignment_length,mismatches,gaps_opened,query_beg,query_end,subject_beg,subject_end,query_length,subject_length,query_aligned_seq,subject_aligned_seq,identifier,score,services.service_type
0,0.79,8.995e-94,304,186,39,0,1,186,1,186,192,190,MLLSDRDLVSEIKSGDLSLEPF,MLLSDRDLRAEISSGRLGIDPFDDTL,2QXX_1,1.0,sequence
1,0.784,3.1760000000000003e-93,302,186,40,0,1,186,1,186,192,190,MLLSDRDLVSEIKSGDLSLEPFEP,MLLSDRDLRAEISSGRLGIDPFDDTLVQ,4A6A_1,0.992308,sequence


In [12]:
# Load all 10 hits from the query into one DataFrame, one row per
# match_context record. The author confirmed this matches the same query run
# directly on PDB.
df_result = pd.json_normalize(
    result_set,
    record_path=['services', 'nodes', 'match_context'],
    meta=['identifier', 'score', ['services', 'service_type']],
)
df_result

Unnamed: 0,sequence_identity,evalue,bitscore,alignment_length,mismatches,gaps_opened,query_beg,query_end,subject_beg,subject_end,query_length,subject_length,query_aligned_seq,subject_aligned_seq,identifier,score,services.service_type
0,0.79,9.01e-94,304,186,39,0,1,186,1,186,192,190,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MLLSDRDLRAEISSGRLGIDPFDDTLVQPSSIDVRLDCLFRVFNNT...,2QXX_1,1.0,sequence
1,0.784,3.1810000000000002e-93,302,186,40,0,1,186,1,186,192,190,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MLLSDRDLRAEISSGRLGIDPFDDTLVQPSSIDVRLDCLFRVFNNT...,4A6A_1,0.992308,sequence
2,0.788,1.151e-79,263,161,34,0,1,161,1,161,192,161,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MLLSDRDLRAEISSGRLGIDPFDDTLVQPSSIDVRLDCLFRVFNNT...,2QLP_1,0.842308,sequence
3,0.431,1.372e-36,139,190,98,3,1,181,1,189,192,193,MLLSDRDLVSEIKSGDLSLEPFEPA-LLQPSSIDVRLDRFFRVFNN...,MRLCDRDIEAWLDEGRLSINPRPPVERINGATVDVRLGNKFRTFRG...,1XS1_1,0.365385,sequence
4,0.426,3.52e-36,138,190,99,3,1,181,1,189,192,193,MLLSDRDLVSEIKSGDLSLEPFEPA-LLQPSSIDVRLDRFFRVFNN...,MRLCDRDIEAWLDEGRLSINPRPPVERINGATVDVRLGNKFRTFRG...,2V9X_1,0.361538,sequence
5,0.426,9.025999999999999e-36,137,190,99,3,1,181,1,189,192,193,MLLSDRDLVSEIKSGDLSLEPFEPA-LLQPSSIDVRLDRFFRVFNN...,MRLCDRDIEAWLDEGRLSINPRPPVERINGATVDVRLGNKFRTFRG...,1XS4_1,0.357692,sequence
6,0.426,9.025999999999999e-36,137,190,99,3,1,181,1,189,192,193,MLLSDRDLVSEIKSGDLSLEPFEPA-LLQPSSIDVRLDRFFRVFNN...,MRLCDRDIEAWLDEGRLSINPRPPVERINGATVDVRLGNKFRTFRG...,1XS6_1,0.357692,sequence
7,0.426,9.025999999999999e-36,137,190,99,3,1,181,1,189,192,193,MLLSDRDLVSEIKSGDLSLEPFEPA-LLQPSSIDVRLDRFFRVFNN...,MRLCDRDIEAWLDEGRLSINPRPPVERINGATVDVRLGNKFRTFRG...,2J4Q_1,0.357692,sequence
8,0.426,3.168e-35,135,190,99,3,1,181,1,189,192,193,MLLSDRDLVSEIKSGDLSLEPFEPA-LLQPSSIDVRLDRFFRVFNN...,MRLCDRDIEAWLDEGRLSINPRPPVERINGATVDVRLGNKFRTFRG...,2J4H_1,0.35,sequence
9,0.338,6.595e-22,96,183,105,4,1,180,1,170,192,177,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MILSGKTISEKLTEKELEITPLTEEQIQPASVDLRLGPHFVTIDDS...,4XJC_1,0.2,sequence


In [13]:
# Keep only the columns of most interest, in order of importance. The list can
# be changed at any time; this frame is the planned final output of the feature.
df_result_reind = df_result.reindex(
    columns=[
        'identifier',
        'score',
        'sequence_identity',
        'evalue',
        'bitscore',
    ]
)
df_result_reind

Unnamed: 0,identifier,score,sequence_identity,evalue,bitscore
0,2QXX_1,1.0,0.79,9.01e-94,304
1,4A6A_1,0.992308,0.784,3.1810000000000002e-93,302
2,2QLP_1,0.842308,0.788,1.151e-79,263
3,1XS1_1,0.365385,0.431,1.372e-36,139
4,2V9X_1,0.361538,0.426,3.52e-36,138
5,1XS4_1,0.357692,0.426,9.025999999999999e-36,137
6,1XS6_1,0.357692,0.426,9.025999999999999e-36,137
7,2J4Q_1,0.357692,0.426,9.025999999999999e-36,137
8,2J4H_1,0.35,0.426,3.168e-35,135
9,4XJC_1,0.2,0.338,6.595e-22,96


#### The cells below contain potential tests for this feature. Complete test functions will be written later.

In [43]:
# Test that the result returned by the query is a Python dict. If it is not,
# an AssertionError with the message below is raised.
# isinstance() is used instead of an exact `type(...) == dict` comparison so
# that dict subclasses are accepted as well.
assert isinstance(return_polymer_entity.search(), dict), 'Unexpected type returned by search() method.'

In [44]:
# Test that the unnecessary top-level keys were dropped from the returned
# result, so that `result_set` is now a list of per-hit dicts.
assert isinstance(result_set, list), 'Unexpected type returned'
# `'query_id' in result_set` would compare the string against the list's
# elements (dicts), which can never match, so the check would pass vacuously.
# Look inside each entry instead.
for entry in result_set:
    for dropped_key in ('query_id', 'result_type', 'total_count'):
        if dropped_key in entry:
            raise ValueError(f"Unexpected '{dropped_key}' key found in result")


#### Please ignore the cells below; I was experimenting with several approaches.

In [13]:
# Experiment: stop one level higher than match_context, giving one row per
# 'nodes' record with the match_context list left nested in its own column.
df_result2 = pd.json_normalize(
    result_set,
    record_path=['services', 'nodes'],
    meta=['identifier', 'score', ['services', 'service_type']],
)
df_result2

Unnamed: 0,node_id,original_score,norm_score,match_context,identifier,score,services.service_type
0,29219,304.0,1.0,"[{'sequence_identity': 0.79, 'evalue': 9.01e-9...",2QXX_1,1.0,sequence
1,29219,302.0,0.992308,"[{'sequence_identity': 0.784, 'evalue': 3.181e...",4A6A_1,0.992308,sequence
2,29219,263.0,0.842308,"[{'sequence_identity': 0.788, 'evalue': 1.151e...",2QLP_1,0.842308,sequence
3,29219,139.0,0.365385,"[{'sequence_identity': 0.431, 'evalue': 1.372e...",1XS1_1,0.365385,sequence
4,29219,138.0,0.361538,"[{'sequence_identity': 0.426, 'evalue': 3.52e-...",2V9X_1,0.361538,sequence
5,29219,137.0,0.357692,"[{'sequence_identity': 0.426, 'evalue': 9.026e...",1XS4_1,0.357692,sequence
6,29219,137.0,0.357692,"[{'sequence_identity': 0.426, 'evalue': 9.026e...",1XS6_1,0.357692,sequence
7,29219,137.0,0.357692,"[{'sequence_identity': 0.426, 'evalue': 9.026e...",2J4Q_1,0.357692,sequence
8,29219,135.0,0.35,"[{'sequence_identity': 0.426, 'evalue': 3.168e...",2J4H_1,0.35,sequence
9,29219,96.0,0.2,"[{'sequence_identity': 0.338, 'evalue': 6.595e...",4XJC_1,0.2,sequence


- Create a component for the PDB query. Where should it go? Create a separate component.
- Check for motifs and domains