In [1]:
# Jupyter Notebook Scratch Work for PyBLAST Protein Search
import requests
import json
import xmltodict

In [2]:
# UniProt accession to look up, plus the local file names derived from it
PROTEIN = "Q5YCV9"  # Microtubule-associated protein tau
PROTEIN_XML = f"{PROTEIN}.xml"
PROTEIN_JSON = f"{PROTEIN}.json"

In [3]:
# Perform API Call for Protein and write data to XML file
# NOTE(review): this is the legacy www.uniprot.org path; UniProt's current REST
# service appears to live under rest.uniprot.org/uniprotkb/ -- confirm that this
# URL still resolves (via redirect) before relying on it.
url = "https://www.uniprot.org/uniprot/" + PROTEIN_XML  # Protein Sequence = Q5YCV9

# Without a timeout, requests waits indefinitely if the server never answers.
resp = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of saving an error page as the entry XML.
resp.raise_for_status()

# Save the xml file (raw response bytes, so no re-encoding happens on write):
with open(PROTEIN_XML, "wb") as f:
    f.write(resp.content)

In [4]:
# Read Protein XML file and convert to dictionary
# Read the file as bytes rather than text: text mode would decode it with the
# platform's default encoding, which is not guaranteed to match the encoding of
# the XML document. Given bytes, the XML parser determines the encoding itself.
with open(PROTEIN_XML, "rb") as xml_file:
    data_dict = xmltodict.parse(xml_file.read())

data_dict

OrderedDict([('uniprot',
              OrderedDict([('@xmlns', 'http://uniprot.org/uniprot'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xsi:schemaLocation',
                            'http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd'),
                           ('entry',
                            OrderedDict([('@dataset', 'Swiss-Prot'),
                                         ('@created', '2005-02-15'),
                                         ('@modified', '2021-06-02'),
                                         ('@version', '85'),
                                         ('accession', 'Q5YCV9'),
                                         ('name', 'TAU_HYLLA'),
                                         ('protein',
                                          OrderedDict([('recommendedName',
                                                        OrderedDict(

In [5]:
# Serialise the parsed dictionary to a JSON string, then save that string to disk
json_data = json.dumps(data_dict)

with open(PROTEIN_JSON, mode="w") as json_file:
    # end="" keeps print from appending a newline, so the file holds exactly json_data
    print(json_data, end="", file=json_file)

In [9]:
# Load the saved JSON file back into a plain Python dictionary
with open(PROTEIN_JSON) as json_file:
    protein_data = json.loads(json_file.read())

protein_data

{'uniprot': {'@xmlns': 'http://uniprot.org/uniprot',
  '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
  '@xsi:schemaLocation': 'http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd',
  'entry': {'@dataset': 'Swiss-Prot',
   '@created': '2005-02-15',
   '@modified': '2021-06-02',
   '@version': '85',
   'accession': 'Q5YCV9',
   'name': 'TAU_HYLLA',
   'protein': {'recommendedName': {'fullName': 'Microtubule-associated protein tau'}},
   'gene': {'name': {'@type': 'primary', '#text': 'MAPT'}},
   'organism': {'name': [{'@type': 'scientific', '#text': 'Hylobates lar'},
     {'@type': 'common', '#text': 'Common gibbon'},
     {'@type': 'synonym', '#text': 'White-handed gibbon'}],
    'dbReference': {'@type': 'NCBI Taxonomy', '@id': '9580'},
    'lineage': {'taxon': ['Eukaryota',
      'Metazoa',
      'Chordata',
      'Craniata',
      'Vertebrata',
      'Euteleostomi',
      'Mammalia',
      'Eutheria',
      'Euarchontoglires',
      'Primates',
    

In [16]:
# Collect GO terms
# A missing dbReference key is treated as "no cross-references" rather than an error.
db_ref = protein_data["uniprot"]["entry"].get("dbReference") or []
# xmltodict only produces a list when an element has several children with the
# same tag; a lone <dbReference> arrives as a bare dict. Iterating that dict
# would yield its string keys, so wrap it to keep the filter below uniform.
if isinstance(db_ref, dict):
    db_ref = [db_ref]

# Keep only the cross-references whose type attribute marks them as Gene Ontology.
go_terms = [ref for ref in db_ref if ref.get("@type") == "GO"]

go_terms

[{'@type': 'GO',
  '@id': 'GO:0030424',
  'property': [{'@type': 'term', '@value': 'C:axon'},
   {'@type': 'evidence', '@value': 'ECO:0000250'},
   {'@type': 'project', '@value': 'UniProtKB'}]},
 {'@type': 'GO',
  '@id': 'GO:0005737',
  'property': [{'@type': 'term', '@value': 'C:cytoplasm'},
   {'@type': 'evidence', '@value': 'ECO:0000250'},
   {'@type': 'project', '@value': 'UniProtKB'}]},
 {'@type': 'GO',
  '@id': 'GO:0005829',
  'property': [{'@type': 'term', '@value': 'C:cytosol'},
   {'@type': 'evidence', '@value': 'ECO:0000501'},
   {'@type': 'project', '@value': 'UniProtKB-SubCell'}]},
 {'@type': 'GO',
  '@id': 'GO:0030425',
  'property': [{'@type': 'term', '@value': 'C:dendrite'},
   {'@type': 'evidence', '@value': 'ECO:0000250'},
   {'@type': 'project', '@value': 'UniProtKB'}]},
 {'@type': 'GO',
  '@id': 'GO:0030426',
  'property': [{'@type': 'term', '@value': 'C:growth cone'},
   {'@type': 'evidence', '@value': 'ECO:0000250'},
   {'@type': 'project', '@value': 'UniProtKB'}]}

In [17]:
# Pull out the entry's sequence element: a dict holding the sequence attributes
# (length, mass, checksum, ...) with the residue string stored under "#text"
uniprot_entry = protein_data["uniprot"]["entry"]
seq = uniprot_entry["sequence"]
seq

{'@length': '776',
 '@mass': '81013',
 '@checksum': '3D000CC44E78B06D',
 '@modified': '2007-01-23',
 '@version': '4',
 '#text': 'MAEPRQEFDVMEDHAGTYGLGDRKDQGGYTMLQDQEGDTDAGLKESPLQTPAEDGSEEPGSETSDAKSTPTAEDVTAPLVDEGAPXKQAAAQPHTEIPEGTTAEEAGIGDTPSLEDEAAGHVTQEPESGKVVREGFLGEPGPRSXSHQLASGMPGAPLLPEGPREATRQPSGTGPEDTEGGRHAPELLKHQLLGDLHQEGPPLKRAGGKERPGIKEEVDEDRDVDESSPQDSPPSKVSPAHDGRPPQTAAREATSIPGFPAEGAIPLPVDFLSKVSTEIPASEPDGPSAGRAEGQDAPPEFTFHVEITPNVQKEQAHSEEHLGRAAFPGAPGEGPEAQGPSLGEDTKEADLPEPSEKQPAAAPRGKPISRVPQLKARMVSKSKDGTGSDDKKAKTSTRSSAKTLKNRPCLSPKHPTPGSSDPLIQPSSPAVCPEPPSSPKYVSSVTXRTGSSGAKEMKLKGADGKTKIATPRGAAPPGQKGQANATRIPAKTPPAPKTPPSSVTKQVQRRPPPAGPKSERGEPPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPTREPKKVAVVRTPPKSPSSAKSRLQTAPVPMPDLKNVKSKIGSTENLKHQPGGGKVQIINKKLDLSNVQSKCGSKDNIKHVPGGGSVQIVYKPVDLSKVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDRVQSKIGSLDNITHVPGGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPVVSGDTSPRHLSNVSSTGSIDMVDSPQLATLADEVSASLAKQGL'}