In [8]:
# Jupyter Notebook Scratch Work for PyBLAST Protein Search
import requests
import xml.etree.ElementTree as ET

In [9]:
# UniProt accession to look up. Author's note (not verified here): the look-alike
# accession Q5CYV9 is "General transcription and DNA repair factor IIH helicase
# subunit XPD" -- mind the transposed letters.
PROTEIN = "Q5YCV9"
# Local file name (and URL suffix) of the XML record for that accession.
PROTEIN_XML = f"{PROTEIN}.xml"

In [10]:
# Download the UniProt XML record for PROTEIN and cache it on disk as PROTEIN_XML.
# NOTE(review): this is the older www.uniprot.org/uniprot/<accession>.xml URL layout;
# confirm it still resolves (requests follows redirects for GET by default).
url = "https://www.uniprot.org/uniprot/" + PROTEIN_XML  # Protein Sequence = Q5YCV9

# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx so an HTML error page is never saved as if it were the XML.
resp.raise_for_status()

# Save the xml file (raw response bytes, written unchanged):
with open(PROTEIN_XML, "wb") as f:
    f.write(resp.content)

In [11]:
# Parse the XML file saved on disk into an ElementTree object,
# then echo it so the cell output confirms the parse succeeded.
tree = ET.parse(PROTEIN_XML)
tree

<xml.etree.ElementTree.ElementTree at 0x7fbaa3f2a810>

In [12]:
# Get the root element of the parsed tree; every later lookup starts from `root`.
root = tree.getroot()
root

<Element '{http://uniprot.org/uniprot}uniprot' at 0x7fbaa3f580b0>

In [22]:
# Show the root's tag. ElementTree reports it namespace-qualified as "{uri}localname",
# so tag lookups elsewhere must include the same "{...}" prefix.
root.tag

'{http://uniprot.org/uniprot}uniprot'

In [24]:
# Show the attributes set on the root element (a dict of name -> value).
root.attrib

{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd'}

In [25]:
# Get children of root: iterating an Element yields only its direct children,
# so this prints one line (tag + attribute dict) per top-level child.
for child in root:
    print(child.tag, child.attrib)

{http://uniprot.org/uniprot}entry {'dataset': 'Swiss-Prot', 'created': '2005-02-15', 'modified': '2021-06-02', 'version': '85'}
{http://uniprot.org/uniprot}copyright {}


In [26]:
# Get full tree structure: root.iter() with no argument walks the root and all of its
# descendants, so this lists the tag of every element in the document.
# Note the result can be long; the same tag appears once per matching element.
[elem.tag for elem in root.iter()]

['{http://uniprot.org/uniprot}uniprot',
 '{http://uniprot.org/uniprot}entry',
 '{http://uniprot.org/uniprot}accession',
 '{http://uniprot.org/uniprot}name',
 '{http://uniprot.org/uniprot}protein',
 '{http://uniprot.org/uniprot}recommendedName',
 '{http://uniprot.org/uniprot}fullName',
 '{http://uniprot.org/uniprot}gene',
 '{http://uniprot.org/uniprot}name',
 '{http://uniprot.org/uniprot}organism',
 '{http://uniprot.org/uniprot}name',
 '{http://uniprot.org/uniprot}name',
 '{http://uniprot.org/uniprot}name',
 '{http://uniprot.org/uniprot}dbReference',
 '{http://uniprot.org/uniprot}lineage',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/uniprot}taxon',
 '{http://uniprot.org/un

In [41]:
# Print the attribute dict of every <property> element anywhere under the root.
# The tag passed to iter() must carry the UniProt namespace in "{uri}name" form.
for elem in root.iter('{http://uniprot.org/uniprot}property'):
    print(elem.attrib)

{'type': 'protein sequence ID', 'value': 'AAS91775.2'}
{'type': 'molecule type', 'value': 'Genomic_DNA'}
{'type': 'protein sequence ID', 'value': 'AAS91775.2'}
{'type': 'status', 'value': 'JOINED'}
{'type': 'molecule type', 'value': 'Genomic_DNA'}
{'type': 'protein sequence ID', 'value': 'AAS91775.2'}
{'type': 'status', 'value': 'JOINED'}
{'type': 'molecule type', 'value': 'Genomic_DNA'}
{'type': 'protein sequence ID', 'value': 'AAS91775.2'}
{'type': 'status', 'value': 'JOINED'}
{'type': 'molecule type', 'value': 'Genomic_DNA'}
{'type': 'protein sequence ID', 'value': 'AAS91775.2'}
{'type': 'status', 'value': 'JOINED'}
{'type': 'molecule type', 'value': 'Genomic_DNA'}
{'type': 'protein sequence ID', 'value': 'AAS91775.2'}
{'type': 'status', 'value': 'JOINED'}
{'type': 'molecule type', 'value': 'Genomic_DNA'}
{'type': 'protein sequence ID', 'value': 'AAS91775.2'}
{'type': 'status', 'value': 'JOINED'}
{'type': 'molecule type', 'value': 'Genomic_DNA'}
{'type': 'protein sequence ID', 'valu

In [36]:
# Print the text content of every <taxon> element under the root, one per line,
# in the order the elements occur in the document.
for elem in root.iter('{http://uniprot.org/uniprot}taxon'):
    print(elem.text)

Eukaryota
Metazoa
Chordata
Craniata
Vertebrata
Euteleostomi
Mammalia
Eutheria
Euarchontoglires
Primates
Haplorrhini
Catarrhini
Hylobatidae
Hylobates


In [37]:
# Add Everything to a Dictionary
# Maps each namespace-qualified tag to the list of attribute dicts found on elements
# with that tag; elements that carry no attributes are skipped.
# NOTE(review): the target shape of `data` is an assumption -- adjust if a different
# layout (e.g. including element text) was intended.
data = {}
for elem in root.iter():
    # root.iter() already visits every element exactly once, so no nested pass over
    # the tree is needed (and iter() expects a tag string, not an Element).
    if elem.attrib:
        # Copy the attributes so later edits to `data` cannot mutate the tree.
        data.setdefault(elem.tag, []).append(dict(elem.attrib))





In [42]:
import json

In [44]:
import xmltodict

In [45]:
# Parse the cached XML file into nested dicts with xmltodict.
# The file is read as bytes so that decoding is handled by the XML parser rather than
# by the platform's default text encoding, which is not UTF-8 on every system.
with open(PROTEIN_XML, "rb") as xml_file:
    data_dict = xmltodict.parse(xml_file.read())

In [46]:
# Display the parsed structure. In xmltodict's output, keys starting with "@" are XML
# attributes and "#text" holds an element's text when it also has attributes.
data_dict

OrderedDict([('uniprot',
              OrderedDict([('@xmlns', 'http://uniprot.org/uniprot'),
                           ('@xmlns:xsi',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('@xsi:schemaLocation',
                            'http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd'),
                           ('entry',
                            OrderedDict([('@dataset', 'Swiss-Prot'),
                                         ('@created', '2005-02-15'),
                                         ('@modified', '2021-06-02'),
                                         ('@version', '85'),
                                         ('accession', 'Q5YCV9'),
                                         ('name', 'TAU_HYLLA'),
                                         ('protein',
                                          OrderedDict([('recommendedName',
                                                        OrderedDict(

In [49]:
# Serialize the parsed dict to a single JSON string and echo it for inspection.
json_data = json.dumps(data_dict)
json_data

'{"uniprot": {"@xmlns": "http://uniprot.org/uniprot", "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "@xsi:schemaLocation": "http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd", "entry": {"@dataset": "Swiss-Prot", "@created": "2005-02-15", "@modified": "2021-06-02", "@version": "85", "accession": "Q5YCV9", "name": "TAU_HYLLA", "protein": {"recommendedName": {"fullName": "Microtubule-associated protein tau"}}, "gene": {"name": {"@type": "primary", "#text": "MAPT"}}, "organism": {"name": [{"@type": "scientific", "#text": "Hylobates lar"}, {"@type": "common", "#text": "Common gibbon"}, {"@type": "synonym", "#text": "White-handed gibbon"}], "dbReference": {"@type": "NCBI Taxonomy", "@id": "9580"}, "lineage": {"taxon": ["Eukaryota", "Metazoa", "Chordata", "Craniata", "Vertebrata", "Euteleostomi", "Mammalia", "Eutheria", "Euarchontoglires", "Primates", "Haplorrhini", "Catarrhini", "Hylobatidae", "Hylobates"]}}, "reference": {"@key": "1", "citation": {"@typ

In [50]:
# Write the JSON string to data.json in the current working directory,
# overwriting any existing file of that name.
with open("data.json", "w") as json_file:
    json_file.write(json_data)