In [None]:
# I searched the NCBI Assembly database for "green algae" on August 6, 2020.
# The results were exported, parsed, and saved in a dataframe.

import xml.etree.ElementTree as ET
import pandas as pd

filename = "assembly_chloroplast_results.xml"
xmlTree = ET.parse(filename)

# Get the root element in the XML file.
rootElement = xmlTree.getroot()


def _child_text(summary, tag):
    """Return the text of the first direct child of `summary` named `tag`.

    Returns None when no such child exists (or when it has no text), so the
    caller always gets exactly one value per field and per record.
    """
    node = summary.find(tag)
    return None if node is None else node.text


# Parse file to extract relevant information.
# One list per field. Every list receives exactly one entry per DocumentSummary,
# so position i in each list always refers to the same record and the columns
# of the resulting dataframe stay aligned.
organism = []
assembly_type = []
assembly_status = []
coverage = []
submitter = []
contigN50 = []
scaffoldN50 = []
assembly_accession = []
submission_date = []
refseq_category = []

for element in rootElement.findall("DocumentSummary"):
    organism.append(_child_text(element, 'Organism'))
    assembly_type.append(_child_text(element, 'AssemblyType'))
    assembly_status.append(_child_text(element, 'AssemblyStatus'))
    coverage.append(_child_text(element, 'Coverage'))
    submitter.append(_child_text(element, 'SubmitterOrganization'))
    contigN50.append(_child_text(element, 'ContigN50'))
    scaffoldN50.append(_child_text(element, 'ScaffoldN50'))
    assembly_accession.append(_child_text(element, 'AssemblyAccession'))
    submission_date.append(_child_text(element, 'SubmissionDate'))
    refseq_category.append(_child_text(element, 'RefSeq_category'))

# Build the dataframe from the equally long lists; the dict's insertion order
# defines the column order.
green_genome = pd.DataFrame({
    'AssemblyAccession': assembly_accession,
    'Organism': organism,
    'Submitter': submitter,
    'SubmissionDate': submission_date,
    'AssemblyStatus': assembly_status,
    'AssemblyType': assembly_type,
    'Coverage': coverage,
    'ContigN50': contigN50,
    'ScaffoldN50': scaffoldN50,
    'RefSeq_category': refseq_category,
})

In [9]:
import pandas as pd

# Load the genome table; row 0 of the CSV supplies the column names.
algae = pd.read_csv('ten_recent_genomes.csv', header=0)

# Make sure both measurement columns are numeric. errors="coerce" turns cells that
# cannot be parsed into NaN; errors="ignore" is deprecated since pandas 2.2 and would
# leave an unparseable column as text, which breaks the division below.
numeric_columns = ['Genome coverage', 'Total_seq_length']
algae[numeric_columns] = algae[numeric_columns].apply(pd.to_numeric, errors="coerce")

# Genome size is Total_seq_length divided by one million.
algae['Genome size'] = algae['Total_seq_length'] / 1000000

# Note: only the last expression of a cell is displayed. To inspect a subset such as
# algae[algae['Genome size'] < 2], put it last in its own cell or wrap it in display().

# View data
algae

Assembly name                  object
Organism name                  object
Isolate                        object
Taxid                          object
BioSample                      object
BioProject                     object
Submitter                      object
Date                           object
Assembly type                  object
Release type                   object
Assembly level                 object
Genome representation          object
WGS project                    object
Assembly method                object
Expected final version         object
Genome coverage               float64
Sequencing technology          object
RefSeq category                object
GenBank assembly accession     object
Total_seq_length                int64
Genome size                   float64
dtype: object

In [11]:
import plotly.express as px

# Scatter plot of genome size (x) against genome coverage (y), both on log axes.
# Points are coloured by assembly level; hovering shows the assembly name, date
# and sequencing technology.
fig = px.scatter(
    algae,
    x='Genome size',
    y="Genome coverage",
    color="Assembly level",
    hover_data=["Assembly name", "Date", "Sequencing technology"],
    log_x=True,
    log_y=True,
)
fig.show()