In [1]:
import pandas as pd
from IPython.display import IFrame

In [2]:
# Page to scrape: an archived LPSN page (the fetched document's title reads
# "Classification of bacteria").
# Alternative source, probably more official but not verified:
# url = "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Tree&id=2&lvl=3&srchmode=1&keep=1&unlock"
url = "https://lpsn.dsmz.de/archive/-classifphyla.html"

In [4]:
# Embed the live page inside the notebook so its layout can be inspected visually.
IFrame(url, width=1200, height=400)

In [5]:
import requests

In [6]:
# Download the page.
# - timeout: without it, requests waits indefinitely on a stalled connection
#   and the cell never returns.
# - raise_for_status(): stop right here on an HTTP error (4xx/5xx) instead of
#   handing an error page to the parsing code as if it were the real document.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
print(html[:300])  # peek at the first 300 characters to confirm we received HTML

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>
      Classification of bacteria - sponsored by Ribocon GmbH
    </title>
    <meta name="keyword


In [7]:
from bs4 import BeautifulSoup

In [8]:
# Parse the downloaded HTML with an explicitly named parser. When no parser is
# given, BeautifulSoup picks whichever one happens to be installed (and emits a
# warning), so the resulting tree can differ from one machine to the next.
# "html.parser" ships with Python, so no extra dependency is needed.
soup = BeautifulSoup(html, "html.parser")
soup;  # now the html is in a searchable "soup" object! (trailing ';' hides the huge repr)

In [9]:
# Collect every <span> element in the document. In the preview below, each span's
# class names a taxonomic rank (e.g. "phylumlevel", "genuslevel") and its text
# holds the taxon name.
spans = soup.find_all('span')  # These are the entries!
spans[:10]

[<span class="domainlevel">Domain "Archaea"</span>,
 <span class="phylumlevel">Phylum "Crenarchaeota"</span>,
 <span class="classlevel">Class Thermoprotei</span>,
 <span class="orderlevel">Order Acidilobales</span>,
 <span class="familylevel">Family Acidilobaceae</span>,
 <span class="genuslevel">Acidilobus</span>,
 <span class="familylevel">Family Caldisphaeraceae</span>,
 <span class="genuslevel">Caldisphaera</span>,
 <span class="orderlevel">Order Cenarchaeales</span>,
 <span class="familylevel">Family "Cenarchaeaceae"</span>]

In [10]:
# Turn each <span> into a (level, name) tuple, where level is the span's first
# CSS class (e.g. 'genuslevel') and name is its visible text.
span_entries = []
for span in spans:
    name = span.text

    # Reject spans that say "unassigned"
    if 'unassigned' in name:
        continue

    # The rank is read from the class attribute. A span with no (or an empty)
    # class carries no rank information, so skip it instead of letting
    # span['class'][0] raise and abort the whole extraction.
    css_classes = span.get('class')
    if not css_classes:
        continue

    span_entries.append((css_classes[0], name))

span_entries[:15]

[('domainlevel', 'Domain "Archaea"'),
 ('phylumlevel', 'Phylum "Crenarchaeota"'),
 ('classlevel', 'Class Thermoprotei'),
 ('orderlevel', 'Order Acidilobales'),
 ('familylevel', 'Family Acidilobaceae'),
 ('genuslevel', 'Acidilobus'),
 ('familylevel', 'Family Caldisphaeraceae'),
 ('genuslevel', 'Caldisphaera'),
 ('orderlevel', 'Order Cenarchaeales'),
 ('familylevel', 'Family "Cenarchaeaceae"'),
 ('genuslevel', '"Cenarchaeum"'),
 ('orderlevel', 'Order Desulfurococcales'),
 ('familylevel', 'Family Desulfurococcaceae'),
 ('genuslevel', 'Aeropyrum'),
 ('genuslevel', 'Desulfurococcus')]

In [11]:
# Normalise every (level, name) pair:
#   * level: drop the trailing 5 characters (the "level" suffix), so
#     'phylumlevel' becomes 'phylum';
#   * name: keep only the text after the last space, then strip double quotes,
#     so 'Phylum "Crenarchaeota"' becomes 'Crenarchaeota'.
clean_entries = [
    (raw_level[:-len("level")], raw_name.rsplit(" ", 1)[-1].replace('"', ''))
    for raw_level, raw_name in span_entries
]

clean_entries[:15]

[('domain', 'Archaea'),
 ('phylum', 'Crenarchaeota'),
 ('class', 'Thermoprotei'),
 ('order', 'Acidilobales'),
 ('family', 'Acidilobaceae'),
 ('genus', 'Acidilobus'),
 ('family', 'Caldisphaeraceae'),
 ('genus', 'Caldisphaera'),
 ('order', 'Cenarchaeales'),
 ('family', 'Cenarchaeaceae'),
 ('genus', 'Cenarchaeum'),
 ('order', 'Desulfurococcales'),
 ('family', 'Desulfurococcaceae'),
 ('genus', 'Aeropyrum'),
 ('genus', 'Desulfurococcus')]

In [12]:
# Ranks ordered from broadest to most specific; this order drives the reset logic below.
levels = ['domain', 'phylum', 'class', 'subclass', 'order', 'suborder', 'family', 'genus']

# Walk the flat (rank, taxon) list in page order while maintaining a running
# record of the current lineage. Each genus encountered emits a snapshot of
# that record, giving one fully qualified row per genus.
entries = []  # finished genus records
entry = {}    # running lineage, mutated as the walk proceeds
for rank, taxon in clean_entries:
    # A new taxon at this rank invalidates this rank and every rank below it:
    # blank them all, then fill in the current rank.
    depth = levels.index(rank)
    entry.update(dict.fromkeys(levels[depth:], ''))
    entry[rank] = taxon

    # Only genera produce output rows; snapshot so later mutation of `entry`
    # does not alter rows already recorded.
    if rank == 'genus':
        entries.append(dict(entry))

for record in entries[:15]:
    print(record)

{'domain': 'Archaea', 'phylum': 'Crenarchaeota', 'class': 'Thermoprotei', 'subclass': '', 'order': 'Acidilobales', 'suborder': '', 'family': 'Acidilobaceae', 'genus': 'Acidilobus'}
{'domain': 'Archaea', 'phylum': 'Crenarchaeota', 'class': 'Thermoprotei', 'subclass': '', 'order': 'Acidilobales', 'suborder': '', 'family': 'Caldisphaeraceae', 'genus': 'Caldisphaera'}
{'domain': 'Archaea', 'phylum': 'Crenarchaeota', 'class': 'Thermoprotei', 'subclass': '', 'order': 'Cenarchaeales', 'suborder': '', 'family': 'Cenarchaeaceae', 'genus': 'Cenarchaeum'}
{'domain': 'Archaea', 'phylum': 'Crenarchaeota', 'class': 'Thermoprotei', 'subclass': '', 'order': 'Desulfurococcales', 'suborder': '', 'family': 'Desulfurococcaceae', 'genus': 'Aeropyrum'}
{'domain': 'Archaea', 'phylum': 'Crenarchaeota', 'class': 'Thermoprotei', 'subclass': '', 'order': 'Desulfurococcales', 'suborder': '', 'family': 'Desulfurococcaceae', 'genus': 'Desulfurococcus'}
{'domain': 'Archaea', 'phylum': 'Crenarchaeota', 'class': 'Ther

In [13]:
# Tabulate the genus records: one row per record, one column per dictionary key.
df = pd.DataFrame(data=entries)
df.head(15)  # preview the first 15 rows

Unnamed: 0,domain,phylum,class,subclass,order,suborder,family,genus
0,Archaea,Crenarchaeota,Thermoprotei,,Acidilobales,,Acidilobaceae,Acidilobus
1,Archaea,Crenarchaeota,Thermoprotei,,Acidilobales,,Caldisphaeraceae,Caldisphaera
2,Archaea,Crenarchaeota,Thermoprotei,,Cenarchaeales,,Cenarchaeaceae,Cenarchaeum
3,Archaea,Crenarchaeota,Thermoprotei,,Desulfurococcales,,Desulfurococcaceae,Aeropyrum
4,Archaea,Crenarchaeota,Thermoprotei,,Desulfurococcales,,Desulfurococcaceae,Desulfurococcus
5,Archaea,Crenarchaeota,Thermoprotei,,Desulfurococcales,,Desulfurococcaceae,Ignicoccus
6,Archaea,Crenarchaeota,Thermoprotei,,Desulfurococcales,,Desulfurococcaceae,Ignisphaera
7,Archaea,Crenarchaeota,Thermoprotei,,Desulfurococcales,,Desulfurococcaceae,Staphylothermus
8,Archaea,Crenarchaeota,Thermoprotei,,Desulfurococcales,,Desulfurococcaceae,Stetteria
9,Archaea,Crenarchaeota,Thermoprotei,,Desulfurococcales,,Desulfurococcaceae,Sulfophobococcus


In [18]:
# Write the table to an Excel workbook; index=False omits the row-number column.
# NOTE(review): the file name has a space after the underscore — confirm that is intended.
output_path = 'Full_ Taxonomie.xlsx'
df.to_excel(output_path, index=False)
