Playground notebook for using BeautifulSoup to parse ONT run .html report files

Uses
- [Beautiful Soup](https://realpython.com/beautiful-soup-web-scraper-python/#step-3-parse-html-code-with-beautiful-soup)
- re

In [1]:
from bs4 import BeautifulSoup as bs
import re

# Parse the run report into a BeautifulSoup tree. The context manager closes
# the file handle as soon as parsing is done instead of leaking it.
with open("run_reports/report.html", "r") as report_file:
    soup = bs(report_file, "html.parser")

In [None]:
# N50
# Take the text of the first "basecalling-statistics" div and extract the
# first token shaped like "<digits/dots> <one non-digit>b".
q_n50 = soup.find_all("div", class_="basecalling-statistics")[0].text
# Raw strings keep the regex escapes (\D, \d) away from Python's own
# string-escape processing.
p_n50 = re.compile(r'[0-9.]+ \Db')
s_n50 = p_n50.search(q_n50).group()

# Flow cell type, ID and kit
# NOTE(review): the exploratory cell further down lists fc type / fc id / kit
# under "accordion content" index 2 -- confirm that index 0 is intended here.
q_fc_kit = soup.find_all("div", class_="accordion content")[0].text
# Pattern for three non-digit characters followed by five digits.
p_fc_id = re.compile(r"\D\D\D\d\d\d\d\d")

In [None]:
# Exploratory lookups: each expression below evaluates to the text of one
# div of the report, selected by CSS class and position.

# run details
soup.find_all("div", class_="run-details")[0].text

# run id
soup.find_all("div", class_="protocol-run-id")[0].text

# reads pass Gb, reads fail Gb
soup.find_all("div", class_="container basecalling")[0].text

# All remaining lookups target the same class, so query it once and index.
accordion_sections = soup.find_all("div", class_="accordion content")

# est bases Gb, data prod Gb, Reads M, Estimated N50 kb
accordion_sections[0].text

# fc type, fc id, kit type
accordion_sections[2].text

# run length, active channel bool, pore scan freq hrs, bias init, bias final, reserved bool, basecalling method
accordion_sections[3].text

# fast5 op, fast5 rpf, fastq op, etc ... , data location(!)
accordion_sections[4].text

# software versions
accordion_sections[5].text

In [None]:
def scrape_stats(soup, class_name, result_index=0):
    '''
    Return the non-empty text lines of one <div> with the given HTML class.

    All divs whose class matches class_name are looked up in the bs4 object,
    the one at position result_index is selected, and its text is split on
    newlines. Each line is stripped of surrounding whitespace and blank
    lines are dropped.
    '''
    matching_divs = soup.find_all("div", class_=class_name)
    raw_text = matching_divs[result_index].text
    stripped_lines = (line.strip() for line in raw_text.split("\n"))
    return [line for line in stripped_lines if line]

In [2]:

def list2dict_pairs(l):
    '''
    Turn a flat list of alternating keys and values into a dict.

    [key1, val1, key2, val2, ... ] ---> {key1:val1, key2:val2, ... }

    The list must hold an even number of items. If a key occurs more than
    once, the value of its last occurrence is kept.
    '''
    assert len(l) % 2 == 0
    return {l[pos]: l[pos + 1] for pos in range(0, len(l), 2)}

def stats2dict(l):
    '''
    Disentangle the flat list from scrape_stats() into a nested dict.

    Input has to match the format
    [HEADER_A, key1, val1, key2, val2, HEADER_B, ...]
    where a header is a list item made up solely of capital letters and
    spaces, at least seven characters long.

    Output becomes
    --> {HEADER_A: {key1:val1, key2:val2}, HEADER_B: {...}, ...}

    Items in front of the first header are ignored. A header that occurs
    twice keeps only its last section. Every section must hold an even
    number of items, which list2dict_pairs() checks.
    '''
    header_pattern = re.compile("[A-Z ]{7,}")

    # A header has to be a whole list item, so each item is tested with
    # fullmatch(); searching the joined text could pick up substrings that
    # cannot be located in the list afterwards.
    header_positions = [
        pos for pos, item in enumerate(l) if header_pattern.fullmatch(item)
    ]

    # A section stops where the next header starts; the last section runs to
    # the end of the list.
    section_ends = header_positions[1:] + [len(l)]

    d = {}
    for start, end in zip(header_positions, section_ends):
        # Store the pairs under the header that opens the section.
        d[l[start]] = list2dict_pairs(l[start + 1:end])

    return d

In [None]:
### FYI
# Gather the scraped strings from several report sections into one flat list.
total = []

# Headers followed by pairs
# (first section: remove "written to disk" and shuffle some lines)
total.extend(scrape_stats(soup, "accordion content", 0))
total.extend(scrape_stats(soup, "accordion content", 1))

# Values w/o keys
run_details = scrape_stats(soup, "run-details")
total.extend(run_details[0].split(" · "))

# Key : Value
total.extend(scrape_stats(soup, "protocol-run-id"))

In [1]:
# MANUALLY curate list
# Section headers and sub-labels that do not belong to any key/value pair.
to_remove = ['DATA OUTPUT',
             'Data written to disk',
             'BASECALLING',
             'Pass',
             'Fail',
             'RUN DURATION',
             'RUN SETUP',
             'RUN SETTINGS',
             'DATA OUTPUT SETTINGS',
             'SOFTWARE VERSIONS']

# list.remove() drops the first occurrence only and raises ValueError when
# the entry is absent, so a changed report layout fails loudly here.
for entry in to_remove:
    total.remove(entry)

# Locate the "Bases called" label by its prefix, so that any Q-score
# threshold is accepted rather than only 9.
q_label_prefix = 'Bases called (min Q score: '
i = next((pos for pos, item in enumerate(total)
          if item.startswith(q_label_prefix)), None)
if i is None:
    raise ValueError("no '" + q_label_prefix + "...' entry found in total")

# Turn the single "called" label into separate "pass" and "fail" keys.
# The "fail" key goes two positions further on, i.e. behind the entry that
# follows the label -- presumably the pass value, with the fail value next;
# verify against the report layout.
called_label = total[i]
total[i] = called_label.replace('Bases called', 'Bases pass', 1)
total.insert(i + 2, called_label.replace('Bases called', 'Bases fail', 1))

## Curate based on whether key-value pairs in the list are
# - on consecutive lines
# - missing keys
# - concatenated
# NOTE(review): assumes the last five entries are four bare run-detail values
# followed by one "key: value" string -- confirm against the scraped report.
two_lines, missing_keys, one_line = total[:-5], total[-5:-1], total[-1]

# Even positions hold keys, odd positions the matching values.
data = dict(zip(two_lines[::2], two_lines[1::2]))

keys = ["Run duration",
        "Run folder",
        "Sample name",
        "Instrument position"]
data.update(zip(keys, missing_keys))

# Split on the first ": " only, so a value containing ": " stays whole.
one_line_key, one_line_value = one_line.split(": ", 1)
data[one_line_key] = one_line_value