Playground notebook for using BeautifulSoup to parse ONT run .html report files

Uses
- [Beautiful Soup](https://realpython.com/beautiful-soup-web-scraper-python/#step-3-parse-html-code-with-beautiful-soup)
- `re` (Python's standard-library regular-expression module)

In [1]:
from bs4 import BeautifulSoup as bs
import re
from pandas import DataFrame

# Parse the report inside a context manager so the file handle is closed
# as soon as BeautifulSoup has read the markup, instead of being leaked.
with open("run_reports/report.html", "r") as report_file:
    soup = bs(report_file, "html.parser")

In [None]:
def scrape_stats(soup, class_name, result_index = 0):
    ''' Search bs4 object for HTML class occurrence and return list of strings

    Looks up every ``<div>`` whose class matches ``class_name``, picks the
    match at ``result_index`` and returns its text as a list of lines.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a BeautifulSoup instance)
    class_name : str
        Value passed as ``class_`` to ``soup.find_all``.
    result_index : int, default 0
        Position of the wanted match; negative values count from the end.

    Returns
    -------
    list of str
        The element's text split on newlines, each line stripped of
        surrounding whitespace, with empty lines dropped.

    Raises
    ------
    IndexError
        If there is no match at ``result_index``. The message names the
        class and index so a changed report layout is easy to diagnose.
    '''
    matches = soup.find_all("div", class_=class_name)
    try:
        element = matches[result_index]
    except IndexError:
        # Same exception type as plain list indexing, but with enough context
        # to tell which selector failed.
        raise IndexError(
            f'No <div class="{class_name}"> at index {result_index} '
            f"({len(matches)} match(es) found)"
        ) from None

    stripped_lines = (line.strip() for line in element.text.split("\n"))
    return [line for line in stripped_lines if line]

In [None]:
# Gather every relevant report line into one flat list via scrape_stats

total = [
    # Headers followed by pairs
    *scrape_stats(soup, "accordion content", 0),  # Remove "written to disk" and shuffle some lines
    *scrape_stats(soup, "accordion content", 1),

    # Values w/o keys
    *scrape_stats(soup, "run-details")[0].split(" · "),

    # Key : Value
    *scrape_stats(soup, "protocol-run-id"),
]

In [1]:
# MANUALLY curate list

# Work on a copy so that re-running this cell starts again from the freshly
# scraped `total` instead of failing on lines that were already removed.
curated = list(total)

# Remove lines that are headers or superfluous
to_remove = ['DATA OUTPUT',
             'Data written to disk',
             'BASECALLING',
             'Pass',
             'Fail',
             'RUN DURATION',
             'RUN SETUP',
             'RUN SETTINGS',
             'DATA OUTPUT SETTINGS',
             'SOFTWARE VERSIONS']

for header in to_remove:
    # Removes the first occurrence only; raises ValueError if the line is absent.
    curated.remove(header)

# Manually re-shuffle lines pertaining to Q score threshold and pass/fail
# Raw string avoids invalid-escape warnings; the group captures just the digits.
q_score_match = re.search(r"min Q score: (\d+)", "".join(curated))
if q_score_match is None:
    raise ValueError("Could not find 'min Q score: <number>' in the scraped report lines")
q_score = q_score_match.group(1)

# Turn  [..., 'Bases called (min Q score: N)', <value>, <value>, ...]
# into  [..., 'Q score', N, 'Bases passed', <value>, 'Bases failed', <value>, ...]
# NOTE(review): assumes the two lines after the header are the pass and fail
# values, in that order - confirm against the report layout.
idx = curated.index(f'Bases called (min Q score: {q_score})')
curated[idx] = 'Bases passed'
curated.insert(idx + 2, 'Bases failed')
curated.insert(idx, "Q score")
curated.insert(idx + 1, q_score)

## Curate based on how the key-value pairs in the list are laid out:
##   two_lines    - alternating key line / value line
##   missing_keys - four bare values whose keys are supplied by `keys` below
##   one_line     - a single "key: value" string
two_lines, missing_keys, one_line = curated[:-5], curated[-5:-1], curated[-1]

data = dict(zip(two_lines[::2], two_lines[1::2]))

keys = ["Run duration",
        "Experiment name",
        "Sample name",
        "Instrument position"]
data.update(zip(keys, missing_keys))

# Split once, on the first separator only, so a value that itself contains
# ": " is kept whole.
last_key, last_value = one_line.split(": ", 1)
data[last_key] = last_value

In [None]:
# Wrap each value in a one-element list so the frame gets exactly one row
dd = {column: [value] for column, value in data.items()}
df = DataFrame(dd)

# Show the row keyed by its protocol run ID (the result is displayed, not stored)
df.set_index("Protocol run ID")