This notebook parses raw HTML tables of common food-additive information into a usable JSON object that maps each chemical name to something more understandable (e.g. colour, sweetener).

In [71]:
import re
import pprint
import json

In [68]:
# clean up data
# Markup fragments that carry no meaning for the final mapping and are dropped.
to_rm = ["<strong>", "</strong>", "<em>", "</em>", "&nbsp;"]
def clean(s: str) -> str:
    """Normalise the text of one table cell.

    Every fragment listed in ``to_rm`` is removed, ``<br>`` is turned into a
    single space, leading/trailing whitespace is trimmed and the result is
    lower-cased.

    Args:
        s: raw cell text, possibly containing inline HTML markup.

    Returns:
        The cleaned, trimmed, lower-case text.
    """
    for x in to_rm:
        s = s.replace(x, "")
    s = s.replace("<br>", " ")
    # str.strip() returns a new string, so its result has to be used;
    # calling it as a bare statement leaves ``s`` untouched.
    return s.strip().lower()

def remove_number(s: str) -> str:
    """Drop a leading numbering token (such as ``"12."``) from ``s``.

    The text before the first space is treated as a numbering token when
    everything except its last character parses as an integer; in that case
    only the text after that first space is returned. Otherwise ``s`` is
    returned unchanged.

    Args:
        s: cell text that may start with a numbering token.

    Returns:
        ``s`` without the leading numbering token, or ``s`` itself when the
        first token does not look like a number.
    """
    first, _, rest = s.partition(" ")
    try:
        # The last character is dropped before parsing, so a token like
        # "12." is checked as "12".
        int(first[:-1])
    except ValueError:
        # Only a failed integer parse means "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return s
    return rest
    
# we just want something short and simple; take the first part of the description
def take_first_item(s: str) -> str:
    """Return the part of ``s`` that precedes the first ``", "`` separator.

    When ``s`` contains no ``", "`` the whole string is returned.
    """
    head, _, _ = s.partition(", ")
    return head

def parse(filename: str, from_col: str, to_col: str) -> dict:
    """Build a mapping between two columns of an HTML table stored in a file.

    The first ``<tr>`` row is treated as the header. ``from_col`` and
    ``to_col`` are matched against the header cell texts after ``<strong>``
    tags are removed and ``<br>`` is replaced by a space. For every later
    row, the ``from_col`` cell (numbering removed, cleaned) becomes a key and
    the first comma-separated item of the ``to_col`` cell (cleaned) becomes
    its value. The resulting mapping is pretty-printed before being returned.

    Rows whose last cell is ``"&nbsp;"`` are skipped, as are rows that do not
    contain enough ``<td>`` cells to hold both requested columns.

    Args:
        filename: path of the text file containing the raw HTML table.
        from_col: header text of the column that provides the keys.
        to_col: header text of the column that provides the values.

    Returns:
        A dict mapping cleaned key text to cleaned value text. When a key
        occurs in several rows, the last row wins.

    Raises:
        ValueError: if ``from_col`` or ``to_col`` is not a header cell.
    """
    # Only the read needs the file handle; parsing happens after it is closed.
    with open(filename, "r") as file:
        html = file.read()

    # Raw strings keep the patterns free of invalid escape sequences.
    rows = re.findall(r"<tr>(.*?)</tr>", html)

    mapping = {}
    from_ind = None
    to_ind = None

    for row_no, row in enumerate(rows):
        cells = re.findall(r"<td>(.*?)</td>", row)

        if row_no == 0:
            # Header texts keep their case because they are compared against
            # the caller-supplied column names.
            header = [
                cell.replace("<strong>", "").replace("</strong>", "").replace("<br>", " ")
                for cell in cells
            ]
            from_ind = header.index(from_col)
            to_ind = header.index(to_col)
            continue

        # Rows without enough plain <td> cells cannot provide both columns;
        # indexing them would raise IndexError.
        if len(cells) <= max(from_ind, to_ind):
            continue
        if cells[-1] == "&nbsp;":
            continue

        chemical_name = clean(remove_number(cells[from_ind]))
        chemical_simplified = clean(take_first_item(cells[to_ind]))
        mapping[chemical_name] = chemical_simplified

    pprint.pprint(mapping)
    return mapping

In [69]:
# d = parse("raw_html_1.txt", "Functional Classes", "Technological Purpose")
# d = d | parse("raw_html_2.txt", "Functional Classes", "Technological Purpose")

In [70]:
# Map each additive name in raw_html_3.txt to the first entry of its "Purpose" cell.
d = parse(
    filename="raw_html_3.txt",
    from_col="Name of Food Additive",
    to_col="Purpose",
)

{'acesulfame potassium': 'sweetener',
 'acetic acid, glacial': 'preservative',
 'acetic and fatty acid esters of glycerol': 'stabilizer',
 'acetone peroxide': 'flour treatment agent',
 'acetylated distarch adipate': 'thickener',
 'acetylated distarch phosphate': 'thickener',
 'acetylated oxidized starch': 'thickener',
 'acid-treated starch': 'thickener',
 'adipic acid': 'acidity regulator',
 'advantame': 'flavour enhancer',
 'agar': 'thickener',
 'alanine, dl-': 'flavour enhancer',
 'alginic acid': 'thickener',
 'alitame': 'sweetener',
 'alkaline treated starch': 'thickener',
 'alkanet': 'colour',
 'allura red ac': 'colour',
 'alpha-amylase from aspergillus oryzae var.': 'flour treatment agent',
 'alpha-amylase from bacillus megaterium expressed in bacillus subtilis': 'flour '
                                                                          'treatment '
                                                                          'agent',
 'alpha-amylase from bacillus stearothermo

In [None]:
# Write the parsed mapping `d` to chemicals.json.
with open("chemicals.json", "w") as json_file:
    json.dump(d, json_file)