In [11]:
import shutil
import os
import re
from pybeerxml.parser import Parser

In [9]:
from yeast_db import build_yeast_dicts
yeast_name_to_id, brand_to_ids, id_to_yeast_names = build_yeast_dicts()

In [12]:
parser = Parser()
valid_file_count = 0

def clean_replace(s, target):
    return s.replace(target, '').replace("  ", " ").strip()

for dirpath, dirnames, files in os.walk('./original_data'):    
    for file_name in files:
        file_path = str(os.path.join(dirpath, file_name))
        try:
            recipes = parser.parse(file_path)
        except:
            print("Erroneous file found, removing: " + file_path)
            os.remove(file_path)
            continue
        
        recipe_valid = False
        for recipe in recipes:
            for y in recipe.yeasts:
                if not isinstance(y.name, str):
                    # Try turning the name into a code (likely a wyeast strain)...
                    yeast_name = str(int(y.name))
                else:
                    yeast_name = y.name.lower()
                    # Some basic clean-up to start...
                    if "conan" in yeast_name: yeast_name = "vermont ale"
                    elif "super high gravity" in yeast_name or "wlp 099" in yeast_name: yeast_name = "super high gravity ale"
                    elif "super yeast" in yeast_name: yeast_name = "san diego super"
                    elif "chico" in yeast_name: yeast_name = "chico ale"
                    elif "orval" in yeast_name: yeast_name = "brettanomyces bruxellensis"
                    elif "duvel" in yeast_name: yeast_name = "belgian golden ale"
                    elif "dupont" in yeast_name: yeast_name = "french saison ale"
                    else:
                        # Clean up any spelling mistakes, ordering
                        yeast_name = yeast_name.replace("kรถlsch","kolsch").replace("kรถlsh","kolsch").replace("kölsh","kolsch")
                        yeast_name = yeast_name.replace("monastery","monastary").replace("monestary","monastary")
                        yeast_name = yeast_name.replace("california v ale", "california ale v")
                        yeast_name = yeast_name.replace("mã©lange","melange").replace("brettâ€™","brett") \
                            .replace("cã´te","cote").replace("munuch","munich").replace("lellemand","lallemand") \
                            .replace("champagene", "champagne").replace("vemont","vermont")

                if yeast_name not in yeast_name_to_id:
                    # First attempt: Try to find a yeast code to match
                    s = re.search(r"(inis\-|wlp|us\-|([kwst]|oyl|bry)\-|[mg]|\d+/)\s?\-?(\d+(/\d+)?)", yeast_name, flags=re.IGNORECASE)
                    if s != None and len(s.group()) > 0:
                        # We have a potential yeast product id...
                        product_id = s.group().replace("--","-").replace(" ","")
                        if product_id not in yeast_name_to_id:
                            pass
                            #print(f"Product id {product_id} not found for {yeast_name}, file: {file_path}")
                        else:
                            continue

                    # Try removing the word "yeast" (and watch out for any double spaces that may ensue)
                    if clean_replace(yeast_name, "yeast") in yeast_name_to_id:
                        continue
                    # Try replacing "yeast" with "ale" or "lager"
                    if yeast_name.replace("yeast", "ale") in yeast_name_to_id or yeast_name.replace("yeast", "lager") in yeast_name_to_id:
                        continue
                    # Try adding "ale" or "blend" or "lager" to the end
                    if yeast_name+" ale" in yeast_name_to_id or yeast_name+" lager" in yeast_name_to_id or yeast_name+" blend" in yeast_name_to_id:
                        continue

                    # Check for a wyeast code (4 digit code)
                    s = re.search(r"(\d{4})(\D+|$)", yeast_name, flags=re.IGNORECASE)
                    if s != None and len(s.group(1)) > 0:
                        product_id = s.group(1)
                        if product_id in yeast_name_to_id:
                            continue

                    # Try to match the brand name first and then try to figure out
                    # which yeast we're dealing with within that brand 
                    for brand_name, ids in brand_to_ids.items():
                        s = re.search(brand_name, yeast_name, flags=re.IGNORECASE)
                        if s != None and len(s.group()) > 0:
                            # Find the best possible match
                            best_match_count = 0
                            best_match_id = -1
                            for id in ids:
                                count = 0
                                potential_yeast_names = id_to_yeast_names[id]
                                for name in potential_yeast_names:
                                    s = re.search(name, yeast_name, flags=re.IGNORECASE)
                                    if s != None: count += 1
                                if count > best_match_count:
                                    best_match_count = count
                                    best_match_id = id
                            if best_match_id == -1:
                                # Just choose the first id...
                                assert len(ids) > 0
                                best_match_id = ids[0]
                                yeast_name = id_to_yeast_names[best_match_id][0]
                            else:
                                pass
                                #print(f"Found a brand-based match for '{yeast_name}', with brand '{brand_name}', and names: {id_to_yeast_names[best_match_id]}")

                            break


                    # Last attempt - try to match any of the ids directly with the yeast name string
                    found = False
                    best_len = 0
                    best_name = ""
                    for dict_name in yeast_name_to_id:
                        name_opts = f"({dict_name}|{clean_replace(dict_name, 'ale')}|{clean_replace(dict_name,'lager')})"
                        s = re.search(name_opts, yeast_name)
                        if s != None and len(s.group()) > 0:
                            group_len = len(s.group())
                            if best_len < group_len:
                                best_len = group_len
                                best_name = dict_name

                    if best_len == 0:
                        if yeast_name == '- -' or yeast_name == 'default - - -' or '?' in yeast_name or yeast_name == "deleteme":
                            print(f"Moving file with empty/unknown (e.g., '- -') yeast: {file_path}")
                            shutil.copy(file_path, f"./data/_not_matched/{file_name}")
                        else:
                            print(f"No matches for '{yeast_name}', file: {file_path}")
                        break
                    else:
                        recipe_valid = True
                        #print(f"Matched '{yeast_name}' with '{best_name}', file: {file_path}")
        if recipe_valid:
            valid_file_count += 1
            
print(f"Valid files found: {valid_file_count}")

Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/508856.xml
No matches for 'bock lager ', file: ./original_data/135/bt_22894.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/318856.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/386892.xml
No matches for 'bluestone bsy-l006 stuttgart larger yeast', file: ./original_data/135/366233.xml
No matches for '1/4 gallon yeast', file: ./original_data/135/9541.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/564316.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/567069.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/547811.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/334482.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/559190.xml
Deleting file with empty/unknown (e.g., '- -') yeast: ./original_data/135/291763.xml