In [1]:
import re
import pandas as pd
from cleaning import domain, filter_vvo_tags
from files import list_to_file, file_to_list
from regex import regex_filter, apply_regexs, generate_regex

In [2]:
def category_tags_to_file(terms_list, tag_search_domain, category_file_name):
    regexs = generate_regex(terms_list)
    tags = apply_regexs(regexs, tag_search_domain)
    list_to_file(category_file_name, tags)
    return tags

def print_stats(total, sub, description):
    print(f"{description} {round(100*sub/total, 2)}% or {sub}")
    
    
def tag_categorization_meassure(omniv_set, veg_set, vgn_set, vgn_veg_set, df, column):
    vegan = vegetarian = meat = vegan_and_vegetarian = empty = 0
    total = len(df)
    not_classified_ids = []
    
    for index, meal in df.iterrows():
        
        tag_list = meal[column]
        
        tag_set = set(tag_list)
        if bool(omniv_set & tag_set):
            meat += 1
            continue
            
        if bool(vgn_veg_set & tag_set):
            vegan_and_vegetarian += 1
            continue
            
        if bool(veg_set & tag_set):
            vegetarian += 1
            continue

        if bool(vgn_set & tag_set):
            vegan += 1
            continue
            
        if tag_list == []:
            empty += 1
            
        not_classified_ids.append(index)
        
    
    print(f"Total: {total}")
    print_stats(total, vegan, "Vegan:")
    print_stats(total, vegetarian, "Vegetarian: ")
    print_stats(total, vegan_and_vegetarian, "Vegan & Veg.:")
    print_stats(total, meat, "Meat:")
    print_stats(total, empty, "Empty:")
    print_stats(total, total-(empty+vegetarian+vegan+meat+vegan_and_vegetarian), "Could not be classified: ")

    not_classified = df.loc[not_classified_ids]
    
    return vegan, vegetarian, meat, not_classified



In [3]:
data = pd.read_csv("Data/SH_Canteens/meals_SH_canteens_merged_filtered_.csv", sep="@", infer_datetime_format=True, index_col=0) # german-canteens(filtered).csv
data.index.rename("index", inplace=True)
# evaluates tag_list as list (from string)
data["tags"] = list(map(eval, data["tags"]))
data["meal_name"] = list(map(lambda x : x.replace("\n", " ").lower(), data["meal_name"]))
data["tags"] = list(map(lambda x : [t.lower() for t in x], data["tags"]))

In [28]:
vgn_set = set(file_to_list("Data/tags/vgn_tags.txt"))
veg_set = set(file_to_list("Data/tags/veg_tags.txt"))
meat_set = set(file_to_list("Data/tags/meat_tags.txt"))
vgn_veg_set  = set(file_to_list("Data/tags/vgn_veg_tags.txt"))

#list_to_file("Data/tags/vgn_tags.txt", vgn_set)
#list_to_file("Data/tags/veg_tags.txt", veg_set)
#list_to_file("Data/tags/vgn_veg_tags.txt", vgn_veg_set)
#list_to_file("Data/tags/meat_tags.txt", meat_set)

In [31]:
_,_,_, nc_data = tag_categorization_meassure(vgn_set, veg_set, meat_set, vgn_veg_set, data, "tags")

Total: 700958
Vegan: 5.99% or 41972
Vegetarian:  50.73% or 355586
Vegan & Veg.: 1.16% or 8104
Meat: 15.44% or 108219
Empty: 14.99% or 105084
Could not be classified:  11.7% or 81993


In [34]:
nc_data = data

In [35]:
nc_names = nc_data["meal_name"]
print(len(nc_names))
print(len(set(nc_names)))

23271
4301


In [36]:
veg = apply_regexs([".*vegetarisch.*"], nc_names)
nc_names = set(nc_names)-set(veg)
print(len(veg))

vgn = apply_regexs([".*vegan.*"], nc_names)
print(len(vgn))
nc_names = set(nc_names)-set(vgn)

189
306


In [37]:
terms_meat = ["lachs", "rind", "meat", "beef", "poultry", "pork", "fish", "chicken", "hähnchen", "geflügel", "fisch" ,"krebs", "hack", "tier", "speck", "wild", "fleisch", "schwein", "shrimp", "lamm", "schinken"]

meat_regexs = generate_regex(terms_meat)
meat = apply_regexs(meat_regexs, nc_names )
meat +=  regex_filter(r"\bri\b", nc_names)
meat +=  regex_filter(r"\bsch\b", nc_names)
meat +=  regex_filter(r"\bkr\b", nc_names)
meat +=  regex_filter(r"\bwt\b", nc_names)

print(len(meat))

nc_names = set(nc_names)-set(meat)
meat

1622


['ocean burger - mit kross paniertem seelachs, (glw, fi, mi, sf, se) zitronendip (mi) und pommes frites',
 'seelachsfilet in kräutereihülle, dijon- senf- soße, karotten, kartoffelstampf',
 'seelachs, blattspinat, mozzarella, mandeln, tomaten-vollkornreis currysoße',
 'gebratener seelachs, karotten-kürbis-püree, lauch- mais- salat, sesamöl',
 'seelachsfilet , tomate, zucchini, kräuterkartoffeln, himbeerjoghurt',
 'seelachs im backteig remouladensoße kartoffelspalten',
 'seelachsfilet, kräuter- rührei, dijon- senf- soße, karotten, kräuterkartoffeln (ei, fi, glw, mi, sf)',
 'seelachs gebraten, karotten- kürbis- püree, sesam, kürbiskerne (fi, mi, se)',
 'seelachs im backteig (fi, glw, mi, sf) remouladensoße (3, 8, ei, glw, mi, sf, sl) salzkartoffeln',
 'seelachsfilet im backteig (fi, glw) erbsen-gurken-dip (3, ei, glw, sl) kartoffelspalten',
 'seelachsfilet in ei-hülle dillsoße salzkartoffeln eisbergsalat',
 'seelachsfilet, salbeipesto, (ei, fi, glw) tomaten- sahne [bio]- soße (mi, sl) gno

In [38]:
terms_veg = ["mozarella", "egg", "cheese", "milk", "yoghurt", "pana cotta", "schokolade", "rahmsauce", "honig", "hühnerei", "rahmsoße", "gouda", "vegetarisch", "milch", "eier", "käse", "gelatine", "joghurt", "quark", "schalenfrüchte", "zatziki", "zaziki", "butter"]

veg_regexs = generate_regex(terms_veg)
veg = apply_regexs(veg_regexs, nc_names )
veg += regex_filter(r"\bei\b", nc_names)
veg += regex_filter(r"\bml\b", nc_names)
veg += regex_filter(r"\bmi\b", nc_names)


nc_names = set(nc_names)-set(veg)
print(len(veg))
veg

1488


['italian burger - mozarella, eisberg , roten zwiebeln kartoffelspalten',
 'bio puten- cheeseburger ts - leider schon weggesnackt - war zu lecker.(glw, gld, mi, sf, sw)',
 'mac `n` cheese 3,10€',
 'cheese burger pommes frites',
 'cheese burger (1, glw, mi, se, sf, sl) pommes frites',
 'begrenztes angebot mac n cheese',
 'burger im menü mit fritz 0,33l cheese burger (1, glw, mi, se, sl) ... mit pommes frites',
 'mac n cheese (1, glw, mi, sf)',
 'mac and cheese',
 'tagesgericht: mac n cheese - gratin, blumenkohl, gouda, (1, glw, mi)',
 'cheese burger (1, glw, mi, se, sl)',
 'cheese burger (1, glw, mi, se, sf, sl)',
 'cheese burger kartoffelspalten',
 'tagesgericht cheese burger (1, glw, mi, se, sf, sl)',
 'tagesgericht: mac n cheese - gratin, blumenkohl, gouda (1, glw, mi)',
 'cheese burger "wie hausgemacht" (1, glw, mi, se, sl)',
 '"your favorites" mac n cheese (1, glw, mi, sf)',
 'mac n cheese',
 'cheese burger (ei, glw, glr, glg, glh, gld, mi, se, sf) tomaten -paprikasoße kartoffelspa

In [39]:
terms_vgn = ["vegan"]
vgn_heuristic = ["tofu", "reis", "soja", "asia"]

vgn_regexs = generate_regex(terms_vgn)
vgn += apply_regexs(vgn_regexs, nc_names)
nc_names = set(nc_names)-set(vgn)

vgn_heuristic_regexs = generate_regex(vgn_heuristic)
vgn += apply_regexs(vgn_heuristic_regexs, nc_names)
nc_names = set(nc_names)-set(vgn)


print(len(vgn))
vgn

731


['vegane brastwurst (sf) sauerkraut stampfkartoffeln (mi)',
 'veganer burger im menü mit fritz 0,33l falafel burger mit soja-paprika-dip, tomate, gurke, salat (glw, glh, gld, sf, so) ... mit pommes frites',
 'falafel bowl cous cous, tomate, gurke, rotkraut, eisbergsalat, mandeln, veganer dip',
 'veganissimo: rainbow pasta, kürbis, rote beete, oliven, kräuter, blumenkohl (6, 8, en, glw)',
 'veganer chrunchy chick`n burger, (glw, gld, sf, so) gemüsesticks & pommes fritiert',
 'sojagyros, vegane aioli ebly',
 'streng limitiert tomaten- oliven- peperoni- nudelauflauf mit veganem "käse"',
 'vegane ente orangensoße rotkohl kroketten',
 'planted bratwurst (sf) sauerkraut stampfkartoffeln vegan (1)',
 'jackfruit [bio] burger, vegane cocktailsoße (3, glw, gld, sf, so) gitterkartoffeln (glw)',
 'begrenztes angebot vegane bratwurst currysoße pommes frites',
 'bulgur [bio]-gemüsepfanne (glw, sf) veganer dip (1, 2, 4, 10, glw, sf, so)',
 'pasta bolognese vegan 3,50€',
 'vegane currywurst hot pot(t)

In [40]:
print(len(nc_names))
nc_names

1198


{'frikadellen bowl',
 'rote- linsen- suppe (sl)',
 'begrenztes angebot pasta bar nudeln jackfruitragout',
 'begrenztes angebot frischer stangenspargel hollandaise soße salzkartoffeln',
 'begrenztes angebot planted.bratwurst paprika- maissoße (sl) salzkartoffeln gemüse',
 'country- pfanne, falaffel, paprika kartoffel-rösti',
 'puten-piccata milanese cherry-tomatensoße röstkartoffeln',
 'grünkohleintopf mit kasseler (2, 8, 10, sf) baguettebrot (glw)',
 'currywurst hot pot(t) (2, 3, 10, sf, sl) kartoffelspalten (glw)',
 'tagliatelle, kürbis- ricotta- soße, walnüsse',
 'quinoa bowl, spinat-hummus, brokkoli, karotten, ingwer, casshewkerne (nc, se)',
 'italienischer nudelauflauf, champignons, mozzarella',
 'curryhuhnsuppe, mandeln baguettebrot',
 'quinoa [bio]- bowl (nc, se, sw) spinat-hummus, brokkoli, karotten, ingwer, casshewkerne',
 'gebratener seehecht,eihülle, balsamicolinsen, kartoffelpüree',
 'curry bowl',
 'grünkohleintopf, kochwurst (1, 2, 3, 4, 8, 10, sf, sl) baguettebrot (glw)',


In [42]:
vgn_veg = apply_regexs([".*vegan.*"], veg)
veg = set(veg)-set(vgn_veg)
print(len(vgn_veg))

meat_vgn_veg = apply_regexs([".*vegan.*", ".*vegetarisch.*", ".*vegetarian.*"], meat)
meat = set(meat)-set(meat_vgn_veg)
print(len(meat_vgn_veg))

0
0


In [43]:
list_to_file("Data/tags/1vgn_names.txt", vgn)
list_to_file("Data/tags/1veg_names.txt", veg)
list_to_file("Data/tags/1vgn_veg_names.txt", vgn_veg)
list_to_file("Data/tags/1meat_vgn_veg_names.txt", meat_vgn_veg)
list_to_file("Data/tags/1meat_names.txt", meat)