In [1]:
# importing the required libraries
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.tag import pos_tag
from nltk import RegexpParser
from nltk import Tree
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# all the packages we need to download.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [2]:
# A function to return the tags of a sentence. Tags such as noun verb or else. The tags are used to find patterns
def preprocess1(sent):
    """Tokenize ``sent`` and return the part-of-speech tagged tokens.

    The text is passed through ``nltk.word_tokenize`` and the resulting tokens
    through ``nltk.pos_tag``; the tagger's output is returned unchanged. The
    tags are what the chunk grammars in this file match against.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)



In [3]:
# This is a function which reads in the unit file and convert all the data into list so we can use them to change
# the units of ingredient. To add more unit list we have to modify this function and read the additional file of unit
# as well.
# We are lemmatizing and cleaning the file data as well to synchronize it with our processing and so 
#  Mango and mango are not taken as separate ones.
def read_units():
    """Load the ingredient word lists used when forcing units.

    Reads ``units_data/ml.txt`` and ``units_data/gram.txt``. Every line is
    stripped of its newline and surrounding whitespace, lower-cased and
    lemmatized, so the entries are normalized the same way as the ingredient
    names they are compared with.

    Returns:
        A ``(ml, gram)`` tuple holding the cleaned entries of each file.
    """
    def _load(path):
        # One cleaned entry per line of the file, in file order.
        with open(path) as handle:
            raw_lines = handle.readlines()
        return [
            wnl.lemmatize(re.sub("\n", "", raw).strip().lower())
            for raw in raw_lines
        ]

    ml = _load("units_data/ml.txt")
    gram = _load("units_data/gram.txt")
    return ml, gram

In [4]:
# This case is best explained by an example. We have an ingredient, water, whose unit we are forcing to be ml.
# In one of the recipes water is listed as 1/2 cups. We simply force it to be 1/2 ml, but when we have to add
# this to other amounts of water (say 2 ml), adding 1/2 and 2 is not possible in Python because 1/2 is
# treated as a string. So we need to convert 1/2 into 0.5; this function does that.
def divider(number):
    """Convert a textual quantity such as ``"1/2"`` into a float.

    Args:
        number: A string holding either a fraction (``"1/2"``) or a plain
            number (``"3"``, ``"0.5"``). Whitespace around the parts is
            tolerated.

    Returns:
        The numeric value as a float.

    Raises:
        ValueError: If a part of the string is not a valid number.
        ZeroDivisionError: If the denominator is zero.
    """
    parts = number.split("/")
    if len(parts) == 1:
        # No slash at all: the text is already a plain number, so there is
        # nothing to divide (this used to fail with an IndexError).
        return float(parts[0])
    # Only the first two parts are used, as before.
    return int(parts[0]) / int(parts[1])


In [5]:
# This is the function which converts the ingredient unit according to the files we have created. It receives
# those lists through the ml and gram parameters, and it also takes the ingredient name, unit and quantity.
# The major functionality is to check the name and, if the name is in one of the lists, convert the unit. An issue
# was occurring: if the ingredient is in l we should convert that to kg and not g, so we need to check the unit
# as well, and because of that we are passed the quantity too.
def convert_unit(substring4, unit, quantity_product, ml, gram):
    """Force an ingredient's unit to litres or kilograms based on its name.

    If ``substring4`` contains a non-empty entry of ``ml`` and ``unit`` is not
    already ``"l"``, the result is expressed in ``"l"``. Otherwise, if it
    contains a non-empty entry of ``gram`` and ``unit`` is not already
    ``"kg"``, the result is expressed in ``"kg"``. When the current unit is
    the other base unit (``"kg"`` for the litre list, ``"l"`` for the gram
    list) the quantity is kept as is; for any other unit it is divided by
    1000.

    Args:
        substring4: Cleaned ingredient name.
        unit: Current unit of the ingredient.
        quantity_product: Quantity as a number, or a fraction string that
            ``divider`` can parse.
        ml: Ingredient words that must be measured in litres.
        gram: Ingredient words that must be measured in kilograms.

    Returns:
        A ``(quantity, unit)`` tuple; the inputs are returned unchanged when
        no list entry applies.
    """
    def _rescale(target_unit, equivalent_unit):
        # The other base unit is only relabelled; anything else is treated as
        # the small unit and scaled down by 1000.
        if unit == equivalent_unit:
            return quantity_product, target_unit
        try:
            return quantity_product / 1000, target_unit
        except TypeError:
            # Non-numeric quantity such as "1/2": parse it before scaling.
            return divider(quantity_product) / 1000, target_unit

    # Empty entries (blank lines in the unit files) would match every name,
    # so they are ignored.
    if unit != "l" and any(m and m in substring4 for m in ml):
        return _rescale("l", "kg")
    if unit != "kg" and any(g and g in substring4 for g in gram):
        return _rescale("kg", "l")
    return quantity_product, unit

In [6]:
# The pattern found in the ingredient listings is that the words after a comma are mostly attributes of the
# ingredient, which we do not need, so we remove them by splitting. This fails in a few cases, and we recognize
# those by POS tags: if certain tags exist around the comma, we don't split there.
def comma_separator(m):
    """Drop the text after the first comma unless the comma joins like words.

    The description is POS-tagged and chunked; a chunk labelled ``P`` is a
    comma sitting between two tokens with the same tag (NN, VBD, JJ or VBN).
    When such a chunk starts with NN, VBN or JJ, or starts with VBD while the
    text has fewer than three comma-separated parts, the first two parts are
    kept (re-joined with a comma). In every other case only the part before
    the first comma is returned.
    """
    pieces = m.split(',')
    if "," not in m:
        return pieces[0]

    grammar = (
        "P: {<NN><,><NN>}\n"
        "                        {<VBD><,><VBD>}\n"
        "                        {<JJ><,><JJ>}    \n"
        "                        {<VBN><,><VBN>}"
    )
    tagged_tree = RegexpParser(grammar).parse(preprocess1(m))

    always_keep = ("NN", "VBN", "JJ")
    for node in tagged_tree:
        if not isinstance(node, Tree) or node.label() != 'P':
            continue
        leading_tag = node[0][1]
        verb_pair = leading_tag == "VBD" and len(pieces) < 3
        if verb_pair or leading_tag in always_keep:
            return pieces[0] + "," + pieces[1]

    return pieces[0]

In [7]:
# This is similar to the comma separator: we don't need the text which comes after an "or", except in a few places
# which we catch using POS-tag patterns. There was some and/or in the text which needed converting, so we do that first.
# In patterns you can see all the POS-tag patterns for which we keep the text after the "or" instead of the text before it.

def or_separator(substring2):
    """Choose one alternative from a description containing ``" or "``.

    ``and/or`` and ``or/and`` are first rewritten to ``or``. The text is then
    split on ``" or "``. If the POS-tagged text contains a chunk matching one
    of the grammar patterns whose middle token is ``or``, the piece after the
    first ``" or "`` is returned; otherwise the piece before it is returned.
    Text without ``" or "`` comes back whole (after the rewrites above).
    """
    substring2 = re.sub("and/or", "or", substring2)
    substring2 = re.sub("or/and", "or", substring2)
    alternatives = substring2.split(' or ')

    grammar = (
        "P: {<JJ><CC><JJ>}\n"
        "                     {<VBD><CC><VB>}\n"
        "                     {<NN><CC><JJ>}\n"
        "                     {<VBD><CC><JJ>}\n"
        "                     {<CD><CC><CD>}"
    )
    chunker = RegexpParser(grammar)

    if " or " not in substring2:
        return alternatives[0]

    for node in chunker.parse(preprocess1(substring2)):
        is_pattern = isinstance(node, Tree) and node.label() == 'P'
        if is_pattern and node[1][0] == "or":
            return alternatives[1]

    return alternatives[0]


In [15]:
# Main function that processes all and return the dataframe having all ingredients with there quantity and unit.
# it takes in the json data with the ml and gram unit lists as well. 
# Two-word phrases removed from the cleaned ingredient name.
_BIGRAMS = ("seasoned dry",)

# Single descriptive tokens removed from the ingredient name.
_DESCRIPTOR_WORDS = (
    "assorted", "free-range", "hard", "hard-cooked", "added", "no-salt", 'chopped', "slices", "pinch",
    "large", "medium", "small", "each", "cooked", "uncooked", "cooled", "pot", "pouch", "packed",
    "package", "regular", "roughly", "slice", "sliced", "strips",
)

# Units looked for inside the description. They are space-padded so that only
# whole tokens match, and the order matters for ties and for the early exit in
# _find_unit.
_UNITS = (
    ' ml ', ' cup ', ' can ', ' cans ', ' litres ', ' litre ', ' cups ', " liter ", " liters ", " cm ",
    " in ", ' g ', ' oz ', ' lb ', ' kg ', ' kgs ', ' cm ', ' tbsp ', " ounces ", " lbs ", ' tsp ', ' l ',
)

# Unit words, symbols and other leftovers removed from the ingredient name.
_NOISE_TOKENS = (
    '%', '/', '.', '-', '_', ']', '*', '-/', 'c.', 'à', 'thé', 'hot', 'pkg', 'plus', 'tbsp/', 'litres',
    'litre', 'half', 'and', 'cubed', 'cube', '/-inch', "in", "cm", ',packages', 'pitted', 'plain', 'thin',
    'ea', '-inch', 'inch', 'cube', 'cubes', 'thinly', 'tiny', 'finely', '[', 'ml', 'cup', "cups", 'can',
    'cans', "liter", "-in", "liters", 'g', 'oz', 'kg', 'cm', 'lb', '-lb', 'tbsp', 'kgs', 'tsp', 'l', "ea",
    "ounces", "lbs", "peeled", "roasted",
)

# Every token dropped from the name; a set so the per-token test is cheap.
_DROPPED_TOKENS = frozenset(_DESCRIPTOR_WORDS + _NOISE_TOKENS)

# Unit spellings folded into "l" and "kg" respectively.
_LITRE_UNITS = ("litres", "litre", "liter", "liters", "ml", "l")
_KILOGRAM_UNITS = ("kgs", "kg", "g")


def _find_unit(text, declared_unit):
    """Pick the unit of an ingredient line, or ``"x"`` when none is known."""
    best_position = 0
    chosen = "x"
    for candidate in _UNITS:
        position = text.find(candidate)
        # Keep the right-most unit seen so far (a miss is -1 and never wins).
        if position > best_position:
            best_position = position
            chosen = candidate
            # A unit that agrees with the declared one is final.
            if declared_unit is not None and chosen.strip() == declared_unit.strip():
                break
    if chosen == "x" and declared_unit is not None:
        chosen = declared_unit
    return chosen


def _extract_quantity(text, unit):
    """Return the number written directly before ``unit`` in ``text``, or None."""
    # The unit may come from the input data, so it must not be read as regex.
    unit_pattern = re.escape(unit)
    # Fractions win over decimals, decimals over plain integers.
    for number_pattern in (r"(\d+/\d+)", r"(\d+\.\d+)", r"(\d+)"):
        found = re.findall(number_pattern + unit_pattern, text)
        if found:
            return found[0]
    return None


def _clean_name(text):
    """Reduce an ingredient line to its lemmatized name."""
    # Remove "( ... )" groups, then anything cut off by an unmatched bracket.
    text = re.sub(r"\(.*?\)", "", text)
    opening = text.find("(")
    if opening != -1:
        text = text[:opening]
    closing = text.rfind(")")
    if closing != -1:
        text = text[closing + 1:]
    # Quantities have already been extracted; remaining digits are noise.
    text = re.sub(r"\d", "", text)
    kept = [wnl.lemmatize(token) for token in word_tokenize(text) if token not in _DROPPED_TOKENS]
    return " ".join(kept).strip(",").strip()


def _to_base_metric(quantity, unit):
    """Express litre and kilogram variants as ``"l"`` / ``"kg"`` when numeric."""
    try:
        value = float(quantity)
    except (TypeError, ValueError):
        # Fractions such as "1/2" (or a missing quantity) are left untouched,
        # unit included; they are dealt with later.
        return quantity, unit
    if unit in _LITRE_UNITS:
        return (value / 1000 if unit == "ml" else value), "l"
    if unit in _KILOGRAM_UNITS:
        return (value / 1000 if unit == "g" else value), "kg"
    return value, unit


def process(input_json_data, ml, gram):
    """Turn the recipe JSON into a table of ingredient, quantity and unit.

    For every entry of ``input_json_data['data'][i]['ingredients']
    ['ungrouped']['list']`` the ``description`` text is tokenized, reduced by
    ``comma_separator`` and ``or_separator`` and lower-cased. From that text
    the unit and the quantity written before it are extracted, falling back
    to the entry's ``unit`` and ``quantity`` fields. The name is cleaned and
    lemmatized, litre/kilogram variants are normalized, and ``convert_unit``
    finally forces the unit for ingredients found in ``ml`` or ``gram``.

    Args:
        input_json_data: Parsed JSON with the structure described above.
        ml: Ingredient words measured in litres (see ``read_units``).
        gram: Ingredient words measured in kilograms (see ``read_units``).

    Returns:
        A DataFrame with the columns ``food``, ``quantity`` and ``unit``, one
        row per ingredient entry, or ``None`` when there are no entries.
    """
    des, quantity, unit = [], [], []

    for result in input_json_data['data']:
        for a in result['ingredients']['ungrouped']['list']:
            declared_unit = a['unit']

            # "1 g. water" -> "1 g water"; tokenizing and re-joining then puts
            # a space around every word and punctuation mark.
            m = re.sub(r"g\.", "g", a['description'])
            m = re.sub(r"\. ", " ", m)
            m = " ".join(word_tokenize(m))

            text = or_separator(comma_separator(m)).lower()

            u = _find_unit(text, declared_unit)
            quantity_product = _extract_quantity(text, u)
            if quantity_product is None:
                quantity_product = a['quantity']

            substring4 = _clean_name(text)
            quantity_product, u = _to_base_metric(quantity_product, u.strip())

            for bi in _BIGRAMS:
                substring4 = substring4.replace(bi, "")

            quantity_product, u = convert_unit(substring4, u, quantity_product, ml, gram)

            # Diagnostic output kept from the original implementation.
            if "cut-up" in substring4:
                print(m)
                print(substring4)

            des.append(substring4)
            quantity.append(quantity_product)
            unit.append(u)

    if not des:
        return None
    # Built once, after the loops, instead of once per ingredient.
    return pd.DataFrame(list(zip(des, quantity, unit)), columns=["food", "quantity", "unit"])

In [16]:
# This function adds a space before and after a word. This was required due to certain matching issues.
# E.g. we want to find the category of rice, which is listed in (I think) pantry. We also have ice listed in chilled.
# When we try to match ice against rice it gives a 100% match, as "ice" is contained in "rice", and we don't want that.
# So we added spaces: this converts "ice" into " ice ", which is not present in " rice ".
def add_spaces(text):
    """Return ``text`` with one space added in front and one behind."""
    return "".join((" ", text, " "))

In [17]:
# reading all the category files cleaning them and creating there lists for checking
def create_category_list():
    """Load the ten category word lists from ``categories_data``.

    Every line of a category file is stripped, lower-cased, lemmatized and
    wrapped in spaces with ``add_spaces``.

    Returns:
        A tuple of ten lists in the order bakery, chilled, beverages, dairy,
        fruit, grains, herbs, meat, nuts, pantry.
    """
    def _load(path):
        # Cleaned, space-padded entries of one category file, in file order.
        with open(path) as handle:
            return [
                add_spaces(wnl.lemmatize(re.sub("\n", "", entry).strip().lower()))
                for entry in handle.readlines()
            ]

    category_files = (
        "categories_data/Bakery.txt",
        "categories_data/Chilled.txt",
        "categories_data/Beverages.txt",
        "categories_data/Dairy_Eggs.txt",
        "categories_data/Fruit_Vegetables.txt",
        "categories_data/Grains_Beans.txt",
        "categories_data/Herbs_Spices.txt",
        "categories_data/Meat_Fish _Alternatives.txt",
        "categories_data/Nuts_Seeds.txt",
        "categories_data/Pantry.txt",
    )
    return tuple(_load(path) for path in category_files)

In [18]:
# this function takes in all the list and the ingredient name and return its category
def name_cat(name, bakery, chilled, beverages, dairy, fruit, grains, herbs, meat, nuts, pantry):
    """Return the category code of an ingredient name.

    The lists are checked in parameter order and the position of the first
    list holding a non-empty entry contained in ``name`` is returned
    (0 = bakery ... 9 = pantry). When no list matches the result is 10.
    """
    ordered_lists = (bakery, chilled, beverages, dairy, fruit, grains, herbs, meat, nuts, pantry)
    for code, entries in enumerate(ordered_lists):
        if any(entry in name and len(entry) > 0 for entry in entries):
            return code
    return 10

In [19]:
# the main function that calls all the functions and synchronize everything. It only takes in the data
def main_function(input_json):
    """Build the categorized shopping list for the given recipe data.

    Reads the unit lists, runs ``process`` on ``input_json``, adds up the
    quantities of rows sharing the same food and unit, and files each total
    under the category reported by ``name_cat``.

    Numeric totals below 1 l / 1 kg are reported in ml / g. Totals are
    rounded to two decimals, except that a total rounding to ``x.99`` is
    rounded to the nearest whole number. Groups whose quantities cannot be
    summed as numbers (fraction strings such as ``"1/2"``) are added up
    exactly and reported as a ``"n/d"`` string, or as an int when the total
    is a whole number; their unit is left unchanged.

    Args:
        input_json: Parsed recipe JSON, passed on to ``process``.

    Returns:
        A dict mapping each category title to a list of
        ``{"ingredient", "unit", "quantity"}`` dicts. All lists are empty
        when ``process`` found no ingredients.
    """
    from fractions import Fraction

    def _as_fraction(value):
        # Exact value of one quantity taken from a non-numeric group.
        if value is None:
            return Fraction(0)
        if isinstance(value, str):
            parts = value.split("/")
            if len(parts) > 1:
                return Fraction(int(parts[0]), int(parts[1]))
            return Fraction(parts[0].strip())
        # Going through str() turns 0.5 into 1/2 rather than a binary ratio.
        return Fraction(str(value))

    ml, gram = read_units()
    df = process(input_json, ml, gram)

    # One bucket per category code returned by name_cat (0..10).
    buckets = [[] for _ in range(11)]

    if df is not None:
        categories = create_category_list()
        for name, group in df.groupby(["food", "unit"]):
            new = {}
            new["ingredient"] = name[0].strip()
            try:
                total = group["quantity"].sum()
                if name[1] == "l" and total < 1:
                    new["unit"] = "ml"
                    new["quantity"] = round(float(total * 1000), 2)
                elif name[1] == "kg" and total < 1:
                    new["unit"] = "g"
                    new["quantity"] = round(float(total * 1000), 2)
                else:
                    new["unit"] = name[1]
                    # A total such as 2.99 is shown as 3 rather than 2.99.
                    if round(total, 2) == int(total) + 0.99:
                        new["quantity"] = round(total, 0)
                    else:
                        new["quantity"] = round(total, 2)
            except (TypeError, ValueError):
                # The group holds fraction strings; add them up exactly.
                new["unit"] = name[1]
                exact = sum((_as_fraction(q) for q in group["quantity"]), Fraction(0))
                if exact.denominator == 1:
                    new["quantity"] = exact.numerator
                else:
                    new["quantity"] = "{}/{}".format(exact.numerator, exact.denominator)

            cat = name_cat(add_spaces(name[0]), *categories)
            buckets[cat].append(new)

    (json_bakery, json_chilled, json_beverages, json_dairy, json_fruit, json_grains,
     json_herbs, json_meat, json_nuts, json_pantry, json_others) = buckets

    # The structure consumed by the front end.
    final = {"Pantry": json_pantry, "Beverages": json_beverages, "Fruits and Vegetables": json_fruit,
             "Meat,Fish and Alternatives": json_meat, "Dairy and Eggs": json_dairy,
             "Chilled": json_chilled, "Grains and Beans": json_grains, "Herbs and Spices": json_herbs,
             "Nuts and Seeds": json_nuts, "Bakery": json_bakery, "You may also need": json_others}
    return final

    

In [20]:

import json

# Load the recipe data; the context manager closes the file once it is parsed.
with open('total.json') as f:
    data = json.load(f)
# Build the categorized ingredient list from the loaded data.
aa = main_function(data)

3 cups ( 750 mL ) cut-up , cooked or canned chicken
cut-up
2 cups ( 450 mL ) cut-up cooked chicken
cut-up chicken
1 cup ( 250 mL ) cut-up deli ham and/or cooked chicken
cut-up deli ham
2 cups ( 500 mL ) cut-up cooked rotisserie chicken
cut-up rotisserie chicken
2 cups ( 500 mL ) cut-up cooked chicken
cut-up chicken
1/3 cup ( 75 mL ) cut-up apple or grapes
cut-up apple
4 cups ( 1 L ) cut-up assorted fresh vegetables , ( bell peppers , mushrooms , zucchini and/or yellow squash )
cut-up fresh vegetable
2 cups ( 500 mL ) assorted cut-up fresh vegetables , ( broccoli florets , red bell pepper , zucchini and/or asparagus )
cut-up fresh vegetable
2 cups ( 500 mL ) rotisserie chicken or cut-up cooked chicken
cut-up chicken
1 cup ( 250 mL ) cut-up cooked chicken , cubed
cut-up chicken


In [14]:
# Write the categorized result as indented UTF-8 JSON, keeping non-ASCII
# characters as they are.
with open("sample.json", mode="w", encoding="utf-8") as sink:
    json.dump(aa, fp=sink, ensure_ascii=False, indent=4)