In [8]:
import nltk
# Fetch the NLTK resources used by this notebook. Re-running is harmless: the
# cell output reports each package as "already up-to-date".
# Per the NLTK documentation, 'punkt' backs word_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag; 'maxent_ne_chunker' and 'words'
# are the resources for ne_chunk, which is imported below but not called in
# the code visible here.
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
# tree2conlltags flattens a chunk tree into (word, pos, iob) triples.
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
import pandas as pd


def preprocess(sent):
  """Tokenise a sentence and tag every token with its part of speech.

  Args:
    sent: the raw text of one line.

  Returns:
    Whatever ``nltk.pos_tag`` yields for the word tokens of ``sent``.
  """
  tokens = nltk.word_tokenize(sent)
  return nltk.pos_tag(tokens)


def menu_to_list(lines):
  """Chunk every menu line and return its tokens with IOB tags.

  Each line is tokenised and POS-tagged by ``preprocess``, parsed with the
  "Price" chunk grammar, and flattened with ``tree2conlltags``.

  Args:
    lines: iterable of raw text lines.

  Returns:
    A list with one entry per input line; each entry is the sequence of
    (word, pos_tag, iob_tag) triples for that line.
  """
  # Raw string: the grammar contains ``\$``, which is not a valid Python
  # escape. The runtime value is unchanged, only the warning goes away.
  pattern = r'''Price:|<S><CD> {(<\$><CD>*|<CD>$|^<CD>$)}'''
  # The grammar is constant, so build the parser once instead of once per line.
  cp = nltk.RegexpParser(pattern)
  return [tree2conlltags(cp.parse(preprocess(line))) for line in lines]


def tuple_converter(m_list):
  """Convert every token tuple in a nested list into a list.

  Args:
    m_list: list of lines, each a sequence of token tuples.

  Returns:
    A new list of lines in which every token is a ``list``. Entries that
    are not iterable are skipped rather than converted.
  """
  new_m_list = []
  for sub in m_list:
    new_l = []
    for tup in sub:
      try:
        new_l.append(list(tup))
      except TypeError:
        # Not iterable, so it cannot be a token tuple: skip it. Catching
        # only TypeError lets KeyboardInterrupt and real bugs propagate.
        continue
    new_m_list.append(new_l)
  return new_m_list


def menu_labeller(unlabelled_menu):
  """Flatten IOB-tagged lines into ["Price", text] and ["Dish", text] rows.

  Tokens are read left to right. Words outside a price chunk accumulate into
  a dish name. When a "B-Price" token is met, the price row is emitted first
  and the dish name collected so far is emitted right after it. Any dish
  name still pending at the end of a line is emitted as well.

  Args:
    unlabelled_menu: list of lines; each line is a list of
      [word, pos_tag, iob_tag] tokens.

  Returns:
    A flat list of two-element lists, ["Price", text] or ["Dish", text].
  """
  menu_table = []
  for tokens in unlabelled_menu:
    dish_words = []
    try:
      for idx, token in enumerate(tokens):
        word, iob_tag = token[0], token[2]
        if iob_tag == "B-Price":
          price = word
          # Glue on the next token only when it continues this price chunk.
          # Appending it unconditionally merged ordinary words into the
          # price and then counted them again in the dish name.
          # Only the first continuation token is kept, as before.
          if idx + 1 < len(tokens) and tokens[idx + 1][2] == "I-Price":
            price += tokens[idx + 1][0]
          menu_table.append(["Price", price])
          if dish_words:
            menu_table.append(["Dish", " ".join(dish_words).strip()])
            dish_words = []
        elif iob_tag != "I-Price":
          dish_words.append(word)
      if dish_words:
        menu_table.append(["Dish", " ".join(dish_words).strip()])
    except (IndexError, TypeError):
      # Malformed token (too short or wrong type): keep the rows already
      # emitted for this line and move on to the next one.
      continue
  return menu_table


def menu_cleaner(labelled_menu):
  """Drop "Dish" rows whose text is shorter than three characters.

  Such rows are typically blanks or stray symbols. "Price" rows are never
  removed.

  Args:
    labelled_menu: list of [label, text] rows; modified in place.

  Returns:
    The same list object, with the short "Dish" rows removed.
  """
  # Slice assignment keeps the caller's list object while filtering in a
  # single pass, instead of one list.remove() scan per discarded row.
  labelled_menu[:] = [
    row for row in labelled_menu
    if not (row[0] == "Dish" and len(row[1]) < 3)
  ]
  return labelled_menu


def menu_categorizer(labelled_menu, adjuster):
  """Pair every price with a neighbouring row and return a dataframe.

  Args:
    labelled_menu: list of [label, text] rows as produced by the labelling
      and cleaning steps.
    adjuster: offset of the row to pair with each price. Use +1 when the
      matching dish follows the price in the list and -1 when it precedes
      it; try both to see which fits a given menu.

  Returns:
    A pandas DataFrame with columns '$' and "Dish", one row per paired price.
    A price whose neighbour would fall outside the list is skipped.
  """
  total_menu = []
  row_count = len(labelled_menu)
  for idx, row in enumerate(labelled_menu):
    if row[0] != "Price":
      continue
    neighbour = idx + adjuster
    # Without this check a trailing price raised IndexError (adjuster=+1)
    # and a leading price wrapped around to the last row (adjuster=-1).
    if not 0 <= neighbour < row_count:
      continue
    total_menu.append([row[1], labelled_menu[neighbour][1]])
  return pd.DataFrame(total_menu, columns=['$', "Dish"])


def menutxt_to_dataframe(menu_text_file, adjuster):
  """Run the whole pipeline: raw menu text in, price/dish dataframe out.

  Args:
    menu_text_file: the menu as one string, with lines separated by '\\n'.
    adjuster: passed through to ``menu_categorizer``; use -1 or +1 and try
      both to see which pairing fits the menu.

  Returns:
    The dataframe built by ``menu_categorizer``.
  """
  rows = menu_text_file.split('\n')
  labelled = menu_labeller(tuple_converter(menu_to_list(rows)))
  return menu_categorizer(menu_cleaner(labelled), adjuster)


# Sample input: noisy menu text (looks like OCR output) in which dish names
# and their prices sit on the same line.
sentence = """SEMI-BUFFET
LUNCH
The below include the Ruffet
Re are
Pepperoni Pizza $119 8" Buffalo Mozzarella Fritta $119
get
not * deep fried mozzarella gorlie butalo sauce campana DOP
DOP > : a on
Buffalo Wings $119 Wild Mushroom and Asparagus Risotto $139
wings sweet potato fries parme. san homemade pesto
swan :
Shrimp Spaghetti AOP $149 Pan-roasted Barramundi $159
chopped shrimp garlic chill asparagus mashed potatoes asparagus tomato salsa
fresh italian parsley
U.S. Hanger Steak (6oz) $159 Boston Lobster Tail Burger $199
am a a an
tagliata style sauteed vegetables iceberg lettuce tomato fennel guacamole
u.s. french fries black pepper sauce parmesan honey mustard u.s. french fries $1
Salad-Dessert Buffet Only $109
All entrees include a Regular Coffee, Tea or Homemade iced Tea
Drinks Upgrade
+$10 Other Coffee, English Breakfast Tea, Earl Grey Tea, Juice or Soft Drink
+$29 Selected Beer or Selected House Red/White Wine a
Guest must order an entree per person For dine-in only All prices are in 10% service charge applies
Cannot be used in conjunction with other promotional offers
"""

# adjuster=1 pairs each price with the row that follows it in the labelled
# list. The bare expression relies on the notebook to display the dataframe.
menutxt_to_dataframe(sentence,1)



[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/jupyter/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,$,Dish
0,$119,Pepperoni Pizza
1,$119,'' Buffalo Mozzarella Fritta
2,$119,Buffalo Wings
3,$139,Wild Mushroom and Asparagus Risotto
4,$149,Shrimp Spaghetti AOP
5,$159,Pan-roasted Barramundi
6,$159,U.S. Hanger Steak ( 6oz )
7,$199,Boston Lobster Tail Burger
8,$1,u.s. french fries black pepper sauce parmesan ...
9,$109,Salad-Dessert Buffet Only


In [9]:
# Second sample: a weekend / public-holiday menu in which several dishes have
# no price on their line, so they produce no row in the printed table.
sentence = """Saturday, Sunday: & Public Holiday
Them below entrees include them Salad-Dessert Buffet
12" Hawaiian Pizza $159. Wild Breakfast
smoked ham pineapple two eggs any style herbed sausage bacon ham
mozzarella tomato sauce grilled tomato sauteed mushroom baked beans
sauteed herbed potato toasted sourdough
Buffalo Wings $149 Eggs Royale
8 wings u.s. sweet potato fries poached smoked salmon
spinach english muffin hollandaise
Crabmeat Omelette $159 Black Truffle Prawn Spaghetti $179
shallot spring onion pepper asparagus cherry tomato
onion toasted sour dough
Italian Beef Burger (80z) Pan-roasted Salmon
(85+)
salami pickle lettuce cheddar tomato french red fries onion mashed potatoes mango salsa
Australian Rib-eyer Steak (802) $1993 Kids' Salad-Dessert Buffet
I
french fries pepper sauce For under years only
entrees include. a Regular Coffee, Team or Homemade Iced Teali
Drinks Upgrade
Other, Coffee, * English Breakfast Tea, Grey Tea, Juice or Drink
"""

# Same pairing direction as the first sample (adjuster=1).
print(menutxt_to_dataframe(sentence,1))

       $                               Dish
0   $159               12 '' Hawaiian Pizza
1   $149                      Buffalo Wings
2   $159                  Crabmeat Omelette
3   $179      Black Truffle Prawn Spaghetti
4  $1993  Australian Rib-eyer Steak ( 802 )


In [10]:
# Third sample: here every price is on the line AFTER its dish.
sentence = """
LUNCH MENU
(OYSTER
Dailys Jet Oysterm (3 pcs)
$68
STARTER
Salad
ORE
Soups of Thema Dayan
MAIN COURSE
Linguine with Shrimp, Mixed Bell! Pepper Cream Sauce
$118
Pan  Pacific Ocean Perch with Daily! Sides, Salsa Sauce
$128
Slow Cooked Pork Knuckle with Sauerkraut, Rosemary Garlic Creamy Sauce
$138
Grilled Lamb Chops with Daily Sides, House Gravys
$148
Charcoal Grilled Grain Beef Tenderloin with Daily Sides, Cafer Paris Butter
$168
DESSERT
Banoffee Pient
ORI
Dailys Special
Coffee
GLASS HOUSE WINE (+S48)
10% Service Charge

"""

# NOTE(review): with adjuster=1 the printed table pairs each price with the
# line that follows it (e.g. $68 with STARTER). For this layout adjuster=-1
# is presumably the better fit; confirm by running it.
print(menutxt_to_dataframe(sentence,1))

      $                                               Dish
0   $68                                            STARTER
1  $118  Pan Pacific Ocean Perch with Daily ! Sides , S...
2  $128  Slow Cooked Pork Knuckle with Sauerkraut , Ros...
3  $138  Grilled Lamb Chops with Daily Sides , House Gr...
4  $148  Charcoal Grilled Grain Beef Tenderloin with Da...
5  $168                                            DESSERT
