In [24]:
import os
import random
import pandas as pd

# MIT Restaurant

In [25]:
train_path = "../raw/restauranttrain.bio"
test_path = "../raw/restauranttest.bio"


def read_bio_records(path):
    """Read a BIO-tagged file and return its records as a list of strings.

    Records are the blocks of text separated by a blank line. The file is
    decoded as UTF-8 explicitly so the result does not depend on the
    platform's default encoding.

    Parameters
    ----------
    path : str
        Location of the ``.bio`` file.

    Returns
    -------
    list[str]
        One string per record, without surrounding newlines. Empty blocks
        (empty file, or more than one blank line between records) are dropped.
    """
    with open(path, 'r', encoding='utf-8') as reader:
        blocks = reader.read().strip().split('\n\n')
    # Extra blank lines between records leave stray newlines or empty blocks
    # behind after the split; normalise them away here.
    trimmed = (block.strip('\n') for block in blocks)
    return [block for block in trimmed if block]


train = read_bio_records(train_path)
test = read_bio_records(test_path)

In [26]:
# Peek at two raw records: one sentence each, one tab-separated "tag, token" row per line.
train[5:7]

['O\ta\nO\tplace\nO\tthat\nO\tserves\nB-Dish\tsoft\nI-Dish\tserve\nI-Dish\tice\nI-Dish\tcream',
 'O\ta\nO\trestaurant\nO\tthat\nO\tis\nB-Rating\tgood\nB-Amenity\tfor\nI-Amenity\tgroups']

In [27]:
def preprocess_line(line):
    """Convert one BIO-tagged record into its plain text and labelled entities.

    Parameters
    ----------
    line : str
        One record: newline-separated rows, each a BIO tag and a token
        separated by a tab. The tag is ``O``, ``B-<label>`` or ``I-<label>``.
        Blank rows are ignored.

    Returns
    -------
    tuple[str, dict[str, list[str]]]
        The tokens joined by single spaces, and a dict mapping each label to
        the list of entity phrases found for it, in order of appearance.

    Raises
    ------
    Exception
        If an ``I-`` tag starts a phrase (first row or directly after ``O``),
        if an ``I-`` tag's label differs from the phrase it continues, or if
        a tag is not ``O`` / ``B-*`` / ``I-*``.
    ValueError
        If a non-blank row contains no tab separator.
    """
    tokens = []
    named_entity = {}
    span_tokens = []       # tokens of the entity phrase currently being built
    span_label = None      # label of that phrase; None while outside any entity
    prev_bio_tag = None    # only needed to word the error message below

    def close_span():
        """Store the phrase in progress (if any) under its label and reset."""
        nonlocal span_tokens, span_label
        if span_label is not None:
            named_entity.setdefault(span_label, []).append(' '.join(span_tokens))
        span_tokens, span_label = [], None

    for row in line.split('\n'):
        if not row.strip():
            # Tolerate stray blank rows instead of failing on the unpack below.
            continue
        bio_tag, token = row.split('\t', 1)
        tokens.append(token)

        if bio_tag == 'O':
            close_span()
        elif bio_tag.startswith("B-"):
            # A B- tag always starts a new phrase, whatever preceded it.
            close_span()
            span_label = bio_tag[2:]
            span_tokens = [token]
        elif bio_tag.startswith("I-"):
            if span_label is None:
                raise Exception(f"I token cannot begin a named entity phrase | line : {line}")
            if bio_tag[2:] != span_label:
                # prev_bio_tag is necessarily a B- or I- tag here.
                raise Exception(
                    f"I token cannot align with {prev_bio_tag[0]} token with different label | line : {line}"
                )
            span_tokens.append(token)
        else:
            # Previously such tags were skipped silently, which could merge or
            # mislabel neighbouring phrases.
            raise Exception(f"Unrecognised BIO tag {bio_tag!r} | line : {line}")
        prev_bio_tag = bio_tag

    close_span()  # a phrase may run up to the last token
    return ' '.join(tokens), named_entity

In [28]:
# Parse every raw record into a (text, entities) pair.
# NOTE: this rebinds `train` / `test` from lists of raw strings to lists of
# tuples, so re-running this cell requires re-running the loading cell first.
train = list(map(preprocess_line, train))
test = list(map(preprocess_line, test))

In [29]:
# Show only the first few parsed (text, entities) pairs; the full list holds
# thousands of entries and would flood the output.
train[:10]

[('2 start restaurants with inside dining',
  {'Rating': ['2 start'], 'Amenity': ['inside dining']}),
 ('34', {}),
 ('5 star resturants in my town',
  {'Rating': ['5 star'], 'Location': ['in my town']}),
 ('98 hong kong restaurant reasonable prices',
  {'Restaurant_Name': ['hong kong'], 'Price': ['reasonable']}),
 ('a great lunch spot but open till 2 a m passims kitchen',
  {'Hours': ['open till 2 a m'], 'Restaurant_Name': ['passims kitchen']}),
 ('a place that serves soft serve ice cream',
  {'Dish': ['soft serve ice cream']}),
 ('a restaurant that is good for groups',
  {'Rating': ['good'], 'Amenity': ['for groups']}),
 ('a salad would make my day', {'Dish': ['salad']}),
 ('a smoothie would hit the spot', {'Cuisine': ['smoothie']}),
 ('a steak would be nice', {'Dish': ['steak']}),
 ('a sub place near independence boulevard',
  {'Dish': ['sub'], 'Location': ['near independence boulevard']}),
 ('about how much is a midpriced bottle of good wine at davidos italian palace',
  {'Price': [

In [30]:
# Number of parsed training sentences.
len(train)

7660

In [31]:
# Number of parsed test sentences.
len(test)

1521

In [32]:
# Pool both splits into a single list (training items first, then test items).
data = [*train, *test]

In [33]:
# Tally how many entity phrases each label has across the pooled data.
value_counts = {}
for _sentence, entities in data:
    for label, phrases in entities.items():
        value_counts[label] = value_counts.get(label, 0) + len(phrases)

In [34]:
# Entity-phrase totals per label over train + test.
value_counts

{'Rating': 1271,
 'Amenity': 3074,
 'Location': 4629,
 'Restaurant_Name': 2303,
 'Price': 901,
 'Hours': 1202,
 'Dish': 1763,
 'Cuisine': 3371}

In [44]:
# Build one prompt / text / output row per sentence.
#   * Sentence with entities: ask about the label, among those present, that
#     has the highest corpus-wide count in `value_counts`; the answer is that
#     label's phrases joined by ' , '.
#   * Sentence without entities: ask about a randomly drawn label; the answer
#     is the literal "NONE".
SEED = 42
# Dedicated, seeded generator so the labels drawn for entity-free sentences
# (and therefore the exported CSV) are identical on every run.
rng = random.Random(SEED)
all_labels = list(value_counts.keys())

rows = []
for text, named_entity in data:
    if len(named_entity) > 0:
        chosen_named_entity = max(named_entity, key=lambda label: value_counts[label])
        output = ' , '.join(named_entity[chosen_named_entity])
    else:
        chosen_named_entity = rng.choice(all_labels)
        output = "NONE"
    # Labels such as "Restaurant_Name" read better in a prompt with spaces.
    chosen_named_entity = chosen_named_entity.replace('_', ' ')
    rows.append({
        "prompt": f"Extract all the {chosen_named_entity} entity in the text",
        "text": text,
        "output": output,
        "named_entity": chosen_named_entity,
    })
data_frame = pd.DataFrame(rows)

In [45]:
# Preview the assembled table (columns: prompt, text, output, named_entity).
data_frame

Unnamed: 0,prompt,text,output,named_entity
0,Extract all the Amenity entity in the text,2 start restaurants with inside dining,inside dining,Amenity
1,Extract all the Price entity in the text,34,NONE,Price
2,Extract all the Location entity in the text,5 star resturants in my town,in my town,Location
3,Extract all the Restaurant Name entity in the ...,98 hong kong restaurant reasonable prices,hong kong,Restaurant Name
4,Extract all the Restaurant Name entity in the ...,a great lunch spot but open till 2 a m passims...,passims kitchen,Restaurant Name
...,...,...,...,...
9176,Extract all the Amenity entity in the text,will waffle house accept a prepaid visa gift card,prepaid visa gift card,Amenity
9177,Extract all the Location entity in the text,yes please get me mcdonalds phone number in pa...,patchogue new york,Location
9178,Extract all the Location entity in the text,yes the new diner on south street please,south street,Location
9179,Extract all the Restaurant Name entity in the ...,yes we need some chicken for our new diet so c...,chik fa lay,Restaurant Name


In [46]:
# How many rows ask about each label.
data_frame["named_entity"].value_counts()

Location           4492
Cuisine            1547
Amenity            1188
Restaurant Name     908
Dish                709
Hours               134
Rating              118
Price                85
Name: named_entity, dtype: int64

In [47]:
# Keep only the columns needed in the exported dataset and write them to CSV.
output_path = "../interim/restaurant.csv"
# to_csv does not create missing parent directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data_frame = data_frame[["prompt", "text", "output"]]
data_frame.to_csv(output_path, index=False)