In [11]:
from nltk.corpus import stopwords
from nltk import word_tokenize

# Materialise the English stopword list as a set for fast membership tests.
# NOTE(review): this rebinds the name `stopwords` from the imported nltk corpus
# object to a plain set, so the corpus reader is no longer reachable under this
# name after the line runs.
stopwords = set(stopwords.words("english"))

In [12]:

from collections import Counter

# Count the non-stopword tokens of every test utterance, split by whether the
# matching answer line starts with "find_hotel" (H_counter) or not (R_counter).
H_counter = Counter()
R_counter = Counter()

# Open both files in a `with` block so they are closed even if tokenisation fails.
with open("data/WOZ_test_utt.txt",encoding='utf-8') as f_request, \
        open("data/WOZ_test_ans.txt",encoding='utf-8') as f_slots:
    for line in f_request:
        # The answers file is read in lock-step: one answer line per utterance.
        slot = f_slots.readline()
        # Lower-case each token once, then drop stopwords.
        lowered = [word.lower() for word in word_tokenize(line.strip())]
        words = [word for word in lowered if word not in stopwords]
        if slot.startswith("find_hotel"):
            H_counter.update(words)
        else:
            R_counter.update(words)

print(H_counter.most_common(60))
print(R_counter.most_common(60))

[('.', 154), ('hotel', 99), ('looking', 85), ('stay', 80), ('?', 68), ('place', 65), ('free', 63), (',', 57), ("'m", 52), ('find', 49), ('help', 38), ('parking', 34), ('need', 33), ('wifi', 31), ('hi', 28), ('guesthouse', 27), ('cambridge', 24), ('house', 22), ('star', 21), ('like', 19), ('called', 18), ('information', 18), ('please', 17), ('expensive', 17), ('include', 16), ('north', 14), ('cheap', 14), ('town', 14), ('includes', 11), ('would', 11), ('4', 11), ('hello', 11), ('guest', 11), ('east', 10), ('hotels', 9), ('price', 8), ("'d", 8), ('3', 8), ('rating', 8), ('bed', 7), ('breakfast', 7), ('moderately', 7), ('priced', 7), ('trip', 7), ('!', 7), ("'s", 7), ('want', 6), ('yes', 6), ('stars', 6), ('b', 6), ('book', 6), ('trying', 6), ("n't", 6), ('warkworth', 6), ('moderate', 5), ('know', 5), ('range', 5), ('side', 5), ('could', 5), ('get', 5)]
[('.', 171), ('restaurant', 135), ('looking', 99), ('?', 73), ('centre', 59), ('food', 55), ("'m", 55), ('find', 54), ('place', 50), ('ex

In [13]:
# Cue words for each request type. get_type intersects each set with the
# utterance's words and compares the sizes of the two intersections.
hotel_words = {"hotel","hotels","stay","free","star","stars","guesthouse","bed","accommodation","accommodations"}
restaurant_words = {"restaurant", "food","eat","serves", "restaurants","indian","chinese","italian","asian"}

In [14]:
def get_type(words):
    """Classify an utterance as a hotel or a restaurant request.

    Args:
        words: set of tokens from the utterance (it is intersected with the
            module-level ``hotel_words`` and ``restaurant_words`` sets).

    Returns:
        "find_hotel" when strictly more hotel cue words than restaurant cue
        words are present; "find_restaurant" otherwise (ties included).
    """
    hotel_hits = hotel_words & words
    restaurant_hits = restaurant_words & words
    return "find_hotel" if len(hotel_hits) > len(restaurant_hits) else "find_restaurant"


In [15]:
from collections import defaultdict,Counter

def get_slot_vocab(slots,filename):
    """Collect the set of values seen for each requested slot in an answers file.

    Every line is split on "|"; the first field is skipped and each remaining
    field is read as ``slot=value``.

    Args:
        slots: container of slot names to collect values for.
        filename: path of the UTF-8 answers file to scan.

    Returns:
        A ``defaultdict(set)`` mapping each requested slot that occurs in the
        file to the set of its observed values.
    """
    slot_vocab = defaultdict(set)
    with open(filename,encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("|")[1:]
            # partition() splits on the first "=" only, so values that contain
            # "=" are kept whole, and fields without any "=" (for instance the
            # empty field left by a trailing "|") are skipped instead of
            # raising IndexError.
            parsed = (pair.partition("=") for pair in fields)
            # As with a plain dict, a slot repeated on one line keeps its last value.
            slot_dict = {slot: value for slot, sep, value in parsed if sep}
            for slot,value in slot_dict.items():
                if slot in slots:
                    slot_vocab[slot].add(value)
    return slot_vocab

        

In [16]:
# Display the values that occur in the test answers file for a selection of slots.
get_slot_vocab(["restaurant-food","restaurant-pricerange","restaurant-area","hotel-area","hotel-stars","hotel-type"],"data/WOZ_test_ans.txt")

defaultdict(set,
            {'hotel-area': {'centre', 'east', 'north', 'south', 'west'},
             'hotel-stars': {'0', '1', '2', '3', '4', '5'},
             'restaurant-food': {'afghan',
              'african',
              'afternoon tea',
              'asian',
              'asian oriental',
              'belgian',
              'brazilian',
              'british',
              'canapes',
              'caribbean',
              'catalan',
              'chinese',
              'corsica',
              'danish',
              'eastern european',
              'eritrean',
              'european',
              'french',
              'gastropub',
              'halal',
              'hungarian',
              'indian',
              'international',
              'italian',
              'jamaican',
              'mediterranean',
              'mexican',
              'modern european',
              'modern global',
              'molecular gastronomy',
              'no

In [17]:
# Cue words whose presence in a hotel request sets the hotel-internet and
# hotel-parking slots in the prediction cell.
internet_words = {"wifi","internet"}
parking_words = {"parking","park"}

In [18]:
# Order in which hotel slots are written into a predicted answer line.
hotel_slot_order = ["hotel-area","hotel-internet", "hotel-name", "hotel-parking","hotel-pricerange", "hotel-stars", "hotel-type"]

In [19]:
# Order in which restaurant slots are written into a predicted answer line.
restaurant_slot_order = ["restaurant-area","restaurant-food","restaurant-name","restaurant-pricerange"]

In [20]:
# Tokens that announce that a venue name follows.
name_indicators = {"called","named","call","name"}

def get_name(sentence):
    """Extract a venue name that follows an indicator word such as "called".

    The name is the run of purely alphabetic tokens directly after the
    indicator, with one leading "the" or "of" dropped.

    Args:
        sentence: sequence of (lower-cased) tokens.

    Returns:
        The name tokens joined by single spaces, or None when no indicator is
        followed by at least one name token.
    """
    for i, token in enumerate(sentence):
        if token not in name_indicators:
            continue
        start = i + 1
        end = start
        # Extend over the run of alphabetic tokens after the indicator.
        while end < len(sentence) and sentence[end].isalpha():
            end += 1
        # Only look at sentence[start] when the run is non-empty; this avoids
        # the IndexError raised when the indicator is the last token.
        if start < end and sentence[start] in ("the", "of"):
            start += 1
        if start < end:
            return " ".join(sentence[start:end])
        # Nothing usable after this indicator: keep scanning for a later one.
    return None

In [21]:
# Mode switch: True writes a predicted answer line for every test utterance to
# WOZ_test_predicted_ans.txt; False compares predictions with the dev answers
# and prints each mismatch together with the running accuracy.
predict = True

# Known values per slot, harvested from an annotated answers file.
# NOTE(review): the vocabulary is read from the *test* answers, i.e. the same
# split that is predicted when predict=True -- confirm this is intended.
slot_vocab = get_slot_vocab(["restaurant-food","restaurant-pricerange","restaurant-area","hotel-area","hotel-stars","hotel-type","hotel-pricerange"],"data/WOZ_test_ans.txt")

# Surface variants of canonical slot values, plus the reverse lookup.
alternate_forms = {"centre":["center"],"moderate":["moderately"], "0":["zero","0-star"],"1":["one","1-star"], "2":["two","2-star"], "3":["three","3-star"],"4":["four","4-star"],"5":["five","5-star"]}
rev_form_lookup = {}

for canonical, alts in alternate_forms.items():
    for alt in alts:
        rev_form_lookup[alt] = canonical

# Extend every slot vocabulary with the variants of the values it contains.
# Iterate over a copy because the set is updated inside the loop.
for slot,vocab in slot_vocab.items():
    for word in set(vocab):
        if word in alternate_forms:
            vocab.update(alternate_forms[word])

print(rev_form_lookup)
print(slot_vocab["hotel-stars"])


def _match_vocab_slots(words, slot_names):
    """Return {slot: canonical value} for each slot whose vocabulary overlaps words."""
    matched = {}
    for slot in slot_names:
        overlap = slot_vocab[slot] & words
        if overlap:
            # Map variants to their canonical form first, then take the
            # smallest candidate. Picking an arbitrary set element would make
            # the output depend on string hash randomisation between runs.
            matched[slot] = min(rev_form_lookup.get(value, value) for value in overlap)
    return matched


def _predict_answer(sent, words):
    """Build the predicted "intent|slot=value|..." line for one utterance.

    sent is the full lower-cased token list (used for name extraction);
    words is the set of its non-stopword tokens.
    """
    final_slots = {}
    if get_type(words) == "find_hotel":
        intent = "find_hotel"
        if words & internet_words:
            final_slots["hotel-internet"] = "yes"
        if words & parking_words:
            # A negation token anywhere in the utterance flips parking to "no".
            final_slots["hotel-parking"] = "no" if "n't" in words else "yes"
        if "guesthouse" in words:
            final_slots["hotel-type"] = "guesthouse"
        final_slots.update(_match_vocab_slots(words, ["hotel-area","hotel-stars","hotel-pricerange"]))
        name_slot = "hotel-name"
        slot_order = hotel_slot_order
    else:
        intent = "find_restaurant"
        final_slots.update(_match_vocab_slots(words, ["restaurant-area","restaurant-food","restaurant-pricerange"]))
        name_slot = "restaurant-name"
        slot_order = restaurant_slot_order

    # The name is added after canonicalisation so it is never rewritten.
    name = get_name(sent)
    if name:
        final_slots[name_slot] = name

    parts = [intent]
    parts.extend(slot+"="+final_slots[slot] for slot in slot_order if slot in final_slots)
    return "|".join(parts)


if predict:
    request_path = "data/WOZ_test_utt.txt"
    slots_path = "WOZ_test_predicted_ans.txt"
    slots_mode = "w"
else:
    request_path = "data/WOZ_dev_utt.txt"
    slots_path = "data/WOZ_dev_ans.txt"
    slots_mode = "r"
    correct = 0
    total = 0

# `with` guarantees both files are closed even when a line fails to process.
with open(request_path,encoding='utf-8') as f_request, \
        open(slots_path,slots_mode,encoding='utf-8') as f_slots:
    for line in f_request:
        if not predict:
            answer = f_slots.readline().strip()
        sent = [word.lower() for word in word_tokenize(line.strip())]
        words = set([word for word in sent if word not in stopwords])
        output = _predict_answer(sent, words)

        if predict:
            f_slots.write(output+"\n")
        else:
            if answer == output:
                correct += 1
            else:
                print(line.strip())
                print(output)
                print(answer)
                print(words)
            total += 1
            print(correct/total)
    

{'center': 'centre', 'moderately': 'moderate', 'zero': '0', '0-star': '0', 'one': '1', '1-star': '1', 'two': '2', '2-star': '2', 'three': '3', '3-star': '3', 'four': '4', '4-star': '4', 'five': '5', '5-star': '5'}
{'two', '0', 'three', 'four', '0-star', '5', '5-star', 'zero', 'one', '3-star', '2-star', '2', '1-star', '4-star', '1', 'five', '4', '3'}
