In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Module-level LabelEncoder instance.
# NOTE(review): `le` is not referenced in the cells reviewed here -- confirm it is still needed.
le = LabelEncoder()

In [2]:
# Directory containing the WOZ_<split>_utt.txt / WOZ_<split>_ans.txt input files
# (keep the trailing slash so a file name can be appended to it directly).
woz_directory = "data/"

In [33]:
def _parse_slot_fields(fields):
    """Turn ``["domain-slot=value", ...]`` into ``{"slot": "value", ...}``."""
    slots = {}
    for field in fields:
        # Split on the first "-" and the first "=" only, so that hyphens or
        # equals signs inside the value (e.g. "alpha-milton guest house")
        # are kept instead of being cut off.
        slot_and_value = field.split("-", 1)[1].split("=", 1)
        slots[slot_and_value[0]] = slot_and_value[1]
    return slots


def _intent_frame(intent, answers, slot_dicts, utterances):
    """Build the slot/utterance DataFrame for the rows annotated with ``intent``."""
    rows = [idx for idx, ans in enumerate(answers) if ans.split("|")[0] == intent]
    # One column per slot seen for this intent; slots absent from a row are "".
    frame = pd.DataFrame([slot_dicts[idx] for idx in rows]).fillna("")
    frame["utts"] = [utterances[idx] for idx in rows]
    return frame


def data_processor(split, input_dir=None, output_dir="./data/"):
    """Split the WOZ utterance/answer files of one data split into per-intent CSVs.

    Reads ``WOZ_{split}_utt.txt`` (one utterance per line) and
    ``WOZ_{split}_ans.txt`` (one ``intent|domain-slot=value|...`` annotation
    per line), keeps the ``find_hotel`` and ``find_restaurant`` rows and writes
    three files into ``output_dir``:

    * ``{split}_hotel.csv`` and ``{split}_restaurant.csv``: one column per
      slot (missing slots are empty strings) plus an ``utts`` column;
    * ``{split}.csv``: columns ``utts`` and ``labels``, hotel rows first,
      then restaurant rows.

    Parameters
    ----------
    split : str
        Split name substituted into the file names (e.g. ``"train"``).
    input_dir : str, optional
        Directory holding the two input files. Defaults to the module-level
        ``woz_directory``.
    output_dir : str, optional
        Directory the CSV files are written to; created if it does not exist.

    Raises
    ------
    ValueError
        If the utterance and answer files have a different number of lines.
    IndexError
        If a slot field lacks the ``-`` or ``=`` delimiter.
    """
    if input_dir is None:
        input_dir = woz_directory

    with open(os.path.join(input_dir, f"WOZ_{split}_utt.txt")) as utt_file:
        utterances = [line.strip() for line in utt_file]
    with open(os.path.join(input_dir, f"WOZ_{split}_ans.txt")) as ans_file:
        answers = [line.strip() for line in ans_file]

    # Rows are paired by position, so a length mismatch means mislabeled data.
    if len(utterances) != len(answers):
        raise ValueError(
            f"WOZ_{split}: {len(utterances)} utterances but {len(answers)} answers"
        )

    slot_dicts = [_parse_slot_fields(ans.split("|")[1:]) for ans in answers]

    hotel_df = _intent_frame("find_hotel", answers, slot_dicts, utterances)
    rest_df = _intent_frame("find_restaurant", answers, slot_dicts, utterances)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    hotel_df.to_csv(os.path.join(output_dir, f"{split}_hotel.csv"), index=False)
    rest_df.to_csv(os.path.join(output_dir, f"{split}_restaurant.csv"), index=False)

    # Intent-classification file: every kept utterance with its intent label.
    intent_df = pd.DataFrame(
        {
            "utts": list(hotel_df["utts"]) + list(rest_df["utts"]),
            "labels": ["find_hotel"] * len(hotel_df)
            + ["find_restaurant"] * len(rest_df),
        }
    )
    intent_df.to_csv(os.path.join(output_dir, f"{split}.csv"), index=False)

In [34]:
# Write train_hotel.csv, train_restaurant.csv and train.csv under ./data/ from the WOZ train files.
data_processor("train")

In [35]:
# Write dev_hotel.csv, dev_restaurant.csv and dev.csv under ./data/ from the WOZ dev files.
data_processor("dev")

In [52]:
def get_bies(sentence, tags, type_, curr_tags):
    """Tag the first contiguous occurrence of a slot value inside a sentence.

    Despite the name, only ``B-`` and ``I-`` prefixes are emitted.

    Parameters
    ----------
    sentence : sequence of str
        Tokenized sentence.
    tags : str
        Slot value; it is split on whitespace into the words to look for.
    type_ : str
        Slot name appended to the ``B-`` / ``I-`` prefix.
    curr_tags : list
        Current tag per token, same length as ``sentence``. Modified in place.

    Returns
    -------
    list
        ``curr_tags`` itself. The first token of the matched span becomes
        ``"B-" + type_`` and the remaining ones ``"I-" + type_``. When the
        value does not occur as a contiguous run of tokens (or is empty),
        the list is returned unchanged.
    """
    value_words = tags.split()
    width = len(value_words)
    if width == 0:
        return curr_tags

    # Require the whole value to appear as adjacent tokens: matching the words
    # one by one anywhere in the sentence would tag unrelated words and leave
    # a dangling "B-" tag behind whenever only a prefix of the value matches.
    for start in range(len(sentence) - width + 1):
        if list(sentence[start:start + width]) == value_words:
            curr_tags[start] = "B-" + type_
            for position in range(start + 1, start + width):
                curr_tags[position] = "I-" + type_
            break

    return curr_tags

def get_labels_dict(label):
    """Parse an annotation line into a ``{slot: value}`` dictionary.

    ``label`` has the form ``intent|domain-slot=value|domain-slot=value...``.
    The leading intent field is ignored; for every following field the text
    between the first ``-`` and the first ``=`` is the key and everything
    after that ``=`` is the value.

    Raises ``IndexError`` if a field lacks the ``-`` or ``=`` delimiter.
    """
    label_dict = {}
    for field in label.split("|")[1:]:
        # Split each delimiter once only, so values that themselves contain
        # "-" or "=" (e.g. "alpha-milton guest house") are not truncated.
        slot_and_value = field.split("-", 1)[1].split("=", 1)
        label_dict[slot_and_value[0]] = slot_and_value[1]
    return label_dict

def sentence2iob(sentence, label_dict):
    """Convert an utterance and its slot dict into parallel token/tag lists.

    The utterance is lower-cased and split on whitespace. Every string value
    in ``label_dict`` is passed to ``get_bies`` to tag the tokens. A ``[SEP]``
    token (tagged ``[SEP]``) and the six slot-name tokens are then appended;
    each slot-name token is tagged with that slot's value from ``label_dict``
    or ``"O"`` when the slot is absent.

    Returns the tuple ``(tokens, tags)``.
    """
    slot_names = ["area", "pricerange", "type", "internet", "parking", "stars"]

    tokens = sentence.lower().split()
    tags = ["O"] * len(tokens)
    for slot, slot_value in label_dict.items():
        if type(slot_value) is str:
            tags = get_bies(tokens, slot_value, slot, tags)

    # Suffix: separator followed by one position per known slot.
    tokens += ["[SEP]"] + slot_names
    tags += ["[SEP]"] + ["O"] * len(slot_names)

    for slot, slot_value in label_dict.items():
        if slot in slot_names:
            # Negative offset of this slot's position inside the suffix.
            tags[slot_names.index(slot) - len(slot_names)] = slot_value

    return tokens, tags

# Build the token / tag sequences (and the parsed slot dicts) for the train split.
# Read both files first so the handles are closed before any processing starts.
with open(woz_directory + "WOZ_train_utt.txt", "r") as f1, open(woz_directory + "WOZ_train_ans.txt", "r") as f2:
    sentences = f1.readlines()
    labels = f2.readlines()

# Utterances and annotations are paired by line number; zip() would silently
# drop the surplus lines of the longer file.
assert len(sentences) == len(labels), (
    f"{len(sentences)} utterances but {len(labels)} annotation lines"
)

all_tokens = []
all_tags = []
all_dicts = []
# `label` (singular) keeps the `labels` list intact instead of rebinding it
# to a single line on every iteration.
for sentence, label in zip(sentences, labels):
    sentence = sentence.strip()
    label_dict = get_labels_dict(label.strip())
    tokens, tags = sentence2iob(sentence, label_dict)
    all_tokens.append(tokens)
    all_tags.append(tags)
    all_dicts.append(label_dict)

In [51]:
# Show the tokens, tags and parsed slot dict of example `i`, one per line.
i = 0
print(all_tokens[i], all_tags[i], all_dicts[i], sep="\n")

['Guten', 'Tag,', 'I', 'am', 'staying', 'overnight', 'in', 'Cambridge', 'and', 'need', 'a', 'place', 'to', 'sleep.', 'I', 'need', 'free', 'parking', 'and', 'internet.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
{'area': 'centre', 'internet': 'yes', 'parking': 'yes'}


In [73]:
# Inspect the module-level `sentence` variable.
# NOTE(review): the recorded output is a single character rather than a full
# utterance -- probably stale kernel state; confirm or drop this cell.
sentence

'G'