# Task-Oriented Dialogue Dataset (Exploration)


---


**Practice Module: Practical Language Processing (PLP)**

**Group: 17**

Members:

`Lim Jun Ming`,

`Tadhg Kennedy`, 

`Gopan Ravikumar Girija`,

# 0. Initialization

## Imports

In [1]:
import os
import json
import pandas as pd
import numpy as np

In [2]:
basedir = os.getcwd()
print(basedir)

D:\Education (Local)\NUS-ISS Mtech IS\Course Materials\4. Practical Language Processing (PLP)\0. PLP Project\Code\Dataset


In [3]:
os.listdir(basedir)

['.ipynb_checkpoints',
 'FewShotWoZ',
 'IMDB_Data',
 'IMDB_Data_Extraction.ipynb',
 'MultiWoZ',
 'SGD',
 'Task_Oriented_Dialogue_Dataset.ipynb']

# 1. SGD Dataset

Source: https://github.com/google-research-datasets/dstc8-schema-guided-dialogue

## Load Dataset

In [4]:
# Data path
datadir1_train = os.path.join(basedir, 'SGD\\train')
datadir1_dev = os.path.join(basedir, 'SGD\\dev')
datadir1_test = os.path.join(basedir, 'SGD\\test')
print(datadir1_train)
print(datadir1_dev)
print(datadir1_test)


D:\Education (Local)\NUS-ISS Mtech IS\Course Materials\4. Practical Language Processing (PLP)\0. PLP Project\Code\Dataset\SGD\train
D:\Education (Local)\NUS-ISS Mtech IS\Course Materials\4. Practical Language Processing (PLP)\0. PLP Project\Code\Dataset\SGD\dev
D:\Education (Local)\NUS-ISS Mtech IS\Course Materials\4. Practical Language Processing (PLP)\0. PLP Project\Code\Dataset\SGD\test


In [5]:
# Functions to compile dataset and structure in DataFrame

def generate_data_df(listofpaths):
    diaID = []
    speaker = []
    text = []
    service = []
    intent = [] 
    slot = []
    slot_loc = []
    
    for path in listofpaths:
        file = open(path)
        raw_data = json.loads(file.read())
        file.close()

        for i in range(len(raw_data)): # For each dialogue
            dialog_id = raw_data[i]['dialogue_id']

            for j in range(len(raw_data[i]['turns'])): # For each conversation turn
                spkr = raw_data[i]['turns'][j]['speaker']
                uttr = raw_data[i]['turns'][j]['utterance']

                cur_service = []
                cur_intent = []
                cur_slot = []
                cur_slot_loc = []

                for k in range(len(raw_data[i]['turns'][j]['frames'])): # For each frames
                    cur_service.append(raw_data[i]['turns'][j]['frames'][k]['service'])

                    for l in range(len(raw_data[i]['turns'][j]['frames'][k]['actions'])): # For each action within each frame
                        cur_intent.append(raw_data[i]['turns'][j]['frames'][k]['actions'][l]['act'])

                    for m in range(len(raw_data[i]['turns'][j]['frames'][k]['slots'])): # For each slot within each frame
                        cur_slot.append(raw_data[i]['turns'][j]['frames'][k]['slots'][m]['slot'])
                        cur_slot_loc.append((raw_data[i]['turns'][j]['frames'][k]['slots'][m]['start'], raw_data[i]['turns'][j]['frames'][k]['slots'][m]['exclusive_end']))

                # Appending to main list
                diaID.append(dialog_id)
                speaker.append(spkr)
                text.append(uttr)
                service.append(cur_service)
                intent.append(cur_intent)
                slot.append(cur_slot)
                slot_loc.append(cur_slot_loc)


    # Forming DataFrame

    df = pd.DataFrame(np.array([diaID, speaker, text, service, intent, slot, slot_loc]).T, columns=['Dialogue_ID', 'Speaker', 'Text', 'Service', 'Intent', 'Slot', 'Slot_Location'])

    return df

In [6]:
# Generate SGD Dataset df

sgd_train_paths = [os.path.join(datadir1_train, file) for file in os.listdir(datadir1_train) if file != 'schema.json']
sgd_dev_paths = [os.path.join(datadir1_dev, file) for file in os.listdir(datadir1_dev) if file != 'schema.json']
sgd_test_paths = [os.path.join(datadir1_test, file) for file in os.listdir(datadir1_test) if file != 'schema.json']

sgd_train_df = generate_data_df(sgd_train_paths)
sgd_dev_df = generate_data_df(sgd_dev_paths)
sgd_test_df = generate_data_df(sgd_test_paths)

  df = pd.DataFrame(np.array([diaID, speaker, text, service, intent, slot, slot_loc]).T, columns=['Dialogue_ID', 'Speaker', 'Text', 'Service', 'Intent', 'Slot', 'Slot_Location'])


## Check Dataset

In [11]:
sgd_sys_df = pd.concat([sgd_train_df[sgd_train_df['Speaker']=='SYSTEM'],  sgd_dev_df[sgd_dev_df['Speaker']=='SYSTEM'], sgd_test_df[sgd_test_df['Speaker']=='SYSTEM']])


print('Number of utterances        :', len(sgd_sys_df))
# print('Number of USER utterances   :', (sgd_train_df['Speaker']=='USER').sum())
print('Number of SYSTEM utterances :', (sgd_sys_df['Speaker']=='SYSTEM').sum())
print('-' * 70)
print('All types of Service:')
service_list = np.array([x[:-2] for uttr in sgd_sys_df['Service'] for x in uttr])
print(len(np.unique(service_list)))
print(np.unique(service_list))
print('-' * 70)
print('All types of Intent:')
intent_list = np.array([x for uttr in sgd_sys_df['Intent'] for x in uttr])
print(len(np.unique(intent_list)))
print(np.unique(intent_list))
print('-' * 70)
print('All types of Slot:')
slot_list = np.array([x for uttr in sgd_sys_df['Slot'] for x in uttr])
print(len(np.unique(slot_list)))
print(np.unique(slot_list))
print('-' * 70)

sgd_sys_df.head()

Number of utterances        : 231642
Number of SYSTEM utterances : 231642
----------------------------------------------------------------------
All types of Service:
20
['Alarm' 'Banks' 'Buses' 'Calendar' 'Events' 'Flights' 'Homes' 'Hotels'
 'Media' 'Messaging' 'Movies' 'Music' 'Payment' 'RentalCars' 'Restaurants'
 'RideSharing' 'Services' 'Trains' 'Travel' 'Weather']
----------------------------------------------------------------------
All types of Intent:
10
['CONFIRM' 'GOODBYE' 'INFORM' 'INFORM_COUNT' 'NOTIFY_FAILURE'
 'NOTIFY_SUCCESS' 'OFFER' 'OFFER_INTENT' 'REQUEST' 'REQ_MORE']
----------------------------------------------------------------------
All types of Slot:
122
['account_balance' 'actors' 'address' 'address_of_location'
 'aggregate_rating' 'alarm_name' 'alarm_time' 'album' 'amount'
 'appointment_date' 'appointment_time' 'approximate_ride_duration'
 'artist' 'attraction_name' 'available_end_time' 'available_start_time'
 'average_rating' 'balance' 'car_name' 'cast' 'categ

Unnamed: 0,Dialogue_ID,Speaker,Text,Service,Intent,Slot,Slot_Location
1,1_00000,SYSTEM,Do you have a specific which you want the eati...,[Restaurants_1],[REQUEST],[],[]
3,1_00000,SYSTEM,"Is there a specific cuisine type you enjoy, su...",[Restaurants_1],[REQUEST],"[cuisine, cuisine]","[(52, 59), (61, 68)]"
5,1_00000,SYSTEM,I see that at 71 Saint Peter there is a good r...,[Restaurants_1],"[OFFER, OFFER]","[restaurant_name, city]","[(14, 28), (68, 76)]"
7,1_00000,SYSTEM,If you want to go to this restaurant you can f...,[Restaurants_1],[INFORM],[street_address],"[(56, 81)]"
9,1_00000,SYSTEM,If you want to phone them you can at 408-971-8...,[Restaurants_1],[INFORM],[phone_number],"[(37, 49)]"


In [8]:
# Check DataFrame
# Some Stats
a
print('Number of utterances        :', len(sgd_train_df))
print('Number of USER utterances   :', (sgd_train_df['Speaker']=='USER').sum())
print('Number of SYSTEM utterances :', (sgd_train_df['Speaker']=='SYSTEM').sum())
print('-' * 70)
print('All types of Service:')
service_list = np.array([x for uttr in sgd_train_df['Service'] for x in uttr])
print(np.unique(service_list))
print('-' * 70)
print('All types of Intent:')
intent_list = np.array([x for uttr in sgd_train_df['Intent'] for x in uttr])
print(np.unique(intent_list))
print('-' * 70)
print('All types of Slot:')
slot_list = np.array([x for uttr in sgd_train_df['Slot'] for x in uttr])
print(np.unique(slot_list))
print('-' * 70)

sgd_train_df.head()


Number of utterances        : 329964
Number of USER utterances   : 164982
Number of SYSTEM utterances : 164982
----------------------------------------------------------------------
All types of Service:
['Banks_1' 'Buses_1' 'Buses_2' 'Calendar_1' 'Events_1' 'Events_2'
 'Flights_1' 'Flights_2' 'Homes_1' 'Hotels_1' 'Hotels_2' 'Hotels_3'
 'Media_1' 'Movies_1' 'Music_1' 'Music_2' 'RentalCars_1' 'RentalCars_2'
 'Restaurants_1' 'RideSharing_1' 'RideSharing_2' 'Services_1' 'Services_2'
 'Services_3' 'Travel_1' 'Weather_1']
----------------------------------------------------------------------
All types of Intent:
['AFFIRM' 'AFFIRM_INTENT' 'CONFIRM' 'GOODBYE' 'INFORM' 'INFORM_COUNT'
 'INFORM_INTENT' 'NEGATE' 'NEGATE_INTENT' 'NOTIFY_FAILURE'
 'NOTIFY_SUCCESS' 'OFFER' 'OFFER_INTENT' 'REQUEST' 'REQUEST_ALTS'
 'REQ_MORE' 'SELECT' 'THANK_YOU']
----------------------------------------------------------------------
All types of Slot:
['address' 'address_of_location' 'album' 'amount' 'appointment_dat

Unnamed: 0,Dialogue_ID,Speaker,Text,Service,Intent,Slot,Slot_Location
0,1_00000,USER,I am feeling hungry so I would like to find a ...,[Restaurants_1],[INFORM_INTENT],[],[]
1,1_00000,SYSTEM,Do you have a specific which you want the eati...,[Restaurants_1],[REQUEST],[],[]
2,1_00000,USER,I would like for it to be in San Jose.,[Restaurants_1],[INFORM],[city],"[(29, 37)]"
3,1_00000,SYSTEM,"Is there a specific cuisine type you enjoy, su...",[Restaurants_1],[REQUEST],"[cuisine, cuisine]","[(52, 59), (61, 68)]"
4,1_00000,USER,I usually like eating the American type of food.,[Restaurants_1],[INFORM],[cuisine],"[(26, 34)]"


In [15]:
# Check DataFrame
# Some Stats

print('Number of utterances        :', len(sgd_dev_df))
print('Number of USER utterances   :', (sgd_dev_df['Speaker']=='USER').sum())
print('Number of SYSTEM utterances :', (sgd_dev_df['Speaker']=='USER').sum())
print('-' * 70)
print('All types of Service:')
service_list = np.array([x for uttr in sgd_dev_df['Service'] for x in uttr])
print(np.unique(service_list))
print('-' * 70)
print('All types of Intent:')
intent_list = np.array([x for uttr in sgd_dev_df['Intent'] for x in uttr])
print(np.unique(intent_list))
print('-' * 70)
print('All types of Slot:')
slot_list = np.array([x for uttr in sgd_dev_df['Slot'] for x in uttr])
print(np.unique(slot_list))
print('-' * 70)

sgd_dev_df.head()

Number of utterances        : 48726
Number of USER utterances   : 24363
Number of SYSTEM utterances : 24363
----------------------------------------------------------------------
All types of Service:
['Alarm_1' 'Banks_2' 'Buses_1' 'Events_1' 'Flights_3' 'Homes_1' 'Hotels_1'
 'Hotels_4' 'Media_2' 'Movies_2' 'Music_1' 'RentalCars_1' 'Restaurants_2'
 'RideSharing_1' 'Services_4' 'Travel_1' 'Weather_1']
----------------------------------------------------------------------
All types of Intent:
['AFFIRM' 'AFFIRM_INTENT' 'CONFIRM' 'GOODBYE' 'INFORM' 'INFORM_COUNT'
 'INFORM_INTENT' 'NEGATE' 'NEGATE_INTENT' 'NOTIFY_FAILURE'
 'NOTIFY_SUCCESS' 'OFFER' 'OFFER_INTENT' 'REQUEST' 'REQUEST_ALTS'
 'REQ_MORE' 'SELECT' 'THANK_YOU']
----------------------------------------------------------------------
All types of Slot:
['account_balance' 'actors' 'address' 'address_of_location'
 'aggregate_rating' 'alarm_name' 'alarm_time' 'album' 'appointment_date'
 'appointment_time' 'approximate_ride_duration' 'are

Unnamed: 0,Dialogue_ID,Speaker,Text,Service,Intent,Slot,Slot_Location
0,1_00000,USER,I want to make a restaurant reservation for 2 ...,[Restaurants_2],"[INFORM, INFORM, INFORM_INTENT]",[time],"[(56, 83)]"
1,1_00000,SYSTEM,What city do you want to dine in? Do you have ...,[Restaurants_2],"[REQUEST, REQUEST]",[],[]
2,1_00000,USER,Please find restaurants in San Jose. Can you t...,[Restaurants_2],"[INFORM, INFORM]","[location, restaurant_name]","[(27, 35), (49, 53)]"
3,1_00000,SYSTEM,Confirming: I will reserve a table for 2 peopl...,[Restaurants_2],"[CONFIRM, CONFIRM, CONFIRM, CONFIRM, CONFIRM]","[restaurant_name, location, time, date]","[(51, 55), (59, 67), (93, 101), (102, 107)]"
4,1_00000,USER,"Yes, thanks. What's their phone number?",[Restaurants_2],"[REQUEST, AFFIRM]",[],[]


In [16]:
# Check DataFrame
# Some Stats

print('Number of utterances        :', len(sgd_test_df))
print('Number of USER utterances   :', (sgd_test_df['Speaker']=='USER').sum())
print('Number of SYSTEM utterances :', (sgd_test_df['Speaker']=='USER').sum())
print('-' * 70)
print('All types of Service:')
service_list = np.array([x for uttr in sgd_test_df['Service'] for x in uttr])
print(np.unique(service_list))
print('-' * 70)
print('All types of Intent:')
intent_list = np.array([x for uttr in sgd_test_df['Intent'] for x in uttr])
print(np.unique(intent_list))
print('-' * 70)
print('All types of Slot:')
slot_list = np.array([x for uttr in sgd_test_df['Slot'] for x in uttr])
print(np.unique(slot_list))
print('-' * 70)

sgd_test_df.head()


Number of utterances        : 84594
Number of USER utterances   : 42297
Number of SYSTEM utterances : 42297
----------------------------------------------------------------------
All types of Service:
['Alarm_1' 'Buses_3' 'Events_3' 'Flights_4' 'Homes_2' 'Hotels_2'
 'Hotels_4' 'Media_3' 'Messaging_1' 'Movies_1' 'Movies_3' 'Music_3'
 'Payment_1' 'RentalCars_3' 'Restaurants_2' 'RideSharing_2' 'Services_1'
 'Services_4' 'Trains_1' 'Travel_1' 'Weather_1']
----------------------------------------------------------------------
All types of Intent:
['AFFIRM' 'AFFIRM_INTENT' 'CONFIRM' 'GOODBYE' 'INFORM' 'INFORM_COUNT'
 'INFORM_INTENT' 'NEGATE' 'NEGATE_INTENT' 'NOTIFY_FAILURE'
 'NOTIFY_SUCCESS' 'OFFER' 'OFFER_INTENT' 'REQUEST' 'REQUEST_ALTS'
 'REQ_MORE' 'SELECT' 'THANK_YOU']
----------------------------------------------------------------------
All types of Slot:
['address' 'alarm_name' 'alarm_time' 'album' 'amount' 'appointment_date'
 'appointment_time' 'area' 'artist' 'attraction_name' 'avera

Unnamed: 0,Dialogue_ID,Speaker,Text,Service,Intent,Slot,Slot_Location
0,1_00000,USER,"Hi, could you get me a restaurant booking on t...",[Restaurants_2],"[INFORM, INFORM_INTENT]",[date],"[(45, 52)]"
1,1_00000,SYSTEM,"Any preference on the restaurant, location and...",[Restaurants_2],"[REQUEST, REQUEST, REQUEST]",[],[]
2,1_00000,USER,Could you get me a reservation at P.f. Chang's...,[Restaurants_2],"[INFORM, INFORM, INFORM]","[restaurant_name, time, location]","[(34, 46), (66, 78), (50, 62)]"
3,1_00000,SYSTEM,Please confirm your reservation at P.f. Chang'...,[Restaurants_2],"[CONFIRM, CONFIRM, CONFIRM, CONFIRM, CONFIRM]","[restaurant_name, location, time, date]","[(35, 47), (51, 63), (67, 72), (82, 91)]"
4,1_00000,USER,"Sure, that is great.",[Restaurants_2],[AFFIRM],[],[]


## Preprocess to Code Form

In [17]:
## Code each dialogue utterance into code form which is a dictionary
## 'Domain' - specific domain or service for the utterance
## 'Acts' - a list of dictionary of actions frame where 
##          each action frame is a dictionary with keys 'action' and slots where
##                slots is a list of dictionaries of slot_name and corresponding values

def convert_to_data_code(data_paths, target_speaker):
    data_code = []

    for path in data_paths:
        file = open(path)
        data = json.loads(file.read())
        file.close()

        for i in range(len(data)): # For each dialogue
            dialogue = data[i]

            for j in range(len(dialogue['turns'])): # For each turn (utterance)
                
                if dialogue['turns'][j]['speaker'] == target_speaker:
                    utterance = dialogue['turns'][j]['utterance']

                    services = {}
                    for k in range(len(dialogue['turns'][j]['frames'])): # For each service found in utterance
                        service = dialogue['turns'][j]['frames'][k]['service'][:-2] # Normalizes service category Event_1 -> Event | Restaurant_2 -> Restaurant etc.

                        action_slots = {}

                        for l in range(len(dialogue['turns'][j]['frames'][k]['actions'])): # For each actions in a service
                            act = dialogue['turns'][j]['frames'][k]['actions'][l]['act']

                            if action_slots.get(act) is None:
                                if dialogue['turns'][j]['frames'][k]['actions'][l]['slot'] != "": # If slot name is not empty
                                    if dialogue['turns'][j]['frames'][k]['actions'][l]['values'] != []: # if slot value is not empty
                                        slot_value = [dialogue['turns'][j]['frames'][k]['actions'][l]['slot'], dialogue['turns'][j]['frames'][k]['actions'][l]['values']]
                                    else: # if slot value is empty
                                        slot_value = [dialogue['turns'][j]['frames'][k]['actions'][l]['slot']]
                                    action_slots[act] = [slot_value]
                                else: 
                                    action_slots[act] = None
                            else:
                                if dialogue['turns'][j]['frames'][k]['actions'][l]['slot'] != "": # If slot name is not empty
                                    if dialogue['turns'][j]['frames'][k]['actions'][l]['values'] != []: # if slot value is not empty
                                        slot_value = [dialogue['turns'][j]['frames'][k]['actions'][l]['slot'], dialogue['turns'][j]['frames'][k]['actions'][l]['values']]
                                    else: # if slot value is empty
                                        slot_value = [dialogue['turns'][j]['frames'][k]['actions'][l]['slot']]
                                    action_slots[act].append(slot_value)
                                else: 
                                    action_slots[act] = None

                        services[service] = action_slots

                    sample = (services, utterance) # Tuple of dictionary code and original utterance
                    data_code.append(sample)
                
    return data_code
                

In [42]:
if True and False:
    print(1)
else:
    print(0)

0


In [69]:
# Convert the dictionary data code to linear string code form:

request_slots = 'REQUEST'

def process_to_code(dict_code):
    processed_code = []
    for uttr in dict_code:
        to_process = uttr[0]

        service_code = ''

        for i in range(len(to_process.keys())):

            service = list(to_process.keys())[i]
            service_actions = to_process[service]
            
            # Append service to lienar string
            
            if i != 0:
                service_code += ' | '
            

            for j in range(len(service_actions.keys())):
                
                action = list(service_actions.keys())[j]
                slot_value_pairs = service_actions[action]
                
                # Append service to lienar string
                if j != 0:
                    service_code += ' | '
                
                service_code += service + ' '
                service_code += action

                if slot_value_pairs is not None:
                    # service_code += ' ( '
                
                    for k in range(len(slot_value_pairs)):
                        if k != 0:
                            service_code += ' ; '

                        slot_value = slot_value_pairs[k]

                        if len(slot_value) == 2:
                            service_code += slot_value[0] + ' = ' + ' , '.join(slot_value[1])
                        
                        elif len(slot_value) == 1 and action == request_slots:
                            service_code += slot_value[0] + ' = ' + '?'
                        
                        elif len(slot_value) == 1:
                            service_code += slot_value[0]

                    # service_code += ' )'

        service_code = service_code.strip()
        processed_code.append([service_code, uttr[1]]) # List of data code and original utterance
    return processed_code

                                     

In [70]:
# Process System Speaker Data
target_speaker = 'SYSTEM'

sgd_train_data_code_sys = convert_to_data_code(sgd_train_paths, target_speaker)
sgd_dev_data_code_sys = convert_to_data_code(sgd_dev_paths, target_speaker)
sgd_test_data_code_sys = convert_to_data_code(sgd_test_paths, target_speaker)

sgd_train_code_sys = process_to_code(sgd_train_data_code_sys)
sgd_dev_code_sys = process_to_code(sgd_dev_data_code_sys)
sgd_test_code_sys = process_to_code(sgd_test_data_code_sys)


In [62]:
# Process User Speaker Data
target_speaker = 'USER'

sgd_train_data_code_usr = convert_to_data_code(sgd_train_paths, target_speaker)
sgd_dev_data_code_usr = convert_to_data_code(sgd_dev_paths, target_speaker)
sgd_test_data_code_usr = convert_to_data_code(sgd_test_paths, target_speaker)

sgd_train_code_usr = process_to_code(sgd_train_data_code_usr)
sgd_dev_code_usr = process_to_code(sgd_dev_data_code_usr)
sgd_test_code_usr = process_to_code(sgd_test_data_code_usr)

In [24]:
# Samples 1
 
sample_id = 82798
print('-' * 70)
print('DataFrame Format:')
print('-' * 70)
print(sgd_train_df.iloc[sample_id, :])

print('-' * 70)
print('Dictionary Code Format:')
print('-' * 70)
print(sgd_train_data_code_usr[int(sample_id/2)])

print('-' * 70)
print('Data Code Format:')
print('-' * 70)
print(sgd_train_code_usr[int(sample_id/2)])


----------------------------------------------------------------------
DataFrame Format:
----------------------------------------------------------------------
Dialogue_ID                                               44_00007
Speaker                                                       USER
Text             Not yet - I need to get bus tickets, first. I ...
Service                                        [Events_2, Buses_2]
Intent           [NEGATE_INTENT, INFORM, INFORM, INFORM, INFORM...
Slot                                      [departure_time, origin]
Slot_Location                               [(106, 113), (95, 98)]
Name: 82798, dtype: object
----------------------------------------------------------------------
Dictionary Code Format:
----------------------------------------------------------------------
({'Events': {'NEGATE_INTENT': None}, 'Buses': {'INFORM': [['departure_time', ['6:30 pm']], ['origin', ['LAX']], ['fare_type', ['Economy extra']]], 'INFORM_INTENT': [['intent', ['

In [35]:
# Samples 2

sample_id = 45201
print('-' * 70)
print('DataFrame Format:')
print('-' * 70)
print(sgd_dev_df.iloc[sample_id, :])

print('-' * 70)
print('Dictionary Code Format:')
print('-' * 70)
print(sgd_dev_data_code_sys[int(sample_id/2)])

print('-' * 70)
print('Data Code Format:')
print('-' * 70)
print(sgd_dev_code_sys[int(sample_id/2)])

----------------------------------------------------------------------
DataFrame Format:
----------------------------------------------------------------------
Dialogue_ID                                               19_00075
Speaker                                                     SYSTEM
Text             confirm the next step: I'll book a visit to an...
Service                                                  [Homes_1]
Intent                                          [CONFIRM, CONFIRM]
Slot                                   [property_name, visit_date]
Slot_Location                                 [(60, 76), (80, 89)]
Name: 45201, dtype: object
----------------------------------------------------------------------
Dictionary Code Format:
----------------------------------------------------------------------
({'Homes': {'CONFIRM': [['property_name', ['casa pino condos']], ['visit_date', ['march 5th']]]}}, "confirm the next step: I'll book a visit to an apartment in casa pino condos o

In [31]:
# Samples 3

sample_id = 5202
print('-' * 70)
print('DataFrame Format:')
print('-' * 70)
print(sgd_test_df.iloc[sample_id, :])

print('-' * 70)
print('Dictionary Code Format:')
print('-' * 70)
print(sgd_test_data_code_usr[int(sample_id/2)])

print('-' * 70)
print('Data Code Format:')
print('-' * 70)
print(sgd_test_code_usr[int(sample_id/2)])

----------------------------------------------------------------------
DataFrame Format:
----------------------------------------------------------------------
Dialogue_ID                                                4_00041
Speaker                                                       USER
Text             Yes please, at 5 o"clock in the evening on 11t...
Service                                            [Restaurants_2]
Intent                             [AFFIRM_INTENT, INFORM, INFORM]
Slot                                                  [date, time]
Slot_Location                                 [(43, 61), (15, 39)]
Name: 5202, dtype: object
----------------------------------------------------------------------
Dictionary Code Format:
----------------------------------------------------------------------
({'Restaurants': {'AFFIRM_INTENT': None, 'INFORM': [['date', ['11th of this month']], ['time', ['5 o"clock in the evening']]]}}, 'Yes please, at 5 o"clock in the evening on 11th o

In [60]:
print(sgd_train_data_code_sys[0])
print(sgd_train_code_sys[0])
print('--')
process_to_code([sgd_train_data_code_sys[0]])

({'Restaurants': {'REQUEST': [['city']]}}, 'Do you have a specific which you want the eating place to be located at?')
['Restaurants REQUEST ( city = ? )', 'Do you have a specific which you want the eating place to be located at?']
--


[['Restaurants REQUEST ( city = ? )',
  'Do you have a specific which you want the eating place to be located at?']]

## Save Dataset in CSV

In [108]:
# Save datafram to csv

sgd_train_df_save = os.path.join(basedir, 'SGD\\SGD_Train.csv')
sgd_dev_df_save = os.path.join(basedir, 'SGD\\SGD_Dev.csv')
sgd_test_df_save = os.path.join(basedir, 'SGD\\SGD_Test.csv')

sgd_train_df.to_csv(sgd_train_df_save)
sgd_dev_df.to_csv(sgd_dev_df_save)
sgd_test_df.to_csv(sgd_test_df_save)

In [63]:
# Save data code to csv

sgd_train_code_df_sys = pd.DataFrame(sgd_train_code_sys, columns=['Code', 'Text'])
sgd_dev_code_df_sys = pd.DataFrame(sgd_dev_code_sys, columns=['Code', 'Text'])
sgd_test_code_df_sys = pd.DataFrame(sgd_test_code_sys, columns=['Code', 'Text'])

# sgd_train_df_code_sys_save = os.path.join(basedir, 'SGD\\SGD_Train_Code_SYS.csv')
# sgd_dev_df_code_sys_save = os.path.join(basedir, 'SGD\\SGD_Dev_Code_SYS.csv')
# sgd_test_df_code_sys_save = os.path.join(basedir, 'SGD\\SGD_Test_Code_SYS.csv')

sgd_train_df_code_sys_save = os.path.join(basedir, 'SGD\\SGD_Train_Code_SYS_2.csv')
sgd_dev_df_code_sys_save = os.path.join(basedir, 'SGD\\SGD_Dev_Code_SYS_2.csv')
sgd_test_df_code_sys_save = os.path.join(basedir, 'SGD\\SGD_Test_Code_SYS_2.csv')

sgd_train_code_df_sys.to_csv(sgd_train_df_code_sys_save)
sgd_dev_code_df_sys.to_csv(sgd_dev_df_code_sys_save)
sgd_test_code_df_sys.to_csv(sgd_test_df_code_sys_save)


In [128]:
# Number of cases

print(len(sgd_train_code_df_sys))
print(len(sgd_dev_code_df_sys))
print(len(sgd_test_code_df_sys))

164982
24363
42297
