# Loading the DSTC-2 Dataset

### Look into DSTC-2 and understand its structure

## Load Libraries

In [127]:
import os, json, re
from IPython.display import display, HTML
import pandas as pd

## Parameters

In [128]:
# Name of the dataset to load; passed to dataset_walker below, which looks for
# '<name>.flist' under <src_folder>/scripts/config.
dataset_name = "dstc2_dev"
# Folder that holds 'scripts/config/*.flist' and the 'data/' directory read by
# dataset_walker. Relative path: resolved against the notebook's working directory.
src_folder = "data/dstc2_traindev"

In [150]:
class dataset_walker(object):
    """Indexable, iterable collection of the sessions (calls) of one or more datasets.

    For every dataset name the file
    ``<src_folder>/scripts/config/<name>.flist`` is read; each non-blank line
    is a session id whose '/'-separated components name a directory below
    ``dataroot``. Indexing or iterating yields one ``Call`` per session id.

    Relies on the module-level ``src_folder`` variable and on the ``Call``
    class being defined in the notebook.

    Parameters
    ----------
    dataset : str or list of str
        A single dataset name, a list of names, or a JSON-encoded list of
        names (any string containing ``[`` is parsed with ``json.loads``).
    labels : bool
        When true, each ``Call`` is also given the session's ``label.json``.
    dataroot : str or None
        Directory containing the session directories. Defaults to
        ``<src_folder>/data``.

    Raises
    ------
    RuntimeError
        If the same session id is listed more than once.
    """

    def __init__(self,dataset,labels=False,dataroot=None):
        # Test for a real list first so the substring check below only ever
        # runs on strings.
        if isinstance(dataset, list):
            self.datasets = dataset
        elif "[" in dataset:
            self.datasets = json.loads(dataset)
        else:
            self.datasets = [dataset]
            # NOTE: as in the original, `self.dataset` exists only when a
            # single plain name was given.
            self.dataset = dataset

        # One .flist path per requested dataset (uses the requested name,
        # not the notebook-level `dataset_name`).
        self.dataset_session_lists = [
            os.path.join(src_folder, 'scripts', 'config', name + '.flist')
            for name in self.datasets
        ]

        self.labels = labels
        if dataroot is None:
            self.dataroot = os.path.join(src_folder,'data')
        else:
            self.dataroot = os.path.abspath(dataroot)

        # Load the list of calls, preserving file order.
        self.session_list = []
        seen = set()  # constant-time duplicate detection
        for flist_path in self.dataset_session_lists:
            with open(flist_path) as flist:
                for line in flist:
                    session_id = line.strip()
                    if not session_id:
                        # A blank (e.g. trailing) line is not a session.
                        continue
                    if session_id in seen:
                        raise RuntimeError('Call appears twice: %s' % (session_id))
                    seen.add(session_id)
                    self.session_list.append(session_id)

    def session_id_to_call(self, session_id):
        """Return the ``Call`` for ``session_id``.

        Raises ``RuntimeError`` when labels were requested but the session
        directory has no ``label.json``.
        """
        session_dirname = os.path.join(self.dataroot, *session_id.split('/'))
        applog_filename = os.path.join(session_dirname,'log.json')
        if self.labels:
            labels_filename = os.path.join(session_dirname,'label.json')
            if not os.path.exists(labels_filename):
                raise RuntimeError('Cant score : cant open labels file %s' % (labels_filename))
        else:
            labels_filename = None
        return Call(applog_filename,labels_filename)

    def __getitem__(self, i):
        """Return the ``Call`` for the i-th listed session."""
        return self.session_id_to_call(self.session_list[i])

    def __iter__(self):
        """Yield a ``Call`` for every listed session, in file order."""
        for session_id in self.session_list:
            yield self.session_id_to_call(session_id)

    def __len__(self, ):
        """Number of listed sessions."""
        return len(self.session_list)

In [151]:
class Call(object):
    """One session (call): its parsed ``log.json`` and, optionally, its labels.

    Parameters
    ----------
    applog_filename : str
        Path to the session's JSON log; parsed into ``self.log``.
    labels_filename : str or None
        Path to the session's JSON labels, parsed into ``self.labels``.
        When ``None`` no labels are loaded and ``self.labels`` is ``None``.

    Iterating yields one ``(log_turn, label_turn)`` pair per entry of
    ``log['turns']``; ``label_turn`` is ``None`` when no labels were loaded.
    """

    def __init__(self,applog_filename,labels_filename):
        self.applog_filename = applog_filename
        self.labels_filename = labels_filename
        # `with` closes the file even if parsing fails; JSON text is read as
        # UTF-8 explicitly rather than with the platform's locale encoding.
        with open(applog_filename, encoding='utf-8') as f:
            self.log = json.load(f)
        if labels_filename is not None:
            with open(labels_filename, encoding='utf-8') as f:
                self.labels = json.load(f)
        else:
            self.labels = None

    def __iter__(self):
        if self.labels_filename is not None:
            # zip stops at the shorter of the two turn lists.
            for (log,labels) in zip(self.log['turns'],self.labels['turns']):
                yield (log,labels)
        else:
            for log in self.log['turns']:
                yield (log,None)

    def __len__(self, ):
        """Number of turns recorded in the log."""
        return len(self.log['turns'])

In [186]:
def _show_heading(title):
    """Render ``title`` as an <h1> heading in the cell output."""
    display(HTML('<h1>' + title + '</h1>'))


# Build the walker for the configured dataset and report how many sessions it lists.
dataset = dataset_walker(dataset_name)
print("Total Datasets: %d" % len(dataset))

# Session-level view: one row per top-level key of the session's log.
_show_heading('Session')
log = dataset[1].log
table = pd.DataFrame.from_dict(log, orient='index')
display(table)

# Turn-level view: one row per turn of that session.
_show_heading('Turns')
turns = log["turns"]
display(pd.DataFrame(turns))

# Drill into the first turn: raw 'live' input, then the turn as a table.
_show_heading('Single Turn')
turn = turns[0]
print(turn['input']['live'])
display(pd.DataFrame(turn))


Total Datasets: 506


Unnamed: 0,0
session-id,voip-be5694f464-20130328_125233
session-date,2013-03-28
session-time,12:52:33
caller-id,be5694f464
turns,"[{'output': {'transcript': 'Hello , welcome to..."
system-specific,"{'dialog-manager': 1, 'acoustic-condition': 0}"


Unnamed: 0,input,output,turn-index
0,{'live': {'asr-hyps': [{'asr-hyp': 'and i want...,"{'transcript': 'Hello , welcome to the Cambrid...",0
1,{'live': {'asr-hyps': [{'asr-hyp': 'i want to ...,{'transcript': 'the missing sock is a nice pla...,1
2,{'live': {'asr-hyps': [{'asr-hyp': 'thank you ...,"{'transcript': 'Sure , the missing sock is on ...",2


{'asr-hyps': [{'asr-hyp': 'and i want to find a cheap restaurant in the east part of town', 'score': -1.053452}, {'asr-hyp': 'ok i want to find a cheap restaurant in the east part of town', 'score': -1.35857}, {'asr-hyp': 'and i want to find a restaurant in the east part of town', 'score': -3.378493}, {'asr-hyp': 'i i want to find a cheap restaurant in the east part of town', 'score': -3.525941}, {'asr-hyp': 'ok i want to find a restaurant in the east part of town', 'score': -3.683611}, {'asr-hyp': 'and and i want to find a cheap restaurant in the east part of town', 'score': -3.726887}, {'asr-hyp': 'and can i want to find a cheap restaurant in the east part of town', 'score': -3.873892}, {'asr-hyp': 'can i want to find a cheap restaurant in the east part of town', 'score': -3.91824}, {'asr-hyp': 'ok and i want to find a cheap restaurant in the east part of town', 'score': -4.032005}, {'asr-hyp': 'and i want to find an cheap restaurant in the east part of town', 'score': -4.060443}], '

Unnamed: 0,input,output,turn-index
aborted,,False,0
audio-file,pt344x_0000904_0001369.wav,,0
batch,"{'cnet': [{'start': 0.0, 'end': 0.0240625, 'ar...",,0
dialog-acts,,"[{'slots': [], 'act': 'welcomemsg'}]",0
end-time,13.69,9.04,0
live,{'asr-hyps': [{'asr-hyp': 'and i want to find ...,,0
start-time,9.04,0.0013,0
transcript,,"Hello , welcome to the Cambridge restaurant sy...",0
