In [39]:
from __future__ import absolute_import

import os
import json
import re
import logging

from utils.nlp_utils import profanity_check

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [40]:
DSTC1_FOLDER = '/home/t-igshal/data/dstc1'

In [41]:
os.listdir(DSTC1_FOLDER)

['test4',
 'dstc_data_test3',
 'Dialog state tracking challenge handbook V21.pdf',
 'test2',
 'train2',
 'train1c',
 'train1a',
 'test1',
 'train1b',
 'dstc_data_train3_v00']

In [42]:
dstc1_utterances = set([])
dstc1_start_utterances = set([])
for root, dirs, files in os.walk(DSTC1_FOLDER):
    for filename in files:
        if re.match('.+\.labels\.json$', filename):
            with open(os.path.join(root, filename)) as session_in:
                session_json = json.load(session_in)
                turns = session_json.get('turns', [])
                if len(turns) and turns[0]['transcription-status'] == 'transcribed':
                    dstc1_start_utterances.add(turns[0]['transcription'])
                dstc1_utterances.update([turn['transcription']
                                         for turn in turns
                                         if turn.get('transcription-status') == 'transcribed'])
                
dstc1_utterances = [utt for utt in dstc1_utterances if utt]
dstc1_start_utterances = [utt for utt in dstc1_start_utterances if utt]

In [43]:
dstc1_longer_start_utterances = [utt for utt in dstc1_start_utterances if 9 < len(utt.split())]
print(len(dstc1_longer_start_utterances))
print('\n'.join(dstc1_longer_start_utterances[:50]))

968
when is the next sixty one b from oakland to wilkinsburg
what time does the next sixty one c leave from downtown
the sixty one c from homestead what hours does it leave
i need to know what time the sixty one a comes to swissvale
i want to know what time the sixty one c stops running
can i have the schedule for the fifty four c the next bus
sixty one c i think she might have missed the last one
/background/ i would like to know when the next 61a:b is coming to Kelly:p and Pitt:p
i need a bus from downtown sixty one a to shadyside hospital
arlington road to carnegie mellon university from fourth in murray
carnegie yes 61c moroeville front of transportation mckeesport to grace cambridge b street in whitehall
the next fifty nine a leaving the mall going to west mifflin
p a neville in 54c leaving at south front of stead
can you tell me when the next sixty one c from oakland to mckeesport %
could tell me when the fifty four c bus comes on bosman in knox
i know i want to know what time fo

In [45]:
with open('data/dstc1_ood.txt', 'w') as ood_out:
    for utterance in dstc1_longer_start_utterances:
        print(utterance, file=ood_out)

In [46]:
with open('data/Frames-dataset/frames.json') as frames_in:
    frames_data = json.load(frames_in)

In [49]:
frames_start_utterances = set([dialogue['turns'][0]['text'] for dialogue in frames_data])

In [52]:
frames_longer_start_utterances = [utt.replace('\n', ' ') for utt in frames_start_utterances if 5 < len(utt.split())]
print(len(frames_longer_start_utterances))
print('\n'.join(frames_longer_start_utterances[:50]))

1198
from 22nd-28nd of september i just wanna flee town and forget about work, family, my mortgage, responsibilities, my alcoholism, etc.. i am thinking Ciudad Juarez from Dalle
Hi im from neverland! and i want to go to Mos eisley
Hi i need a 5 day get away!
Hi there, I'd like to book a trip to Punta Cana.
Can you help me book a trip to Mexico City? We'll be 4 adults and 6 kids from Long Baech.
OMG. I AM LATE AT WORK DREAMING ABOUT A TRIP. I LOVE TO DREAM REAL JOHN LENNON FAN HERE SEND ME TO MILAN I GUESS I AM GETTING THE HECK OUT OF BRASILIA
Hi there! I'm about to jet out to my first ever internship I need to make sure I get this booking exactly right Can you get me to Rome?
I would like to take my fam jam on a vacation to Boston from Tijuana.
Please check if there is a flight to Naples from Fort Lauderdale leaving August 31
Looking to take my squad out to Punta Cana! I’ll pay whatever it takes to fly out of Tel Aviv
my 4 friends and i want to get away
Hello there! I have 12 days off 

In [53]:
with open('data/frames_ood.txt', 'w') as frames_out:
    for utterance in frames_longer_start_utterances:
        print(utterance, file=frames_out)