In [1]:
from __future__ import absolute_import

import os
import json
import re
import logging
import zipfile

import requests

from utils.nlp_utils import profanity_check

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ishalyminov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
DSTC1_FOLDER = 'data/dstc1'
DSTC1_URL = 'https://www.dl.dropboxusercontent.com/s/mlqqtdxunk2rlwm/dstc1.zip?dl=0'
if not os.path.exists(DSTC1_FOLDER):
    r = requests.get(DSTC1_URL)
    with open('dstc1.zip', 'wb') as dstc1_out:
        dstc1_out.write(r.content)
    with zipfile.ZipFile('dstc1.zip') as zip_ref:
        zip_ref.extractall('data')

In [4]:
os.listdir(DSTC1_FOLDER)

['.DS_Store',
 'dstc_data_test3',
 'dstc_data_train3_v00',
 'test1',
 'test2',
 'test4',
 'train1a',
 'train1b',
 'train1c',
 'train2']

In [5]:
dstc1_utterances = set([])
dstc1_start_utterances = set([])
for root, dirs, files in os.walk(DSTC1_FOLDER):
    for filename in files:
        if re.match('.+\.labels\.json$', filename):
            with open(os.path.join(root, filename)) as session_in:
                session_json = json.load(session_in)
                turns = session_json.get('turns', [])
                if len(turns) and turns[0]['transcription-status'] == 'transcribed':
                    dstc1_start_utterances.add(turns[0]['transcription'])
                dstc1_utterances.update([turn['transcription']
                                         for turn in turns
                                         if turn.get('transcription-status') == 'transcribed'])
                
dstc1_utterances = [utt for utt in dstc1_utterances if utt]
dstc1_start_utterances = [utt for utt in dstc1_start_utterances if utt]

In [6]:
dstc1_longer_start_utterances = [utt for utt in dstc1_start_utterances if 9 < len(utt.split())]
print(len(dstc1_longer_start_utterances))
print('\n'.join(dstc1_longer_start_utterances[:50]))

968
sixty one c leaving second and grand going downtown at ten oclock
ah i would like to know how to make paytence
can you tell me what time the next fifty one c is leaving downtown pittsburgh
i need the bus schedule for the sixty one the ebs
im trying to find out a bus i can take from downtown to go to kennywood park
i want to know what time the sixty one c leaves from downtown pittsburg going toward
wood penn hills from forbes and murray to carnegie mellon
goodbye schedule for 59a leaving swissvale forest hills north versailles
yes i need to know when the next sixty one a is leaving dynamo way going to swissvale station
can you tell me what time the last fifty five m comes
where is the next bus from squirrel hell to carnegie mellon university
know what time the sixty one c come to duquesne and go to mckeesport
id like to know about the sixty one for the morning hours %
i wanted information on bus scheduling for sixty one c please
when is there a twenty eight s at eleven oclock pm fro

In [7]:
with open('data/dstc1_ood.txt', 'w') as ood_out:
    for utterance in dstc1_longer_start_utterances:
        print(utterance, file=ood_out)

Now, get the Maluuba Frames dataset: https://www.microsoft.com/en-us/research/project/frames-dataset/

In [8]:
with open('data/Frames-dataset/frames.json') as frames_in:
    frames_data = json.load(frames_in)

In [9]:
frames_start_utterances = set([dialogue['turns'][0]['text'] for dialogue in frames_data])

In [10]:
frames_longer_start_utterances = [utt.replace('\n', ' ') for utt in frames_start_utterances if 5 < len(utt.split())]
print(len(frames_longer_start_utterances))
print('\n'.join(frames_longer_start_utterances[:50]))

1198
We have decided to go visit my sister in Sao Paulo. What do you have out of Santa Cruz?
I need to go to Buenos Aires.
I'm looking to book trip for spring break for 4 people. I have a tight budget of 4600
I need 2 get 2 Madrid from Beijing! Family emergency. Need to leave on the 6th
PUebla to Manas for 2 adults please!
Say, With just 1900 dollars and little more than a dream, could I make it from Seattle to North Vancouver?
Hello you may know me as the best Pokemon Hunter in the world
back for another crazy trip here
I need to find a vacation between today and September 14th from Punta Cana to San Francisco
Hello. I'd like to leave from Tel Aviv And go to Puebla
We have 6 adults looking to book a trip
ok the dreamer is back at it. this time i'd like to hear about trips leaving denver such as mexico city?
Hi I need a break from Gotham City, the crime rate is just too high. I'd like to go to Hogsmead with 3 kids and 9 adults.
Hi there. I have been tasked with bringing my big beautifu

In [11]:
with open('data/frames_ood.txt', 'w') as frames_out:
    for utterance in frames_longer_start_utterances:
        print(utterance, file=frames_out)