[View in Colaboratory](https://colab.research.google.com/github/kpe/notebooks/blob/master/ms_cntk_atis_dataset_reader.ipynb)

# MS CNTK ATIS DataSet Downloader

Let's fetch the ATIS dataset from the MS CNTK repo on GitHub: https://github.com/Microsoft/CNTK/tree/master/Examples/LanguageUnderstanding/ATIS/Data

and convert the data into a Python-friendly (pickle and text) format.

In [0]:
!pip install tqdm

In [0]:
from collections import defaultdict

from tqdm import tqdm
import numpy as np

DATA_DIR='.model_data' # fetch location


## First download the raw files

In [0]:
import os
import urllib.request
from urllib.parse import urlparse, urljoin

# Root for raw file access to the CNTK GitHub repository, plus the two
# example sub-directories the ATIS files are fetched from.
ATIS_REPO_URL = "https://github.com/Microsoft/CNTK/raw/master"
ATIS_BASE_URL = ATIS_REPO_URL + "/Examples/LanguageUnderstanding/ATIS/Data/"
ATIS_EXTRA_BASE_URL = ATIS_REPO_URL + "/Examples/LanguageUnderstanding/ATIS/BrainScript/"

# Download manifest: base URL -> file names to fetch relative to that URL.
# Both base URLs end with '/', so the file names can be joined onto them.
ATIS_DS = {
    ATIS_BASE_URL: [
        'ATIS.label',              # labels - labels_count:127
        'ATIS.test.cntk.sparse',   # featurized
        'ATIS.train.cntk.sparse',  # featurized
        'ATIS.vocab',              # words - vocab_size: 944
        'atis.test.ctf',
        'atis.train.ctf',
    ],
    ATIS_EXTRA_BASE_URL: [
        'query.wl',
        'slots.wl',
        'intent.wl',
    ],
}

def fetch_ms_atis_ds():
    """Download every file listed in ``ATIS_DS`` into ``DATA_DIR``.

    Files that already exist locally are skipped. Each download is first
    written to a temporary ``<name>.part`` file and only moved to its final
    name once it has completed, so an interrupted download can no longer
    leave a truncated file behind that later runs would treat as cached.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on a failed download
        (the partial file is removed before the error propagates).
    """
    for base_url, fnames in ATIS_DS.items():
        for fname in fnames:
            url = urljoin(base_url, fname)
            fname = os.path.basename(urlparse(url).path)
            loc_path = os.path.join(DATA_DIR, fname)
            if os.path.isfile(loc_path):
                print("skip downloading: {}".format(fname))
                continue
            print("     downloading: {}".format(fname))
            target_dir = os.path.dirname(loc_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            part_path = loc_path + '.part'
            try:
                urllib.request.urlretrieve(url, part_path)
                # rename only after the whole file has arrived
                os.replace(part_path, loc_path)
            finally:
                # after a successful replace the .part file is gone already
                if os.path.exists(part_path):
                    os.remove(part_path)
            print("done downloading: {} bytes".format(os.path.getsize(loc_path)))
   
# Fetch all files listed in ATIS_DS into DATA_DIR (already present files are skipped).
fetch_ms_atis_ds()

## pip install the Python CNTK package

In [0]:
!pip install cntk

In [0]:
#
# you might need:
#   ln -s /usr/lib64/libmpi_cxx.so.20 /usr/local/lib64/libmpi_cxx.so.1
#   ln -s /usr/lib64/libmpi.so.20 /usr/lib64/libmpi.so.12
#
import cntk

def build_dicts():
    """Read the three word lists from ``DATA_DIR`` and build label -> index maps.

    ``query.wl``, ``slots.wl`` and ``intent.wl`` are read line by line; the
    index of an entry is its zero-based line number. If an entry occurs more
    than once, the last occurrence determines its index.

    Returns:
        A tuple ``(query_dict, slots_dict, intents_dict)`` of dicts mapping
        each line (without its trailing newline) to its line index.
    """
    def read_word_list(fname):
        # `with` closes the file handle; the original left all three open
        with open(os.path.join(DATA_DIR, fname)) as stream:
            return [line.rstrip('\n') for line in stream]

    query_wl   = read_word_list("query.wl")
    slots_wl   = read_word_list("slots.wl")
    intents_wl = read_word_list("intent.wl")

    query_dict   = {word: ndx for ndx, word in enumerate(query_wl)}
    slots_dict   = {word: ndx for ndx, word in enumerate(slots_wl)}
    intents_dict = {word: ndx for ndx, word in enumerate(intents_wl)}
    return query_dict, slots_dict, intents_dict

def create_ctf_reader(path):
    """Create a CNTK CTF deserializer for the ATIS file at ``path``.

    Three sparse streams are declared (``query``, ``intent_labels`` and
    ``slot_labels``, read from fields ``S0``, ``S1`` and ``S2``); their
    shapes are the sizes of the dictionaries returned by ``build_dicts()``.
    """
    query_dict, slots_dict, intents_dict = build_dicts()
    stream_defs = cntk.io.StreamDefs(
        query=cntk.io.StreamDef(field='S0', shape=len(query_dict), is_sparse=True),
        intent_labels=cntk.io.StreamDef(field='S1', shape=len(intents_dict), is_sparse=True),
        slot_labels=cntk.io.StreamDef(field='S2', shape=len(slots_dict), is_sparse=True),
    )
    return cntk.io.CTFDeserializer(path, stream_defs)



## Build the lookup maps

and do a minor sanity check

In [0]:
# Forward maps (label -> index) for query tokens, slot labels and intents.
qdict, sdict, idict = build_dicts()
# Inverse maps (index -> label), derived from the dicts just built instead of
# calling build_dicts() again, which would re-read all three word lists.
iqdict, isdict, iidict = [{ndx: label for label, ndx in d.items()}
                          for d in (qdict, sdict, idict)]

# check indexes go from 0..len(ndxs)
assert [len(d) for d in (qdict, sdict, idict)] == \
       [1 + max(inv) for inv in (iqdict, isdict, iidict)], \
       "dictionary indexes are not a dense 0..n-1 range"


## Read the train and test DataSet files
I had a hard time reading the CNTK data into anything, so be compassionate ...

In [0]:
def load_cntk_atis_ds(fname='atis.train.ctf'):
    """Read one ATIS CTF file from ``DATA_DIR`` into lists of index sequences.

    The file is read in a single, non-randomized sweep, requesting
    minibatches of size 1. For every stream the first sequence of the
    minibatch is converted to a dense array and reduced to indices with
    ``argmax`` over the last axis.

    Args:
        fname: name of the CTF file inside ``DATA_DIR``.

    Returns:
        A ``defaultdict(list)`` keyed by stream name (the keys of the
        deserializer's ``'input'`` section); each value holds one index
        array per minibatch read.
    """
    # DATA_DIR is the notebook's fetch location; the original referenced an
    # undefined lower-case `data_dir` here and failed with a NameError.
    cr = create_ctf_reader(os.path.join(DATA_DIR, fname))
    mbs = cntk.io.MinibatchSource(cr, randomize=False, max_sweeps=1)

    print('reading ATIS CTF file:',fname)
    input_vars = {}
    input_map = {}
    dims={}
    for key, val in cr['input'].items():
        var = cntk.ops.input_variable(val['dim'], int, name=key)
        input_map[key] = mbs.streams[key]
        input_vars[key] = var
        dims[key] = val['dim']
        print('{:>15}: {}: {:3d} dims'.format(key, val['alias'], val['dim']))

    ds=defaultdict(list)

    count=0
    with tqdm(desc='reading ATIS {}'.format(fname)) as pbar:
        while True:
            mb = mbs.next_minibatch(1,input_map=input_map)
            # an empty minibatch marks the end of the single sweep
            if len(mb) == 0:
                # the file name is now part of the message (it used to be
                # passed to format() without a placeholder)
                print("Found {} samples in DS: {}".format(count, fname))
                break
            count += 1
            # tqdm.update() takes an increment, not the running total
            pbar.update(1)

            for name,var in input_vars.items():
                val = mb[name].as_sequences(var)[0].toarray()
                # collapse the last axis to the index of its largest entry
                val = val.argmax(axis=-1)
                ds[name].append(val)

    for k,v in ds.items():
        print('{:>13}: {:4d}: {:3d} max seq len | {:3d} dims'.format(k, len(v), max(map(len,v)), dims[k]))

    return ds

In [0]:
# Bundle the three label -> index maps under the keys used when storing/loading.
dicts = {
    'token_ids': qdict,
    'slot_ids': sdict,
    'intent_ids': idict,
}
train_ds = load_cntk_atis_ds('atis.train.ctf')
test_ds = load_cntk_atis_ds('atis.test.ctf')

# check dict sizes
print('dicts', [len(d) for d in dicts.values()])


## Store as pickle

In [0]:
import gzip, pickle

def store_ds(ds, dicts, fname='ms_cntk_atis.train.pkl.gz'):
    """Pickle ``(ds, dicts)`` as one tuple into a gzip file inside ``DATA_DIR``.

    Args:
        ds: the data set to store.
        dicts: the lookup dictionaries stored alongside it.
        fname: file name (relative to ``DATA_DIR``) of the gzipped pickle.
    """
    target_path = os.path.join(DATA_DIR, fname)
    payload = (ds, dicts)
    with gzip.open(target_path, 'wb') as out:
        pickle.dump(payload, out, protocol=pickle.HIGHEST_PROTOCOL)
    print('Done dumping: ', fname)

def load_ds(fname='ms_cntk_atis.train.pkl.gz'):
    """Load a ``(ds, dicts)`` tuple from a gzipped pickle in ``DATA_DIR``.

    Prints the number of samples and the sizes of the three dictionaries.

    NOTE: unpickling can execute arbitrary code - only load files you
    produced yourself or otherwise trust.

    Args:
        fname: file name (relative to ``DATA_DIR``) of the gzipped pickle.

    Returns:
        The ``(ds, dicts)`` tuple read from the file.
    """
    source_path = os.path.join(DATA_DIR, fname)
    with gzip.open(source_path, 'rb') as stream:
        ds, dicts = pickle.load(stream)

    print('Done  loading: ', fname)
    n_samples = len(ds['query'])
    print('      samples: {:4d}'.format(n_samples))
    n_tokens = len(dicts['token_ids'])
    print('   vocab_size: {:4d}'.format(n_tokens))
    n_slots = len(dicts['slot_ids'])
    print('   slot count: {:4d}'.format(n_slots))
    n_intents = len(dicts['intent_ids'])
    print(' intent count: {:4d}'.format(n_intents))
    return ds, dicts

# Write one gzipped pickle per split; the same `dicts` object is stored in both files.
store_ds(train_ds, dicts, 'ms_cntk_atis.train.pkl.gz')
store_ds(test_ds,  dicts, 'ms_cntk_atis.test.pkl.gz')

## Store as text

In [0]:
def store_ds_dicts_to_csvs(dicts, fname='ms_cntk_atis.dict.%.csv.gz'):
    """Write the three dictionaries as gzipped text files, one label per line.

    Every ``%`` in ``fname`` is replaced by ``vocab``, ``slots`` and
    ``intent`` respectively. Line ``i`` of a file holds the label whose
    index is ``i``.

    Args:
        dicts: mapping with the keys ``token_ids``, ``slot_ids`` and
            ``intent_ids``, each a label -> index dict.
        fname: output path pattern containing ``%`` as placeholder.
    """
    def invert(label_to_ndx):
        return {ndx: label for label, ndx in label_to_ndx.items()}

    def write_lines(path, labels):
        with gzip.open(path, 'wt') as out:
            for label in labels:
                out.write('{}\n'.format(label))
        print('done storing:', path)

    # invert all three maps up front, before any file is written
    inverted = [invert(dicts.get(key)) for key in ('token_ids', 'slot_ids', 'intent_ids')]

    for tag, ndx_to_label in zip(('vocab', 'slots', 'intent'), inverted):
        ordered = (ndx_to_label.get(ndx) for ndx in range(len(ndx_to_label)))
        write_lines(fname.replace('%', tag), ordered)
    
def store_ds_to_csvs(ds, fname='ms_cntk_atis.%.train.csv.gz'):
    """Write a data set as three gzipped text files, one sample per line.

    Every ``%`` in ``fname`` is replaced by ``query``, ``slots`` and
    ``intent`` respectively. Query and slot lines hold the space-separated
    ids of the sample; an intent line holds the first intent id only.

    Args:
        ds: mapping with the keys ``query``, ``intent_labels`` and
            ``slot_labels``, each a list with one id sequence per sample.
        fname: output path pattern containing ``%`` as placeholder.
    """
    queries = ds.get('query')
    intents = ds.get('intent_labels')
    slot_seqs = ds.get('slot_labels')

    def join_ids(ids):
        return ' '.join(str(one_id) for one_id in ids) + '\n'

    renderers = (
        ('query',  lambda ndx: join_ids(queries[ndx])),
        ('slots',  lambda ndx: join_ids(slot_seqs[ndx])),
        ('intent', lambda ndx: '{}\n'.format(intents[ndx][0])),
    )

    for tag, render in renderers:
        path = fname.replace('%', tag)
        with gzip.open(path, 'wt') as out:
            # the number of rows is always taken from the query stream
            for ndx in range(len(ds['query'])):
                out.write(render(ndx))
        print('done storing:', path)
        
def store_ds_and_dicts_to_csv(ds, dicts,
                              fname='ms_cntk_atis.train.%.csv.gz',
                              dname='ms_cntk_atis.dict.%.csv.gz'):
    """Export a data set and its dictionaries as gzipped text files.

    Args:
        ds: data set passed on to ``store_ds_to_csvs``.
        dicts: dictionaries passed on to ``store_ds_dicts_to_csvs``.
        fname: path pattern (``%`` placeholder) for the data set files.
        dname: path pattern (``%`` placeholder) for the dictionary files.
    """
    # samples first, dictionaries second
    store_ds_to_csvs(ds, fname)
    store_ds_dicts_to_csvs(dicts, dname)



# Export both splits as text. Both calls use the default `dname`, so the
# dictionary files are written by the first call and rewritten by the second.
store_ds_and_dicts_to_csv(train_ds, dicts, 'ms_cntk_atis.train.%.csv.gz')
store_ds_and_dicts_to_csv(test_ds,  dicts, 'ms_cntk_atis.test.%.csv.gz')
   

### Uninstall CNTK

In [0]:
!pip uninstall -y cntk

## Read the pickle

In [0]:
                
# Reload both splits from the gzipped pickles in DATA_DIR; `dicts` is rebound
# by each call and ends up being the copy stored in the test pickle.
train_ds, dicts = load_ds('ms_cntk_atis.train.pkl.gz')
test_ds, dicts  = load_ds('ms_cntk_atis.test.pkl.gz')

## Show head samples

In [0]:
# Forward maps (label -> id) as stored in the pickle.
t2i = dicts.get('token_ids')
s2i = dicts.get('slot_ids')
in2i = dicts.get('intent_ids')
# Inverse maps (id -> label) used to turn the stored ids back into text.
i2t = {ndx: label for label, ndx in t2i.items()}
i2s = {ndx: label for label, ndx in s2i.items()}
i2in = {ndx: label for label, ndx in in2i.items()}
query = train_ds.get('query')
slots = train_ds.get('slot_labels')
intent = train_ds.get('intent_labels')

# Print the first five training samples: a header line with the intent and
# the full query text, then one line per token with its slot label.
for i in range(5):
    print('{:4d}:{:>15}: {}'.format(
        i,
        i2in[intent[i][0]],
        ' '.join(i2t.get(token_id) for token_id in query[i])))
    for j in range(len(query[i])):
        print('{:>33} {:>40}'.format(i2t[query[i][j]], i2s[slots[i][j]]))
    print('*' * 74)
                                