In [1]:
import os
import json

In [2]:
# Clone https://github.com/sonos/nlu-benchmark to the same level as nlu folder
DATA_PATH = "../nlu-benchmark/2017-06-custom-intent-engines"
DEFAULT_SLOT_LABEL = "unspecified"

In [3]:
cls_label_set = set()
weather_seq_label_set = set()

cls_train_input = []
cls_train_label = []
weather_seq_train_input = []
weather_seq_train_label = []

cls_val_input = []
cls_val_label = []
weather_seq_val_input = []
weather_seq_val_label = []


def read_data(subdir, domain, dataset):
    cls_input, cls_label, seq_input, seq_label = [],[],[],[]
    with open(os.path.join(subdir, "train_{}.json".format(domain))) as domain_file:
        domain_data = json.load(domain_file)
        domain_items = domain_data[domain]
        for item in domain_items:
            sentence_tokens = []
            sentence_labels = []
            for part in item['data']:
                tokens = part['text'].strip().split()
                sentence_tokens += tokens
                for token in tokens:
                    if 'entity' in part:
                        sentence_labels.append(part['entity'])
                    else:
                        sentence_labels.append(DEFAULT_SLOT_LABEL)
            cls_input.append(sentence_tokens)
            cls_label.append(domain)
            
            assert len(sentence_tokens) == len(sentence_labels)
            seq_input.append(sentence_tokens)
            seq_label.append(sentence_labels)
    print("Domain:{} Size:{}".format(domain, len(cls_input)))
    return cls_input, cls_label, seq_input, seq_label
                
for domain in os.listdir(DATA_PATH):
    subdir = os.path.join(DATA_PATH, domain)
    if os.path.isdir(subdir):
        cls_label_set.add(domain)
        cls_input, cls_label, seq_train_input, seq_train_label = \
        read_data(subdir, domain, 'train')
        cls_train_input += cls_input
        cls_train_label += cls_label
        cls_input, cls_label, seq_val_input, seq_val_label = \
        read_data(subdir, domain, 'validate')
        cls_val_input += cls_input
        cls_val_label += cls_label
        if domain == 'GetWeather':
            weather_seq_train_input = seq_train_input
            weather_seq_train_label = seq_train_label
            weather_seq_val_input = seq_val_input
            weather_seq_val_label = seq_val_label
            
            for seq_label in weather_seq_train_label:
                weather_seq_label_set |= set(seq_label)
            
            for seq_label in weather_seq_val_label:
                weather_seq_label_set |= set(seq_label)

Domain:PlayMusic Size:300
Domain:PlayMusic Size:300
Domain:RateBook Size:300
Domain:RateBook Size:300
Domain:SearchCreativeWork Size:300
Domain:SearchCreativeWork Size:300
Domain:GetWeather Size:300
Domain:GetWeather Size:300
Domain:BookRestaurant Size:300
Domain:BookRestaurant Size:300
Domain:AddToPlaylist Size:300
Domain:AddToPlaylist Size:300
Domain:SearchScreeningEvent Size:300
Domain:SearchScreeningEvent Size:300


In [4]:
print(' '.join(cls_train_input[0]), '--->' , cls_train_label[0])
for token, label in zip(weather_seq_train_input[0], weather_seq_train_label[0]):
    print(token, '--->', label)

I'd like to hear music that's popular from Trick-trick on the Slacker service ---> PlayMusic
How's ---> unspecified
the ---> unspecified
weather ---> unspecified
in ---> unspecified
Munchique ---> geographic_poi
National ---> geographic_poi
Natural ---> geographic_poi
Park ---> geographic_poi


In [5]:
print(cls_label_set)
print(weather_seq_label_set)

{'BookRestaurant', 'AddToPlaylist', 'PlayMusic', 'RateBook', 'GetWeather', 'SearchCreativeWork', 'SearchScreeningEvent'}
{'spatial_relation', 'geographic_poi', 'state', 'current_location', 'condition_description', 'condition_temperature', 'city', 'country', 'timeRange', 'unspecified'}


In [6]:
import keras
import numpy as np

def vectorization(train_x, train_y, validate_x, validate_y, label_list, task="classification"):
    word2ind = {}
    ind2word = {}
    specialtokens = ['<pad>','<unk>'] 
    
    def addword(word2ind, ind2word, word):
        if word in word2ind:
            return
        ind2word[len(word2ind)] = word
        word2ind[word] = len(word2ind)

    for token in specialtokens:
        addword(word2ind, ind2word, token)

    for sent in train_x:
        for word in sent:
            addword(word2ind, ind2word, word)

    train_x_ids = []
    for sent in train_x:
        indsent = [word2ind.get(i, word2ind['<unk>']) for i in sent]
        train_x_ids.append(indsent)

    train_x_ids = np.array(train_x_ids, dtype=object)

    validate_x_ids = []
    for sent in validate_x:
        indsent = [word2ind.get(i, word2ind['<unk>']) for i in sent]
        validate_x_ids.append(indsent)

    validate_x_ids = np.array(validate_x_ids, dtype=object)
    train_x_ids = keras.preprocessing.sequence.pad_sequences(train_x_ids, maxlen=64, padding='post',value=word2ind['<pad>'])
    validate_x_ids = keras.preprocessing.sequence.pad_sequences(validate_x_ids, maxlen=64, padding='post',value=word2ind['<pad>'])
    
    train_y_ids = []
    validate_y_ids = []
    if task == "classification":
        for label in train_y:
            train_y_ids.append(label_list.index(label))
        for label in validate_y:
            validate_y_ids.append(label_list.index(label))
        train_y_ids = np.array(train_y_ids)
        validate_y_ids = np.array(validate_y_ids)
    elif task == "slot_labeling":
        for seq_labels in train_y:
            seq_label_ids = []
            for label in seq_labels:
                seq_label_ids.append(label_list.index(label))
            train_y_ids.append(seq_label_ids)
        for seq_labels in validate_y:
            seq_label_ids = []
            for label in seq_labels:
                seq_label_ids.append(label_list.index(label))
            validate_y_ids.append(seq_label_ids)
        train_y_ids = keras.preprocessing.sequence.pad_sequences(train_y_ids, maxlen=64, padding='post',value=label_list.index('unspecified'))            
        validate_y_ids = keras.preprocessing.sequence.pad_sequences(validate_y_ids, maxlen=64, padding='post',value=label_list.index('unspecified'))            
    
    return word2ind, train_x_ids, train_y_ids, validate_x_ids, validate_y_ids

In [7]:
cls_train_label[0:5]

['PlayMusic', 'PlayMusic', 'PlayMusic', 'PlayMusic', 'PlayMusic']

In [8]:
cls_label_list = list(cls_label_set)
cls_word2ind, cls_train_x_ids, cls_train_y_ids, \
    cls_validate_x_ids, cls_validate_y_ids = \
                vectorization(cls_train_input, cls_train_label, \
                      cls_val_input, cls_val_label, cls_label_list, task="classification")

In [9]:
weather_seq_label_list = list(weather_seq_label_set)
weather_seq_label_list.remove('unspecified')
weather_seq_label_list.insert(0, 'unspecified')
weather_seq_word2ind, weather_seq_train_x_ids, weather_seq_train_y_ids, \
    weather_seq_validate_x_ids, weather_seq_validate_y_ids = \
                vectorization(weather_seq_train_input, weather_seq_train_label, \
                      weather_seq_val_input, weather_seq_val_label, weather_seq_label_list, task="slot_labeling")

weather_seq_sample_weight = np.ones(weather_seq_train_y_ids.shape)
for i, seq in enumerate(weather_seq_train_y_ids):
    for j, label in enumerate(seq):
        if label == weather_seq_label_list.index('unspecified'):
            weather_seq_sample_weight[i][j] = 0.1

In [10]:
import pickle

cls_data = {
    'word2ind': cls_word2ind,
    'label_list': cls_label_list,
    'train_x': cls_train_x_ids,
    'train_y': cls_train_y_ids,
    'val_x': cls_validate_x_ids,
    'val_y': cls_validate_y_ids
}

weather_seq_data = {
    'word2ind': weather_seq_word2ind,
    'label_list': weather_seq_label_list,
    'train_x': weather_seq_train_x_ids,
    'train_y': weather_seq_train_y_ids,
    'sample_weight': weather_seq_sample_weight,
    'val_x': weather_seq_validate_x_ids,
    'val_y': weather_seq_validate_y_ids    
}

pickle.dump(cls_data, open('cls_data.pickle', 'wb'))
pickle.dump(weather_seq_data, open('weather_seq_data.pickle', 'wb'))

In [11]:
import requests

In [12]:
url = "http://api.openweathermap.org/data/2.5/weather?q=London&appid=421126fbad51c268744e7cfece50779f"

In [13]:
ret = requests.get(url)

In [14]:
import json

In [15]:
response = json.loads(ret.text)

In [16]:
response

{'coord': {'lon': -0.1257, 'lat': 51.5085},
 'weather': [{'id': 800,
   'main': 'Clear',
   'description': 'clear sky',
   'icon': '01d'}],
 'base': 'stations',
 'main': {'temp': 283.88,
  'feels_like': 283.33,
  'temp_min': 281.61,
  'temp_max': 285.92,
  'pressure': 1017,
  'humidity': 89},
 'visibility': 8000,
 'wind': {'speed': 0.45, 'deg': 102, 'gust': 1.79},
 'clouds': {'all': 5},
 'dt': 1622523603,
 'sys': {'type': 2,
  'id': 2019646,
  'country': 'GB',
  'sunrise': 1622519344,
  'sunset': 1622578079},
 'timezone': 3600,
 'id': 2643743,
 'name': 'London',
 'cod': 200}