In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import spatial
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers import Input, Dense
from keras.models import Sequential
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

In [2]:
import json
with open('data_full.json') as json_file:
    CLINC150 = json.load(json_file)
CLINC150_train=CLINC150['train']
CLINC150_test=CLINC150['test']
CLINC150_val=CLINC150['val']

In [3]:
classes=['insurance',
 'next_holiday',
 'repeat',
 'credit_limit_change',
 'book_hotel',
 'yes',
 'damaged_card',
 'rewards_balance',
 'time',
 'pto_balance',
 'interest_rate',
 'change_volume',
 'taxes',
 'sync_device',
 'traffic',
 'what_song',
 'shopping_list',
 'todo_list_update',
 'order_checks',
 'shopping_list_update']

In [4]:
train_data=[]
test_data=[]
val_data=[]

In [5]:
for c in CLINC150_train:
    if c[1] in classes:
        train_data.append(c)

In [6]:
for c in CLINC150_test:
    if c[1] in classes:
        test_data.append(c)

In [7]:
for c in CLINC150_val:
    if c[1] in classes:
        val_data.append(c)

In [8]:
df = pd.DataFrame(train_data)
df.to_csv('train_data.csv', index=False,)
df_train=pd.read_csv('train_data.csv')
print(len(df_train))
df_train.head()

2000


Unnamed: 0,0,1
0,"what time is it in punta gorda, florida",time
1,"what time is it in glenwood springs, co",time
2,"what time is it in fredericksburg, tx",time
3,"what time is it in las vegas, nv",time
4,"what time is it in houston, tx",time


In [9]:
df = pd.DataFrame(val_data)
df.to_csv('val_data.csv', index=False)
df_val=pd.read_csv('val_data.csv')
print(len(df_val))
df_val.head()

400


Unnamed: 0,0,1
0,what time is it in france,time
1,what's the time in london right now,time
2,what hour is it in london,time
3,what's the time,time
4,what is the time in london,time


In [10]:
df = pd.DataFrame(test_data)
df.to_csv('test_data.csv', index=False,)
df_test=pd.read_csv('test_data.csv')
print(len(df_test))
df_test.head()

600


Unnamed: 0,0,1
0,i need you to tell me what time it is in new y...,time
1,"what time is it in adelaide, australia right now",time
2,is it after noon,time
3,is it six o clock yet,time
4,please give me the time in tanzania at this mo...,time


In [11]:
words = set(stopwords.words("english"))

In [12]:
df_train['text'] = df_train['0'].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))
df_val['text'] = df_val['0'].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))
df_test['text'] = df_test['0'].apply(lambda x: ' '.join([word for word in x.split() if word not in (words)]))

In [13]:
df_train['text'] = df_train['text'].str.replace('\d+', '')
df_val['text'] = df_val['text'].str.replace('\d+', '')
df_test['text'] = df_test['text'].str.replace('\d+', '')

In [14]:
df_val.head()

Unnamed: 0,0,1,text
0,what time is it in france,time,time france
1,what's the time in london right now,time,what's time london right
2,what hour is it in london,time,hour london
3,what's the time,time,what's time
4,what is the time in london,time,time london


In [15]:
text = df_train['text']
labels = df_train['1']

val_text = df_val['text']
val_labels = df_val['1']

test_text = df_test['text']
test_labels = df_test['1']

In [16]:
labels.nunique()

20

In [17]:
from keras.preprocessing.text import Tokenizer
tok = Tokenizer()
tok.fit_on_texts(text)
word_index = tok.word_index

In [18]:
len(text)

2000

In [19]:
max_vocab_size = len(word_index) + 1

In [20]:
train_data_tokens = tok.texts_to_sequences(text)
val_data_tokens = tok.texts_to_sequences(val_text)
test_data_tokens = tok.texts_to_sequences(test_text)

In [21]:
train_data_tokens

[[6, 561, 562, 322],
 [6, 563, 410, 564],
 [6, 565, 411],
 [6, 412, 278, 566],
 [6, 567, 411],
 [5, 9, 41, 6, 223, 87],
 [29, 7, 41, 6, 279, 87],
 [85, 9, 41, 6, 223, 87],
 [7, 6, 223, 87],
 [323, 9, 41, 6, 279, 87],
 [6, 56, 568, 87],
 [7, 41, 6],
 [41, 6, 324],
 [5, 9, 6, 322],
 [6, 56],
 [413, 6, 224, 6, 56],
 [7, 6, 56],
 [7, 41, 6, 225, 87],
 [97, 6, 414],
 [41, 415, 6],
 [7, 6, 2],
 [6, 153],
 [6, 569],
 [6, 29, 416, 56],
 [10, 6, 325],
 [10, 6, 154, 322, 56],
 [6, 116, 417],
 [6, 56, 326],
 [10, 6, 56],
 [41, 6, 2],
 [29, 418, 570, 6],
 [117, 6],
 [2, 85, 9, 6],
 [280, 6, 56, 571, 6, 224],
 [6],
 [6, 56, 572, 6, 224],
 [280, 6],
 [6, 327, 39, 573, 6, 224],
 [280, 6, 574, 6, 224],
 [6, 575, 87],
 [6, 327],
 [6],
 [6, 576],
 [6],
 [6, 577, 578],
 [6, 579],
 [10, 41, 6, 580],
 [10, 41, 6, 415],
 [6, 225, 581, 56],
 [328, 6, 6, 413],
 [6],
 [56],
 [582, 583],
 [6, 56, 324],
 [6, 279, 226, 87],
 [6, 225, 226, 87],
 [6, 584, 226, 87],
 [6, 223, 226, 87],
 [6, 419, 226, 87],
 [6],
 [28

In [22]:
max_len=0
word=0
for t in train_data_tokens:
    if len(t)>max_len:
        max_len=len(t)
#         word=t
max_len

12

In [23]:
input_length = 12

In [24]:
text[235]

'would like total rewards points balance bank america platinum rewards card, please'

In [25]:
train_input = pad_sequences(train_data_tokens, input_length)
val_input = pad_sequences(val_data_tokens, input_length)
test_input = pad_sequences(test_data_tokens, input_length)

In [26]:
train_input[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   6, 563, 410, 564])

In [27]:
label_transformer = preprocessing.LabelEncoder()
label_transformer.fit(labels)

LabelEncoder()

In [28]:
labels = label_transformer.transform(labels)
val_labels = label_transformer.transform(val_labels)
test_labels = label_transformer.transform(test_labels)

In [29]:
labels = to_categorical(np.asarray(labels))
val_labels = to_categorical(np.asarray(val_labels))
test_labels = to_categorical(np.asarray(test_labels))

In [30]:
labels[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0.], dtype=float32)

In [31]:
X_train, X_val, y_train, y_val = train_input,val_input,labels,val_labels

In [32]:
X_train[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   6, 563, 410, 564])

In [33]:
y_train[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0.], dtype=float32)

In [34]:
embedded_dim = 300
embedded_index = dict()

with open('glove.6B/glove.6B.300d.txt', 'r', encoding='utf-8') as glove:
    for line in glove:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedded_index[word] = vector

In [35]:
glove.close

<function TextIOWrapper.close()>

In [36]:
embedded_matrix = np.zeros((max_vocab_size, embedded_dim))
for x, i in word_index.items():
    vector = embedded_index.get(x)
    if vector is not None:
        embedded_matrix[i] = vector

In [37]:
model = Sequential()
model.add(Embedding(max_vocab_size, 300, input_length=input_length, weights=[embedded_matrix], trainable=False))

In [38]:
model.add(Conv1D(filters=32, kernel_size=8, activation='selu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(30, activation='selu'))
model.add(Dense(20, activation='sigmoid'))

In [39]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 12, 300)           348600    
_________________________________________________________________
conv1d (Conv1D)              (None, 5, 32)             76832     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 2, 32)             0         
_________________________________________________________________
flatten (Flatten)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 30)                1950      
_________________________________________________________________
dense_1 (Dense)              (None, 20)                620       
Total params: 428,002
Trainable params: 79,402
Non-trainable params: 348,600
_____________________________________________

In [40]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, verbose=2)

Epoch 1/10
63/63 - 1s - loss: 1.9408 - accuracy: 0.5555
Epoch 2/10
63/63 - 0s - loss: 0.8399 - accuracy: 0.8665
Epoch 3/10
63/63 - 0s - loss: 0.4803 - accuracy: 0.9095
Epoch 4/10
63/63 - 0s - loss: 0.3298 - accuracy: 0.9260
Epoch 5/10
63/63 - 0s - loss: 0.2551 - accuracy: 0.9385
Epoch 6/10
63/63 - 0s - loss: 0.2137 - accuracy: 0.9475
Epoch 7/10
63/63 - 0s - loss: 0.1890 - accuracy: 0.9525
Epoch 8/10
63/63 - 0s - loss: 0.1673 - accuracy: 0.9545
Epoch 9/10
63/63 - 0s - loss: 0.1557 - accuracy: 0.9590
Epoch 10/10
63/63 - 0s - loss: 0.1455 - accuracy: 0.9605


<tensorflow.python.keras.callbacks.History at 0x15e6fc20b50>

In [41]:
model.evaluate(X_val, y_val)



[0.5886355638504028, 0.8349999785423279]

In [42]:
def acc(y_true, y_pred):
    return np.equal(np.argmax(y_true, axis=-1), np.argmax(y_pred, axis=-1)).mean()

In [43]:
predictions = model.predict(test_input)

In [44]:
print(acc(test_labels, predictions))

0.8466666666666667
