In [1]:
import tensorflow as tf
#import keras
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns

import nltk
from nltk.corpus import nps_chat
import pandas as pd 
import re
import pickle
import io
import datetime

import os
from os import makedirs
from os import chdir
from os import path
cwd = os.getcwd()

Using TensorFlow backend.


In [2]:
prop_data = pd.read_csv(cwd + '/vagueCorpora/prop.csv')
speechact_data = pd.read_csv(cwd + '/vagueCorpora/speech.csv')

In [3]:
prop_data

Unnamed: 0,id,text,vague,type
0,1,yeah there is around twenty,1,prop
1,2,i think it was something like two or three,1,prop
2,3,about one hundred,1,prop
3,4,approximately six,1,prop
4,5,thirty four or so,1,prop
...,...,...,...,...
395,396,i have a wireless speaker,0,prop
396,397,this lighter is from spain,0,prop
397,398,my console is load,0,prop
398,399,zoom is better than skype,0,prop


In [4]:
def split_data(text, labels, split_size):
    sentences_train, sentences_test, label_train, label_test = train_test_split(text, labels, test_size = split_size, random_state = 42)
    
    sentences_train = np.array(sentences_train)
    label_train = np.array(label_train)
    sentences_test = np.array(sentences_test)
    label_test = np.array(label_test)
    
    return sentences_train, sentences_test, label_train, label_test

In [5]:
max_length = 200

def make_sequences(tok, train_text, test_text):
    training_sequences = tok.texts_to_sequences(train_text)
    training_padded = pad_sequences(training_sequences, maxlen=max_length, padding = 'post', truncating = 'post')
    
    testing_sequences = tok.texts_to_sequences(test_text)
    testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding = 'post', truncating = 'post')
    
    return training_padded, testing_padded

In [6]:
def clean_text(text):
    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"\?", "", text)
    text = re.sub(r"\!", "", text)
    text = re.sub(r"\.", "", text)
    text = re.sub(r"\,", "", text)
    text = re.sub(r"\r\n", "", text)
    return text

In [7]:
speechact_data['id'] = speechact_data['id'].apply(lambda x: x + 400)

VL_dataset = prop_data.append(speechact_data).set_index('id')

type_labels = VL_dataset['type'].tolist()
sentencesRaw = VL_dataset['text'].tolist()
sentences = list(map(clean_text, sentencesRaw))

type_test_size = 0.8
type_train, type_test, type_label_train, type_label_test = split_data(sentences, type_labels, type_test_size)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [8]:
nl_type_train, nl_type_test = [],[]

def number_label(l, new_l):
    for element in l:
        if(element == 'prop'):
            new_l.append(0)
        elif(element == 'speech act'):
            new_l.append(1)
            
number_label(type_label_train, nl_type_train)
number_label(type_label_test, nl_type_test)

nl_type_train, nl_type_test = np.array(nl_type_train), np.array(nl_type_test)

In [9]:
type_tok = Tokenizer(oov_token="<OOV>")
type_tok.fit_on_texts(type_train)
type_word_index = type_tok.word_index
type_vocab_size = len(type_word_index) + 1

In [10]:
type_train_padded, type_test_padded = make_sequences(type_tok, type_train, type_test)

In [11]:
type_train_padded.shape

(100, 200)

In [12]:
type_embedding_dim = 128

type_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(type_vocab_size, type_embedding_dim, 
                                  input_length = max_length, name = 'TYPEembed'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(512, activation = 'sigmoid'),
        tf.keras.layers.Dense(2, activation = 'sigmoid'),
    ])

type_model.compile(loss = 'sparse_categorical_crossentropy', 
                   optimizer = 'adam', metrics = ['accuracy'])

type_model.summary()

type_history = type_model.fit(type_train_padded, nl_type_train, epochs = 300, 
                              validation_data = (type_test_padded, nl_type_test), 
                              verbose = 2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
TYPEembed (Embedding)        (None, 200, 128)          39552     
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               66048     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1026      
Total params: 106,626
Trainable params: 106,626
Non-trainable params: 0
_________________________________________________________________
Epoch 1/300
4/4 - 0s - loss: 0.5956 - accuracy: 0.7900 - val_loss: 0.5043 - val_accuracy: 0.8025
Epoch 2/300
4/4 - 0s - loss: 0.5167 - accuracy: 0.7900 - val_loss: 0.4992 - val_accuracy: 0.8025
Epoch 3/300
4/4 - 0s - loss: 0.5153 - accuracy: 0.7900

Epoch 76/300
4/4 - 0s - loss: 0.4230 - accuracy: 0.7900 - val_loss: 0.4543 - val_accuracy: 0.8025
Epoch 77/300
4/4 - 0s - loss: 0.4182 - accuracy: 0.7900 - val_loss: 0.4520 - val_accuracy: 0.8025
Epoch 78/300
4/4 - 0s - loss: 0.4141 - accuracy: 0.7900 - val_loss: 0.4499 - val_accuracy: 0.8025
Epoch 79/300
4/4 - 0s - loss: 0.4142 - accuracy: 0.7900 - val_loss: 0.4548 - val_accuracy: 0.8025
Epoch 80/300
4/4 - 0s - loss: 0.4094 - accuracy: 0.7900 - val_loss: 0.4489 - val_accuracy: 0.8025
Epoch 81/300
4/4 - 0s - loss: 0.4016 - accuracy: 0.7900 - val_loss: 0.4431 - val_accuracy: 0.8025
Epoch 82/300
4/4 - 0s - loss: 0.3941 - accuracy: 0.7900 - val_loss: 0.4410 - val_accuracy: 0.8025
Epoch 83/300
4/4 - 0s - loss: 0.3919 - accuracy: 0.7900 - val_loss: 0.4373 - val_accuracy: 0.8025
Epoch 84/300
4/4 - 0s - loss: 0.3776 - accuracy: 0.7900 - val_loss: 0.4399 - val_accuracy: 0.8025
Epoch 85/300
4/4 - 0s - loss: 0.3770 - accuracy: 0.7900 - val_loss: 0.4314 - val_accuracy: 0.8025
Epoch 86/300
4/4 - 0

Epoch 159/300
4/4 - 0s - loss: 0.0622 - accuracy: 0.9900 - val_loss: 0.3420 - val_accuracy: 0.8475
Epoch 160/300
4/4 - 0s - loss: 0.0625 - accuracy: 0.9900 - val_loss: 0.3449 - val_accuracy: 0.8525
Epoch 161/300
4/4 - 0s - loss: 0.0609 - accuracy: 0.9900 - val_loss: 0.3546 - val_accuracy: 0.8475
Epoch 162/300
4/4 - 0s - loss: 0.0609 - accuracy: 0.9900 - val_loss: 0.3518 - val_accuracy: 0.8475
Epoch 163/300
4/4 - 0s - loss: 0.0592 - accuracy: 0.9900 - val_loss: 0.3467 - val_accuracy: 0.8500
Epoch 164/300
4/4 - 0s - loss: 0.0569 - accuracy: 0.9900 - val_loss: 0.3437 - val_accuracy: 0.8475
Epoch 165/300
4/4 - 0s - loss: 0.0584 - accuracy: 0.9900 - val_loss: 0.3444 - val_accuracy: 0.8600
Epoch 166/300
4/4 - 0s - loss: 0.0662 - accuracy: 0.9900 - val_loss: 0.3595 - val_accuracy: 0.8400
Epoch 167/300
4/4 - 0s - loss: 0.0644 - accuracy: 0.9900 - val_loss: 0.3447 - val_accuracy: 0.8500
Epoch 168/300
4/4 - 0s - loss: 0.0538 - accuracy: 0.9900 - val_loss: 0.3656 - val_accuracy: 0.8475
Epoch 169/

Epoch 242/300
4/4 - 0s - loss: 0.0218 - accuracy: 0.9900 - val_loss: 0.3880 - val_accuracy: 0.8550
Epoch 243/300
4/4 - 0s - loss: 0.0215 - accuracy: 0.9900 - val_loss: 0.3842 - val_accuracy: 0.8625
Epoch 244/300
4/4 - 0s - loss: 0.0209 - accuracy: 0.9900 - val_loss: 0.3833 - val_accuracy: 0.8625
Epoch 245/300
4/4 - 0s - loss: 0.0216 - accuracy: 0.9900 - val_loss: 0.3944 - val_accuracy: 0.8550
Epoch 246/300
4/4 - 0s - loss: 0.0215 - accuracy: 0.9900 - val_loss: 0.3867 - val_accuracy: 0.8600
Epoch 247/300
4/4 - 0s - loss: 0.0207 - accuracy: 0.9900 - val_loss: 0.3805 - val_accuracy: 0.8525
Epoch 248/300
4/4 - 0s - loss: 0.0193 - accuracy: 1.0000 - val_loss: 0.3886 - val_accuracy: 0.8575
Epoch 249/300
4/4 - 0s - loss: 0.0206 - accuracy: 0.9900 - val_loss: 0.4013 - val_accuracy: 0.8525
Epoch 250/300
4/4 - 0s - loss: 0.0220 - accuracy: 0.9900 - val_loss: 0.3961 - val_accuracy: 0.8575
Epoch 251/300
4/4 - 0s - loss: 0.0203 - accuracy: 0.9900 - val_loss: 0.3805 - val_accuracy: 0.8450
Epoch 252/

## Specialized Classifiers

In [13]:
prop_text = prop_data['text'].tolist()
prop_labels = prop_data['vague'].tolist()
speechact_text = speechact_data['text'].tolist()
speechact_labels = speechact_data['vague'].tolist()

prop_test_size = 0.5
speechact_test_size = 0.1

In [14]:
prop_train, prop_test, prop_label_train, prop_label_test = split_data(prop_text, prop_labels, prop_test_size)
speechact_train, speechact_test, speechact_label_train, speechact_label_test = split_data(speechact_text, speechact_labels, speechact_test_size)

In [15]:
proptok = Tokenizer(oov_token="<OOV>")
proptok.fit_on_texts(prop_train)
prop_word_index = proptok.word_index
prop_vocab_size = len(prop_word_index) + 1

speechacttok = Tokenizer(oov_token="<OOV>")
speechacttok.fit_on_texts(speechact_train)
speechact_word_index = speechacttok.word_index
speechact_vocab_size = len(speechact_word_index) + 1

In [16]:
prop_train_padded, prop_test_padded = make_sequences(proptok, prop_train, prop_test)
speechact_train_padded, speechact_test_padded = make_sequences(speechacttok, speechact_train, speechact_test)

In [17]:
prop_train_padded.shape

(200, 200)

## Prop

In [18]:
prop_embedding_dim = 128

prop_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(prop_vocab_size, prop_embedding_dim, input_length = max_length, name = 'Pembed'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(256, activation = 'sigmoid'),
        tf.keras.layers.Dense(1, activation = 'sigmoid')
    ])

prop_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

prop_model.summary()
prop_n_epochs = 100

prop_history = prop_model.fit(prop_train_padded, prop_label_train, epochs = prop_n_epochs, 
                              validation_data = (prop_test_padded, prop_label_test), verbose = 2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Pembed (Embedding)           (None, 200, 128)          64896     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 98,177
Trainable params: 98,177
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
7/7 - 0s - loss: 0.7016 - accuracy: 0.5350 - val_loss: 0.6925 - val_accuracy: 0.5150
Epoch 2/100
7/7 - 0s - loss: 0.7056 - accuracy: 0.4850 - val_loss: 0.6967 - val_accuracy: 0.5150
Epoch 3/100
7/7 - 0s - loss: 0.7001 - accuracy: 0.4950

Epoch 76/100
7/7 - 0s - loss: 0.2112 - accuracy: 0.9450 - val_loss: 0.4515 - val_accuracy: 0.8000
Epoch 77/100
7/7 - 0s - loss: 0.2003 - accuracy: 0.9800 - val_loss: 0.4476 - val_accuracy: 0.7950
Epoch 78/100
7/7 - 0s - loss: 0.2010 - accuracy: 0.9500 - val_loss: 0.4449 - val_accuracy: 0.8000
Epoch 79/100
7/7 - 0s - loss: 0.1907 - accuracy: 0.9700 - val_loss: 0.4761 - val_accuracy: 0.7950
Epoch 80/100
7/7 - 0s - loss: 0.1907 - accuracy: 0.9700 - val_loss: 0.4420 - val_accuracy: 0.8000
Epoch 81/100
7/7 - 0s - loss: 0.1807 - accuracy: 0.9650 - val_loss: 0.4462 - val_accuracy: 0.8000
Epoch 82/100
7/7 - 0s - loss: 0.1783 - accuracy: 0.9800 - val_loss: 0.4517 - val_accuracy: 0.8200
Epoch 83/100
7/7 - 0s - loss: 0.1688 - accuracy: 0.9700 - val_loss: 0.4383 - val_accuracy: 0.7950
Epoch 84/100
7/7 - 0s - loss: 0.1707 - accuracy: 0.9700 - val_loss: 0.4531 - val_accuracy: 0.8100
Epoch 85/100
7/7 - 0s - loss: 0.1610 - accuracy: 0.9750 - val_loss: 0.4382 - val_accuracy: 0.7900
Epoch 86/100
7/7 - 0

## Speech act

In [19]:
speechact_embedding_dim =500

speechact_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(speechact_vocab_size, speechact_embedding_dim, input_length = max_length, name = 'SAembed'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation = 'sigmoid'),
        tf.keras.layers.Dense(1, activation = 'sigmoid')
    ])

speechact_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

speechact_model.summary()
speechact_n_epochs = 100

speechact_history = speechact_model.fit(speechact_train_padded, speechact_label_train, 
                                        epochs = speechact_n_epochs, 
                                        validation_data = (speechact_test_padded, speechact_label_test), 
                                        verbose = 2)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
SAembed (Embedding)          (None, 200, 500)          95500     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 500)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                32064     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 127,629
Trainable params: 127,629
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
3/3 - 0s - loss: 0.6975 - accuracy: 0.4444 - val_loss: 0.6930 - val_accuracy: 0.6000
Epoch 2/100
3/3 - 0s - loss: 0.6943 - accuracy: 0.4222 - val_loss: 0.6980 - val_accuracy: 0.4000
Epoch 3/100
3/3 - 0s - loss: 0.6943 - accuracy: 0.51

Epoch 76/100
3/3 - 0s - loss: 0.5137 - accuracy: 0.9556 - val_loss: 0.5382 - val_accuracy: 0.9000
Epoch 77/100
3/3 - 0s - loss: 0.5039 - accuracy: 0.9778 - val_loss: 0.5369 - val_accuracy: 0.9000
Epoch 78/100
3/3 - 0s - loss: 0.4986 - accuracy: 0.9556 - val_loss: 0.5402 - val_accuracy: 0.9000
Epoch 79/100
3/3 - 0s - loss: 0.4905 - accuracy: 0.9556 - val_loss: 0.5298 - val_accuracy: 0.9000
Epoch 80/100
3/3 - 0s - loss: 0.4811 - accuracy: 0.9667 - val_loss: 0.5144 - val_accuracy: 0.9000
Epoch 81/100
3/3 - 0s - loss: 0.4845 - accuracy: 0.9333 - val_loss: 0.5045 - val_accuracy: 1.0000
Epoch 82/100
3/3 - 0s - loss: 0.4695 - accuracy: 0.9556 - val_loss: 0.5075 - val_accuracy: 0.9000
Epoch 83/100
3/3 - 0s - loss: 0.4675 - accuracy: 0.9333 - val_loss: 0.5184 - val_accuracy: 0.9000
Epoch 84/100
3/3 - 0s - loss: 0.4570 - accuracy: 0.9444 - val_loss: 0.4961 - val_accuracy: 0.9000
Epoch 85/100
3/3 - 0s - loss: 0.4445 - accuracy: 0.9667 - val_loss: 0.4833 - val_accuracy: 0.9000
Epoch 86/100
3/3 - 0

In [20]:
def type_classifier(text):
    input_sequence = type_tok.texts_to_sequences([text])
    input_padded = pad_sequences(input_sequence, maxlen=max_length, padding = 'post', truncating = 'post')
    
    if text == "": return
    
    prediction_list = type_model.predict(input_padded)
    prediction_location = np.argmax(prediction_list)
    
    type_list = ['propositional', 'speech act']

    return(type_list[prediction_location])

In [21]:
Pthreshold = 0.5
SAthreshold = 0.5

def vague_classifier(txt,Type):
    text = clean_text(txt)
    
    if(Type == 'propositional'):
        input_sequence = proptok.texts_to_sequences([text])
        input_padded = pad_sequences(input_sequence, maxlen=max_length, padding = 'post', truncating = 'post')
        if text == "": return

        predicted_probability = prop_model.predict(input_padded)[0][0]

        if(predicted_probability <= Pthreshold):
            prediction = "not vague"
        elif(predicted_probability > Pthreshold):
            prediction = "vague"
            
    elif(Type == 'speech act'):
        input_sequence = speechacttok.texts_to_sequences([text])
        input_padded = pad_sequences(input_sequence, maxlen=max_length, padding = 'post', truncating = 'post')
        
        input_padded = np.array(input_padded[0])
        
        if text == "": return

        predicted_probability = speechact_model.predict(input_padded)[0][0]

        if(predicted_probability <= SAthreshold):
            prediction = "not vague"
        elif(predicted_probability > SAthreshold):
            prediction = "vague"
    
    print("INPUT: " + text)
    print("Type: " + Type)
    print("prediction: " + prediction)
    print(str(predicted_probability) + " activation")

In [22]:
def Krieger(text): return vague_classifier(text,type_classifier(text))

In [23]:
Krieger("Could you open this for me?")

INPUT: could you open this for me
Type: propositional
prediction: not vague
0.15203702 activation


### Tensorboard

In [24]:
%load_ext tensorboard

In [25]:
!rm -rf ./logs/ 

In [26]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [27]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

type_history = type_model.fit(type_train_padded, nl_type_train, epochs = 300, 
                              validation_data = (type_test_padded, nl_type_test), 
                              verbose = 2, callbacks=[tensorboard_callback])

Epoch 1/300
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
4/4 - 0s - loss: 0.0109 - accuracy: 1.0000 - val_loss: 0.4172 - val_accuracy: 0.8500
Epoch 2/300
4/4 - 0s - loss: 0.0109 - accuracy: 1.0000 - val_loss: 0.4135 - val_accuracy: 0.8550
Epoch 3/300
4/4 - 0s - loss: 0.0102 - accuracy: 1.0000 - val_loss: 0.4100 - val_accuracy: 0.8500
Epoch 4/300
4/4 - 0s - loss: 0.0099 - accuracy: 1.0000 - val_loss: 0.4079 - val_accuracy: 0.8475
Epoch 5/300
4/4 - 0s - loss: 0.0099 - accuracy: 1.0000 - val_loss: 0.4077 - val_accuracy: 0.8425
Epoch 6/300
4/4 - 0s - loss: 0.0138 - accuracy: 1.0000 - val_loss: 0.4504 - val_accuracy: 0.8150
Epoch 7/300
4/4 - 0s - loss: 0.0195 - accuracy: 1.0000 - val_loss: 0.4211 - val_accuracy: 0.8250
Epoch 8/300
4/4 - 0s - loss: 0.0102 - accuracy: 1.0000 - val_loss: 0.4250 - val_accuracy: 0.8500
Epoch 9/300
4/4 - 0s - loss: 0.0105 - accuracy: 1.0000 - val_loss: 0.4707 - val_accuracy: 0.8500
Epoch 10/300
4/4 - 0s - loss: 0.0148 - accuracy: 0.9900

Epoch 80/300
4/4 - 0s - loss: 0.0050 - accuracy: 1.0000 - val_loss: 0.4402 - val_accuracy: 0.8425
Epoch 81/300
4/4 - 0s - loss: 0.0050 - accuracy: 1.0000 - val_loss: 0.4405 - val_accuracy: 0.8425
Epoch 82/300
4/4 - 0s - loss: 0.0049 - accuracy: 1.0000 - val_loss: 0.4401 - val_accuracy: 0.8425
Epoch 83/300
4/4 - 0s - loss: 0.0049 - accuracy: 1.0000 - val_loss: 0.4408 - val_accuracy: 0.8425
Epoch 84/300
4/4 - 0s - loss: 0.0048 - accuracy: 1.0000 - val_loss: 0.4416 - val_accuracy: 0.8425
Epoch 85/300
4/4 - 0s - loss: 0.0048 - accuracy: 1.0000 - val_loss: 0.4427 - val_accuracy: 0.8425
Epoch 86/300
4/4 - 0s - loss: 0.0048 - accuracy: 1.0000 - val_loss: 0.4445 - val_accuracy: 0.8425
Epoch 87/300
4/4 - 0s - loss: 0.0047 - accuracy: 1.0000 - val_loss: 0.4447 - val_accuracy: 0.8425
Epoch 88/300
4/4 - 0s - loss: 0.0047 - accuracy: 1.0000 - val_loss: 0.4456 - val_accuracy: 0.8425
Epoch 89/300
4/4 - 0s - loss: 0.0048 - accuracy: 1.0000 - val_loss: 0.4502 - val_accuracy: 0.8475
Epoch 90/300
4/4 - 0

Epoch 163/300
4/4 - 0s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.4727 - val_accuracy: 0.8425
Epoch 164/300
4/4 - 0s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.4716 - val_accuracy: 0.8425
Epoch 165/300
4/4 - 0s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.4716 - val_accuracy: 0.8425
Epoch 166/300
4/4 - 0s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.4726 - val_accuracy: 0.8425
Epoch 167/300
4/4 - 0s - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.4747 - val_accuracy: 0.8425
Epoch 168/300
4/4 - 0s - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.4798 - val_accuracy: 0.8400
Epoch 169/300
4/4 - 0s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.4812 - val_accuracy: 0.8375
Epoch 170/300
4/4 - 0s - loss: 0.0027 - accuracy: 1.0000 - val_loss: 0.4815 - val_accuracy: 0.8375
Epoch 171/300
4/4 - 0s - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.4762 - val_accuracy: 0.8425
Epoch 172/300
4/4 - 0s - loss: 0.0026 - accuracy: 1.0000 - val_loss: 0.4732 - val_accuracy: 0.8425
Epoch 173/

Epoch 246/300
4/4 - 0s - loss: 0.0015 - accuracy: 1.0000 - val_loss: 0.5069 - val_accuracy: 0.8400
Epoch 247/300
4/4 - 0s - loss: 0.0015 - accuracy: 1.0000 - val_loss: 0.5038 - val_accuracy: 0.8425
Epoch 248/300
4/4 - 0s - loss: 0.0015 - accuracy: 1.0000 - val_loss: 0.5025 - val_accuracy: 0.8425
Epoch 249/300
4/4 - 0s - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.5004 - val_accuracy: 0.8425
Epoch 250/300
4/4 - 0s - loss: 0.0015 - accuracy: 1.0000 - val_loss: 0.5016 - val_accuracy: 0.8425
Epoch 251/300
4/4 - 0s - loss: 0.0019 - accuracy: 1.0000 - val_loss: 0.5045 - val_accuracy: 0.8500
Epoch 252/300
4/4 - 0s - loss: 0.0020 - accuracy: 1.0000 - val_loss: 0.5011 - val_accuracy: 0.8475
Epoch 253/300
4/4 - 0s - loss: 0.0016 - accuracy: 1.0000 - val_loss: 0.5018 - val_accuracy: 0.8425
Epoch 254/300
4/4 - 0s - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.5086 - val_accuracy: 0.8400
Epoch 255/300
4/4 - 0s - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.5159 - val_accuracy: 0.8400
Epoch 256/

In [28]:
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6008 (pid 13723), started 1 day, 2:39:29 ago. (Use '!kill 13723' to kill it.)

## Exports

In [29]:
Pweights = prop_model.get_layer('Pembed').get_weights()[0]
SAweights = speechact_model.get_layer('SAembed').get_weights()[0]

In [30]:
prop_vocab = prop_word_index
Pout_v = io.open('Pvecs.tsv', 'w', encoding='utf-8')
Pout_m = io.open('Pmeta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(prop_vocab):
    if num == 0: continue
    vec = Pweights[num]
    Pout_m.write(word + "\n")
    Pout_v.write('\t'.join([str(x) for x in vec]) + "\n")
Pout_v.close()
Pout_m.close()

try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('Pvecs.tsv')
    files.download('Pmeta.tsv')
    
##################################

speechact_vocab = speechact_word_index
SAout_v = io.open('SAvecs.tsv', 'w', encoding='utf-8')
SAout_m = io.open('SAmeta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(speechact_vocab):
    if num == 0: continue
    vec = SAweights[num]
    SAout_m.write(word + "\n")
    SAout_v.write('\t'.join([str(x) for x in vec]) + "\n")
SAout_v.close()
SAout_m.close()

try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('SAvecs.tsv')
    files.download('SAmeta.tsv')

In [31]:
def update1(df, word, column):
    loc = df.loc[df['Words']==word]
    locI = int(loc.index.tolist()[0])
    new_value = int(loc[column]) + 1
    new_v_series = pd.Series([new_value], name=column, index=[locI])
    df.update(new_v_series)
    return df

def binDecode(x, thres):
    if (x < thres):
        return 0
    else:
        return 1

def Count_Correct(model, test_data, test_padded, test_label, word_index, thres):
    predicted_labels = model.predict(test_padded)
    words = list(word_index.keys())
    
    df = pd.DataFrame(columns = ['Words', 'Correct', 'Total', 'Rightly Vague'])
    df['Words'] = words
    df['Correct'] = [0]*len(word_index)
    df['Total'] = [0]*len(word_index)
    df['Rightly Vague'] = [0]*len(word_index)
    df['Incorrectly Vague'] = [0]*len(word_index)
    
    for i in range(len(test_data)):
        boo = (binDecode(float(predicted_labels[i]), thres) == binDecode(float(test_label[i]), thres))
        element = test_data[i]
        for word in element.split():
            if (word in (df.Words.values)):
                word = word
            else: word = "<OOV>"
            df = update1(df, word, 'Total')
            if boo:
                df = update1(df, word, 'Correct')
                if (binDecode(predicted_labels[i], thres) == 1):
                    df = update1(df, word, 'Rightly Vague') 
            else:
                if (binDecode(predicted_labels[i], thres) == 1):
                    df = update1(df, word, 'Incorrectly Vague')
    return(df)

In [32]:
Propdf = Count_Correct(prop_model, prop_test, prop_test_padded, prop_label_test, prop_word_index, Pthreshold)

In [33]:
SpeechActdf = Count_Correct(speechact_model, speechact_test, speechact_test_padded, speechact_label_test, speechact_word_index, SAthreshold)

In [36]:
SpeechActdf

Unnamed: 0,Words,Correct,Total,Rightly Vague,Incorrectly Vague
0,i,8.0,8.0,7.0,0
1,the,1.0,2.0,0.0,0
2,to,2.0,2.0,1.0,0
3,that,2.0,2.0,2.0,0
4,you,2.0,3.0,2.0,0
...,...,...,...,...,...
185,wants,0.0,0.0,0.0,0
186,orange,0.0,0.0,0.0,0
187,pass,0.0,0.0,0.0,0
188,salt,0.0,0.0,0.0,0


## Export

In [34]:
chdir(cwd)
if (os.path.exists(cwd+"/ModelsVL")):
    print("File exists")
else:
    makedirs("ModelsVL")
    chdir(cwd + "/ModelsVL")

    type_model.save('type_model.h5')
    prop_model.save('prop_model.h5')
    speechact_model.save('speechact_model.h5')

    with open('type_tok.pickle', 'wb') as handle:
        pickle.dump(type_tok, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('proptok.pickle', 'wb') as handle:
        pickle.dump(proptok, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('speechacttok.pickle', 'wb') as handle:
        pickle.dump(speechacttok, handle, protocol=pickle.HIGHEST_PROTOCOL)

File exists


In [35]:
chdir(cwd)
if (os.path.exists(cwd+"/VLdf")):
    print("File exists")
else:
    makedirs("VLdf")
    chdir(cwd + "/VLdf")
    
    Propdf.to_csv("Propdf.csv")
    SpeechActdf.to_csv("SpeechActdf.csv")

File exists
