In [1]:
from tensorflow.keras.models import load_model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import brown, treebank, conll2000

brown_sent = brown.tagged_sents(tagset='universal')
tree_sent = treebank.tagged_sents(tagset='universal')
conll_sent = conll2000.tagged_sents(tagset='universal')
all_sent = brown_sent + tree_sent + conll_sent
pos = [[pos[1] for pos in tup] for tup in all_sent] # store the corresponding pos tag
pos_tokenizer = Tokenizer()
pos_tokenizer.fit_on_texts(pos)
pos_seqs = pos_tokenizer.texts_to_sequences(pos)

f = open('data/data_normal.txt')
lines = f.readlines()
data = []
for line in lines:
    tokens = line.split()
    tokens =  [t.lower() for t in tokens]
    data.append(tokens)

f_out = open('data/labels_normal.txt')
lines_out = f_out.readlines()
labels = []
for line in lines_out:
    tokens = line.split()
    labels.append(tokens)
    
f = open('data/data_garden.txt')
lines = f.readlines()
garden_data = []
for line in lines:
    tokens = line.split()
    tokens =  [t.lower() for t in tokens]
    garden_data.append(tokens)

f_out = open('data/labels_garden.txt')
lines_out = f_out.readlines()
garden_labels = []
for line in lines_out:
    tokens = line.split()
    garden_labels.append(tokens)

2023-04-09 16:15:42.219599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def accuracy(preds, labels):
    accuracies = []
    for i in range(len(preds)):
        actual = labels[i]
        predict = preds[i]
        acc = 0
        for j in range(len(predict)):
            try:
                if predict[j] == actual[j]:
                    acc += 1
            except:
                print('Line:', i)
                print('Predict:', len(predict))
                print('Actual:', len(actual))
        if len(preds[i]) > 0:
            acc = acc / len(preds[i])
        accuracies.append(acc)
    return accuracies

In [3]:
def prediction(model, sent_type):
    if sent_type == 'normal':
        pred = model.predict(normal_pd)
        pred_vec = np.argmax(pred, axis =-1)
        label = np.argmax(normal_pos_pd,axis =-1)
        pred_flat, actual_flat = pred_vec.flatten(), label.flatten()
        inds = np.where((pred_flat == actual_flat) & (pred_flat != 0))
        actual_length = len((np.where(actual_flat != 0))[0])
        print(len(inds[0])/len(pred_flat))
    else:
        pred = model.predict(garden_pd)
        pred_vec = np.argmax(pred, axis =-1)
        label = np.argmax(garden_pos_pd,axis =-1)
        pred_flat, actual_flat = pred_vec.flatten(), label.flatten()
        inds = np.where((pred_flat == actual_flat) & (pred_flat != 0))
        actual_length = len((np.where(actual_flat != 0))[0])
        print(len(inds[0])/len(pred_flat))

In [10]:
max_len = 10

normal_tokenizer = Tokenizer()
garden_tokenizer = Tokenizer()
normal_tokenizer.fit_on_texts(data)
garden_tokenizer.fit_on_texts(garden_data)
normal = normal_tokenizer.texts_to_sequences(data)
garden = garden_tokenizer.texts_to_sequences(garden_data)
normal_pd = pad_sequences(normal, max_len, padding='post', truncating='post')
garden_pd = pad_sequences(garden, max_len, padding='post', truncating='post')

pos_tokenizer = Tokenizer()
pos_tokenizer.fit_on_texts(labels)
pos_tokenizer.fit_on_texts(garden_labels)
normal_pos = pos_tokenizer.texts_to_sequences(labels)
garden_pos = pos_tokenizer.texts_to_sequences(garden_labels)
normal_pos_pd = pad_sequences(normal_pos, max_len, padding='post', truncating='post')
garden_pos_pd = pad_sequences(garden_pos, max_len, padding='post', truncating='post')
normal_pos_pd = to_categorical(normal_pos_pd, num_classes=13)
garden_pos_pd = to_categorical(garden_pos_pd, num_classes=13)

In [79]:
MASKED = load_model('lstm_lr0.0005_bs128_pd100_e20.h5')

In [80]:
MASKED_res = MASKED.evaluate(normal_pd, normal_pos_pd)
MASKED_resGarden = MASKED.evaluate(garden_pd, garden_pos_pd)



In [83]:
pred = MASKED.predict(normal_pd) 
pred_vec = np.argmax(pred, axis =-1)
label = np.argmax(normal_pos_pd,axis =-1)
pred_flat, actual_flat = pred_vec.flatten(), label.flatten()
inds = np.where((pred_flat == actual_flat) & (pred_flat != 0))
actual_length = len((np.where(actual_flat != 0))[0])
# print(pred_vec[:5]) 
# print(label[:5])
print(len(inds[0])/actual_length)
# 
all_posttags = []
for p in pred:
    predseq = [np.argmax(pred, axis=-1) for pred in p]
    pred_tags = [pos_tokenizer.sequences_to_texts([[i]])[0].upper() for i in predseq]
    all_posttags.append(pred_tags)
print('LSTM Model with Mask True')
print('Normal Sentence: "The old man rode the boat"')
print("Prediction: ", all_posttags[:1][:6][0][:7])
print('Actual: ', labels[0])


pred = MASKED.predict(garden_pd) 
pred_vec = np.argmax(pred, axis =-1)
label = np.argmax(garden_pos_pd,axis =-1)
pred_flat, actual_flat = pred_vec.flatten(), label.flatten()
inds = np.where((pred_flat == actual_flat) & (pred_flat != 0))
actual_length = len((np.where(actual_flat != 0))[0])
# print(pred_vec[:5]) 
# print(label[:5])
print(len(inds[0])/actual_length)

all_posttags = []
for p in pred:
    predseq = [np.argmax(pred, axis=-1) for pred in p]
    pred_tags = [pos_tokenizer.sequences_to_texts([[i]])[0].upper() for i in predseq]
    all_posttags.append(pred_tags)
print('Garden Path Sentence: "The old man the boat"')
print("Prediction: ", all_posttags[:1][0][:6])
print('Actual: ', garden_labels[0])

0.07482993197278912
LSTM Model with Mask True
Normal Sentence: "The old man rode the boat"
Prediction:  ['ADJ', 'VERBS', 'ADV', '.', 'ADJ', 'ADJ', 'DET']
Actual:  ['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN', '.']
0.033112582781456956
Garden Path Sentence: "The old man the boat"
Prediction:  ['ADJ', 'DET', '.', 'ADJ', 'DET', 'DET']
Actual:  ['DET', 'NOUN', 'VERB', 'DET', 'NOUN', '.']


In [11]:
lr01_pd10 = load_model('lstm_lr0.1_bs128_p10_e20_sgd.h5')
lr001_pd10 = load_model('lstm_lr0.01_bs128_p10_e20_sgd.h5')
lr0001_pd10 = load_model('lstm_lr0.001_bs128_p10_e20_sgd.h5')
lr01_pd10_res = lr01_pd10.evaluate(normal_pd, normal_pos_pd)
lr001_pd10_res = lr001_pd10.evaluate(normal_pd, normal_pos_pd)
lr0001_pd10_res = lr0001_pd10.evaluate(normal_pd, normal_pos_pd)
lr01_pd10_resGarden = lr01_pd10.evaluate(garden_pd, garden_pos_pd)
lr001_pd10_resGarden = lr001_pd10.evaluate(garden_pd, garden_pos_pd)
lr0001_pd10_resGarden = lr0001_pd10.evaluate(garden_pd, garden_pos_pd)



In [6]:
prediction(lr01_pd10, 'normal')
prediction(lr001_pd10, 'normal')
prediction(lr0001_pd10, 'normal')
prediction(lr01_pd10, 'garden')
prediction(lr001_pd10, 'garden')
prediction(lr0001_pd10, 'garden')

0.09523809523809523
0.11904761904761904
0.18095238095238095
0.06666666666666667
0.11428571428571428
0.2


In [55]:
lr01_pd50 = load_model('lstm_lr0.1_bs128_p50_e20_sgd.h5')
lr001_pd50 = load_model('lstm_lr0.01_bs128_p50_e20_sgd.h5')
lr0001_pd50 = load_model('lstm_lr0.001_bs128_p50_e20_sgd.h5')
lr01_pd50_res = lr01_pd50.evaluate(normal_pd, normal_pos_pd)
lr001_pd50_res = lr001_pd50.evaluate(normal_pd, normal_pos_pd)
lr0001_pd50_res = lr0001_pd50.evaluate(normal_pd, normal_pos_pd)
lr01_pd50_resGarden = lr01_pd50.evaluate(garden_pd, garden_pos_pd)
lr001_pd50_resGarden = lr001_pd50.evaluate(garden_pd, garden_pos_pd)
lr0001_pd50_resGarden = lr0001_pd50.evaluate(garden_pd, garden_pos_pd)



In [84]:
lr01_pd100 = load_model('lstm_lr0.1_bs128_p100_e20_sgd.h5')
lr001_pd100 = load_model('lstm_lr0.01_bs128_p100_e20_sgd.h5')
lr0001_pd100 = load_model('lstm_lr0.001_bs128_p100_e20_sgd.h5')
lr01_pd100_res = lr01_pd100.evaluate(normal_pd, normal_pos_pd)
lr001_pd100_res = lr001_pd100.evaluate(normal_pd, normal_pos_pd)
lr0001_pd100_res = lr0001_pd100.evaluate(normal_pd, normal_pos_pd)
lr01_pd100_resGarden = lr01_pd100.evaluate(garden_pd, garden_pos_pd)
lr001_pd100_resGarden = lr001_pd100.evaluate(garden_pd, garden_pos_pd)
lr0001_pd100_resGarden = lr0001_pd100.evaluate(garden_pd, garden_pos_pd)



In [12]:
pred = lr0001_pd10.predict(normal_pd) 
pred_vec = np.argmax(pred, axis =-1)
label = np.argmax(normal_pos_pd,axis =-1)
pred_flat, actual_flat = pred_vec.flatten(), label.flatten()
inds = np.where((pred_flat == actual_flat) & (pred_flat != 0))

# print(len(inds[0])/len(pred_flat))

all_posttags = []
for p in pred:
    predseq = [np.argmax(pred, axis=-1) for pred in p]
    pred_tags = [pos_tokenizer.sequences_to_texts([[i]])[0].upper() for i in predseq]
    all_posttags.append(pred_tags)
print('LSTM Model with Mask False Padding 10')
print('Normal Sentence: "The old man rode the boat"')
print("Prediction: ", all_posttags[:1][0][:6])
print('Actual: ', labels[0])


pred = lr0001_pd10.predict(garden_pd) 
pred_vec = np.argmax(pred, axis =-1)
label = np.argmax(garden_pos_pd,axis =-1)
pred_flat, actual_flat = pred_vec.flatten(), label.flatten()
inds = np.where((pred_flat == actual_flat) & (actual_flat != 0))
# print(pred_vec) 
# print(label)
# print(len(inds[0])/len(pred_flat))

all_posttags = []
for p in pred:
    predseq = [np.argmax(pred, axis=-1) for pred in p]
    pred_tags = [pos_tokenizer.sequences_to_texts([[i]])[0].upper() for i in predseq]
    all_posttags.append(pred_tags)
print('Garden Path Sentence: "The old man the boat"')
print("Prediction: ", all_posttags[:1][0][:6])
print('Actual: ', garden_labels[0])

LSTM Model with Mask False Padding 10
Normal Sentence: "The old man rode the boat"
Prediction:  ['NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN']
Actual:  ['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'NOUN', '.']
Garden Path Sentence: "The old man the boat"
Prediction:  ['NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN']
Actual:  ['DET', 'NOUN', 'VERB', 'DET', 'NOUN', '.']


In [94]:
test = np.where(label== 1)
len_test = len(test[0])/len((np.where(actual_flat != 0))[0])
print(len_test)

0.2781456953642384


In [104]:
print(len(test[0]))
print(pred.shape)

42
(21, 10, 13)
