In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [2]:
DIM_EMBEDDING = 100
EPOCHS = 100
GLOVE = "../Embeddings/glove.6B.100d.txt"

train_path = r"C:\Users\jorda\Documents\UCT\Maths and Applied Maths\MAM3040W\Project\mrs-processing\data\extracted\train"
hike_path = train_path + r"\hike.tags"

In [3]:
def create_mask(input, mask_value=0):
    mask = np.zeros(input.shape)
    mask = input != -1
    print(mask)
    return mask

In [4]:
def tag_word_separator(input):
    tokens = input.replace("[", " [ ").replace("]", " ] ").split()

    out_tokens = []
    out_tags = []

    index = 0

    while index < len(tokens):
        # get the tag
        if tokens[index] == "[":
            index += 1
            out_tags.append(tokens[index])
        index += 1
        # get the token
        if tokens[index] == "]":
            index += 1
            phrase = tokens[index]
            index += 1
            # make sure that we capture any phrases as well
            while index < len(tokens) and tokens[index] != "[":
                phrase += " " + tokens[index]
                index += 1
            out_tokens.append(phrase)

    return [out_tokens, out_tags]

In [5]:
def get_sentence_vector(sentence, word_to_id_dict):
    output = [word_to_id_dict[word] for word in sentence]
    return np.array(output)

In [6]:
def tag_distribution_vector(tag, tag_to_id_dict):
    output = np.zeros(len(tag_to_id_dict))
    output[tag_to_id_dict[tag]] = 1
    return output

In [7]:
train = []
test = []
dev = []

with open(hike_path, "r+") as file:
    for line in file:
        train.append(tag_word_separator(line.strip()))

In [8]:
# Set Up Indices
# this allows us to move from a word to its index in the word_list and from an index to its word

# this will correspond to the words from the sentences
id_to_token = [] # put in the appropriate tokens here
token_to_id = {} # set up the associated token to id index (must correspond with the id_to_token)

# this will correspond to the span labels
id_to_tag = []
tag_to_id = {}

for tokens, labels in train + test + dev:
    for token in tokens:
        if token not in token_to_id:
            token_to_id[token] = len(token_to_id)
            id_to_token.append(token)

    for label in labels:
        if label not in tag_to_id:
            tag_to_id[label] = len(tag_to_id)
            id_to_tag.append(label)

NWORDS = len(id_to_token)
NTAGS = len(id_to_tag)

In [9]:
# get GloVe embeddings
pretrained = {}
for line in open(GLOVE, "r", encoding="utf8"):
    parts = line.strip().split()
    word = parts[0]
    pretrained[word] = parts[1:]

pretrained_list = []
scale = np.sqrt(3.0 / DIM_EMBEDDING)
for word in id_to_token:
    if word.lower() in pretrained:
        vector = [float(v) for v in pretrained[word.lower()]]
        pretrained_list.append(np.array(vector))
    else:
        random_vector = np.random.uniform(-scale, scale, [DIM_EMBEDDING])
        pretrained_list.append(random_vector)

assert len(pretrained_list) == NWORDS

In [10]:
for i in range(len(train)):
    tokens = train[i][0]
    tags = train[i][1]

    sequence_array = np.array(get_sentence_vector(tokens, token_to_id))

    tag_dist = np.array(
        [tag_distribution_vector(tag, tag_to_id) for tag in tags]
    )
    # convert the tags to the probability distribution form
    train[i] = [sequence_array, tag_dist]

In [11]:
x_train = np.array([inp[0] for inp in train])
y_train = np.array([inp[1] for inp in train])
# x_test = np.asarray(x_test)
# y_test = np.asarray(y_test)

# x_train = tf.convert_to_tensor([inp[0] for inp in train])
# x_test = np.asarray(x_test)
# y_train = tf.convert_to_tensor([inp[1] for inp in train])
# y_test = np.asarray(y_test)

# print(x_train.shape)
# print(y_train.shape)
# print(x_test.shape)
# print(y_test.shape)

(324,)
(324,)


In [14]:
x_train = keras.preprocessing.sequence.pad_sequences(
    x_train,
    value=0
)
y_train = keras.preprocessing.sequence.pad_sequences(
    y_train,
    value=0
)

In [15]:
x_train = tf.convert_to_tensor(x_train)
y_train = tf.convert_to_tensor(y_train)

In [16]:
# print(x_train)
# print(y_train)

tf.Tensor(
[[   0    0    0 ...    6    7    8]
 [   0    0    0 ...    6    7    8]
 [   0    0    0 ...    6    7    8]
 ...
 [   0    0    0 ...   23  762 1042]
 [   0    0    0 ...   15  762 1042]
 [   0    0    0 ...  293   23 1051]], shape=(324, 35), dtype=int32)
tf.Tensor(
[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 

In [17]:
mask = create_mask(x_train)

tf.Tensor(
[[ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 ...
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]
 [ True  True  True ...  True  True  True]], shape=(324, 35), dtype=bool)


In [18]:
lstm_hidden = 64

In [19]:
model = keras.Sequential()

# Layers:

#   Input — sequential
# Define the inputs
#
#   Embedding — What dimension input vectors
#             — embeddings will consist of a word vector and a PoS embedding concatenated (add this in later)
# Layers to parse the sentence
glove_init = keras.initializers.Constant(np.array(pretrained_list))
model.add(layers.Embedding(
        input_dim=NWORDS,
        output_dim=DIM_EMBEDDING,
        trainable=False,
        embeddings_initializer=glove_init,
        mask_zero=True
    ))
# include the character level embeddings
# probably use a cnn for this

#   BLSTM — bidirectional LSTM
#         — How many hidden layers and units
#         — Only output after the whole span has been parsed, not at every input
model.add(
    layers.Bidirectional(layers.LSTM(lstm_hidden, return_sequences=True))
)
# Now we just need to get access to the lstm_output values to create the span encodings


#   Dense — What does this do?
#         — Will this actually just be the output layer?
#         — Number of units
model.add(layers.Dense(NTAGS))

In [20]:
# model.build()

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         105200    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         84480     
_________________________________________________________________
dense (Dense)                (None, None, 477)         61533     
Total params: 251,213
Trainable params: 146,013
Non-trainable params: 105,200
_________________________________________________________________


In [24]:
# Define/choose loss function
# Define/choose gradient method
model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"]
)

In [34]:
history = model.fit(
    # x_train, y_train, validation_data=(x_test, y_test), epochs=1
    x_train, y_train, epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
