In [23]:
import json
# Load the French dev split into `dataset`.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the
# platform default can mis-decode accented characters (e.g. on Windows).
with open("./data/train-data_all/fr.dev.json", "r", encoding="utf-8") as file_handler:
    dataset = json.load(file_handler)

In [24]:
# Walk the records once and collect the two fields used by this notebook.
# NOTE(review): the list named `sgns` is filled from the 'char' field —
# confirm that this is the intended target.
sgns, glosses = [], []
for entry in dataset:
    sgns.append(entry['char'])
    glosses.append(entry['gloss'])

In [25]:
# Load the precomputed lemma lookup tables and the lemmatized glosses.
# All three files are read as UTF-8 explicitly so the lemmas decode the same
# way on every platform instead of depending on the locale's default encoding.

# lemma -> index table
with open("./data/lemm2Idx.json", encoding="utf-8") as outfile:
    lemm2Idx = json.load(outfile)

# index -> lemma table (JSON object keys are strings, so lookups need str(index))
with open("./data/idx2Lemm.json", encoding="utf-8") as outfile:
    idx2Lemm = json.load(outfile)

# glosses stored as sequences of lemma indices
with open("./data/lemmatized_glosses.json", encoding="utf-8") as outfile:
    lemmatized_glosses = json.load(outfile)

In [26]:
# Turn every gloss (a sequence of lemma indices) back into one string.
# idx2Lemm is looked up with str(index), and each lemma is followed by a
# single space, so every non-empty result ends with a trailing space.
lemmatized_glosses_text = [
    "".join(idx2Lemm[str(lemma_id)] + " " for lemma_id in gloss_ids)
    for gloss_ids in lemmatized_glosses
]

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input

# Create the network with its input spec already attached: each sample is a
# single raw string (shape (1,), dtype "string"). The remaining layers are
# appended to `model` in the cells below.
model = Sequential([Input(shape=(1,), dtype="string")])

In [28]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Fixed number of token ids produced per text (longer inputs are truncated,
# shorter ones padded).
max_len = 50

# Vocabulary budget: one slot per entry of the lemma table.
# NOTE(review): the Keras docs state that max_tokens also counts the OOV slot
# (and the padding slot in "int" mode) — confirm len(lemm2Idx) leaves room
# for them, otherwise the rarest lemmas fall out of the vocabulary.
max_tokens = len(lemm2Idx)

# Text -> integer-id layer: emits one integer index per token and always
# returns exactly `max_len` ids per input string.
vectorize_layer = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_len,
)

In [29]:
import numpy as np
# Build the vectorizer's vocabulary with adapt().
# The layer is adapted on the lemma strings stored in idx2Lemm (one array
# element per table value), not on the gloss texts themselves.
lemma_vocabulary = np.array(list(idx2Lemm.values()))
vectorize_layer.adapt(lemma_vocabulary)

In [30]:
# Append the adapted vectorizer to the model so raw strings are converted to
# integer token ids inside the model itself.
model.add(vectorize_layer)

In [31]:
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Flatten

# Append the trainable layers, in order, on top of the vectorizer.
for layer in (
    # Embedding table with max_tokens + 1 rows (one extra row on top of the
    # vocabulary budget, kept for the out-of-vocabulary token), 64 dims each.
    Embedding(max_tokens + 1, 64),
    # Bidirectional LSTM returning the full sequence; the forward and backward
    # outputs are concatenated.
    Bidirectional(
        LSTM(units=128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        merge_mode='concat',
    ),
    # Second LSTM keeps only its final output (return_sequences=False).
    LSTM(units=256, return_sequences=False, dropout=0.5, recurrent_dropout=0.5),
    # Output projection to 256 values; no activation is specified.
    Dense(256),
):
    model.add(layer)

In [32]:
# Compile for regression.
# The network ends in a Dense(256) layer without activation and is fitted
# against the 'char' vectors, so this is a regression onto vectors rather than
# a classification over 256 classes. 'categorical_crossentropy' expects
# probability distributions and 'accuracy' compares argmax positions, neither
# of which is meaningful for such targets; mean squared error is used as the
# loss and cosine similarity is reported as the metric instead.
# NOTE(review): assumes the 'char' entries are real-valued 256-d vectors —
# confirm against the dataset.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['cosine_similarity'],
)

# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_2 (TextV  (None, 50)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 50, 64)            554176    
                                                                 
 bidirectional_2 (Bidirectio  (None, 50, 256)          197632    
 nal)                                                            
                                                                 
 lstm_5 (LSTM)               (None, 256)               525312    
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
Total params: 1,342,912
Trainable params: 1,342,912
Non-trainable params: 0

In [33]:
import pandas as pd
import numpy as np
# Train for 25 epochs and keep the per-epoch training loss.
# fit() returns a History object; assigning that object directly as a column
# of an empty DataFrame records no per-epoch values. Its `.history` dict maps
# each tracked quantity to a list with one value per epoch, so the loss curve
# is stored under this run's column name instead.
results = pd.DataFrame()
history = model.fit(lemmatized_glosses_text, sgns, epochs=25)
results['with_add_lstm'] = history.history['loss']

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
