# Prediction of a missing word

# Load libraries

In [None]:
#!cd /home/oem2/Documents/Github_analysis_PROJECTS/Using_word_embeddings

In [None]:
#!dotenv list

In [85]:
import os
from os import environ
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# https://pypi.org/project/python-dotenv/
from dotenv import load_dotenv, find_dotenv
dot_env_file_exist = load_dotenv(find_dotenv()) # read local .env file

# Returns true or false if .env exists in current directory
print('dot_env_file_exist: ', dot_env_file_exist)  


from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

dot_env_file_exist:  False


# Subfunctions

In [2]:
def get_next_break_char_index(text_temp):
    temp = [text_temp.find('\n'), text_temp.find('.')]
    out = np.sort(temp)
    # print('out: ', out)
    
    foundvals = [i for i in out if i > -1]
    if any(foundvals):
        ender = min(foundvals)
    else:
        ender = len(text_temp)  # no values found
    # print('ender: ', ender)
    
    return ender

In [3]:
# Transform plain character text to a sentence array, where each sentence is in a nested array.
def text_2_sen(text0):
    flag = 0
    ender = -1
    ender_b4 = ender+1
    text_temp = text0 # a temporary variable to not copy over text0
    
    sen = []
    while ender_b4 < len(text_temp):
        ender_b4 = ender+1
        # print('ender_b4: ', ender_b4)
        
        ender = get_next_break_char_index(text_temp[ender_b4::]) + ender_b4
        # print('ender: ', ender)
        
        sen.append(text_temp[ender_b4:ender])
        # print('sen: ', sen)
    
    # Remove empty nested arrays 
    sen = [i for i in sen if any(i)]
    # print('sen: ', sen)

    return sen

In [4]:
def count_the_number_of_times_a_char_appears(text, char2find):
    c = 0
    for char in text:
        if char == char2find:
            c = c + 1
    # print('c: ', c)
    return c

In [5]:
def remove_text_from_start_end_marker(sentence):

    start_marker = ['(', '{', '[']
    end_marker = [')', '}', ']']

    clean_sen = sentence
    
    for ind in range(len(start_marker)): 
        # print('start_marker[ind]: ', start_marker[ind])

        # Count the number of times the marker appears
        loops = count_the_number_of_times_a_char_appears(clean_sen, start_marker[ind])

        for x in range(loops):
            if start_marker[ind] in clean_sen and end_marker[ind] in clean_sen:
                start_ind = clean_sen.find(start_marker[ind])
                # print('start_ind: ', start_ind)
                
                end_ind = clean_sen.find(end_marker[ind])
                # print('end_ind: ', end_ind)
    
                if start_ind == 0:
                    clean_sen = clean_sen[end_ind+1::]
                else:
                    clean_sen = clean_sen[0:start_ind-1] + clean_sen[end_ind+1::]
            
            # print('clean_sen: ', clean_sen)
    
    return clean_sen

In [None]:
def decode_y(selected_y, Y_seq, y):
    # --------------------------------
    # Decode y
    # --------------------------------
    # label_tokenizer.sequences_to_texts(Y_train_seq[1:2])
    str_in_an_array = [str(i) for i in Y_seq[selected_y:selected_y+1]]
    y_word_tflookup = label_tokenizer.sequences_to_texts(str_in_an_array)[0]
    # print('y_word_tflookup: ', y_word_tflookup)
    
    # OR

    y_word_ylookup = y[selected_y]
    # print('y_word_ylookup: ', y_word_ylookup)
    # --------------------------------

    return y_word_ylookup

In [None]:
def decodetext2seq_predict_decodeseq2text(txt_input, tokenizer, MAXLEN, PADDING, model, label_tokenizer, Y_seq, y):
    # Converts text to the created sequence via the tokenizer object
    token_list = tokenizer.texts_to_sequences([txt_input])[0]
    print('token_list: ', token_list)
    
    # Pad the sequence
    token_list = pad_sequences([token_list], maxlen=MAXLEN, padding=PADDING)
    
    # Predict the last word: outputs the probabilities of a word being selected
    probabilities = model.predict(token_list, verbose=0)
    # print('probabilities: ', probabilities)
    # print('len(probabilities): ', len(np.ravel(probabilities)))
    
    # Select the highest probable word : This picks the best lexical match, but maybe not the best grammatic match
    selected_y = np.argmax(probabilities)
    # print('selected_y: ', selected_y)
    
    y_word = decode_y(selected_y, Y_seq, y)
    
    # Select the best 5  probable word :
    sorted_prob = np.sort(np.ravel(probabilities))  # sorted in ascending order
    sorted_prob = sorted_prob[::-1]  # sorted in descending order
    sorted_prob = sorted_prob[0:10]  # Take first 10 of highes probabilities
    print('sorted_prob: ', sorted_prob)
    
    selected_y_list = np.argsort(np.ravel(probabilities)) # sorting in ascending order
    selected_y_list = selected_y_list[::-1]  # sorted in descending order
    selected_y_list = selected_y_list[0:10]  # Take first 10 of highes probabilities
    print('selected_y_list: ', selected_y_list)

    y_word_list = []
    for i in selected_y_list:
        y_word_list.append(decode_y(i, Y_seq, y))

    return y_word, y_word_list, probabilities

In [None]:
def evaluate_word_selection(probabilities, Y_seq, y):
    # Evaluating the word selection
    probabilities = np.ravel(probabilities)
    
    df = pd.DataFrame()
    for i in range(len(Y_seq)):
        str_in_an_array = [str(i) for i in Y_seq[i:i+1]]
        y_word = label_tokenizer.sequences_to_texts(str_in_an_array)
        df_temp = pd.DataFrame([y_word[0], y[i], Y_seq[i], probabilities[i]]).T
        df = pd.concat([df, df_temp], axis=0)
    
    df.columns = ['y_word_tflookup', 'y_word_ylookup', 'Y_seq', 'probabilities']
    df = df.sort_values(by='probabilities', ascending=False)
    df.reset_index(drop=True, inplace=True)  # inplace means keep index inplace, drop means to include the index as a column

    return df

# Load the text

In [6]:
# https://pypi.org/project/Wikipedia-API/
import wikipediaapi

wiki_wiki = wikipediaapi.Wikipedia('MyProjectName', 'en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

p_wiki = wiki_wiki.page("Breakfast")
text = p_wiki.text
print('text: ', text)

text:  Breakfast is the first meal of the day usually eaten in the morning. The word in English refers to breaking the fasting period of the previous night. Various "typical" or "traditional" breakfast menus exist, with food choices varying by regions and traditions worldwide.

History
In Old English, a regular morning meal was called morgenmete, and the word dinner, which originated from Gallo-Romance desjunare ("to break one's fast"), referred to a meal after fasting. Around mid-13 century, that meaning of dinner faded away, and around 15th century "breakfast" came into use in written English to describe a morning meal.

Ancient breakfast
Ancient Egypt
In Ancient Egypt, peasants ate a daily meal, most likely in the morning, consisting of soup, beer, bread, and onions before they left for work in the fields or work commanded by the pharaohs.The traditional breakfast believed to have been cooked in ancient Egypt was fūl (made from fava beans, possibly the ancestor of today's ful medame

# Convert the text to sentences

In [88]:
def convert_text2_sentences(text):
    
    sen = text_2_sen(text)
    # print('sen: ', sen)
    
    # --------------------------------
    # Add additonal made-up sentences
    # --------------------------------
    sen.append('for breakfast it is typical to drink orange juice and eat eggs')
    sen.append('for breakfast the most common drink for breakfast is orange juice')
    sen.append('breakfast time is often from sunrise to a few hours before lunchtime')
    sen.append('for breakfast the most widely sold and bought drink is orange juice and milk')
    # print('sen: ', sen)
    
    # --------------------------------
    # Clean the sentences
    # --------------------------------
    # Make sentences lowercase
    sen1 = [i.lower() for i in sen]
    
    # Remove parentheses and text in between parentheses, so that phrases are gramatically correct
    sen2 = [remove_text_from_start_end_marker(i) for i in sen1]
    # print('sen2: ', sen2)
    
    # Split the sentences on a comma to make more short and clear sentences
    sen3 = []
    for i in sen2:
        temp = i.split(',')
        for j in temp:
            sen3.append(j)
    # print('sen3: ', sen3)
    
    # Remove undesireable characters 
    to_replace = ["!", ";", '\n', '</p>', '<a', 'id=', "href=", 'title=', 'class=', '</a>', '(', ')', '}', '{',
                  '</sup>', '<p>', '</b>', '<sup', '>', '<', '\\', '-']
    replace_with = ''
    
    sen4 = []
    for i in sen3:
        word_array = i.split()
        # print('word_array: ', word_array)
    
        word_array_new = []
        for wind, word in enumerate(word_array):
            # print('word: ', word)
            
            out = word # initialization
            
            for ind, to_replace_val in enumerate(to_replace):
                # print('to_replace_val: ', to_replace_val)
                out_b4 = out
                out = word.replace(to_replace_val, replace_with)
    
                # Take the shortest out to ensure previous changes are stored
                if len(out_b4) < len(out):
                    out = out_b4
                # print('out: ', out)
            
            # Stores the last changed word    
            word_array_new.append(out)
            
        sen4.append(' '.join(word_array_new))   
    # print('sen4: ', sen4)
    
    
    # --------------------------------
    # Remove sentences with less than 10 words. Narrow the sentences down to realistic sentences.
    # --------------------------------
    sen5 = [i for i in sen4 if len(i.split()) > 10]
    # print('sen5: ', sen5)

    return sen5

# Create y : remove the last word in each sentence 

In [90]:
sentences = convert_text2_sentences(text)

X = []
y = []

for i in sentences:
    words = i.split()
    temp = words[-1]

    # The last word needs to be at least 3 characters long
    if len(temp) > 3 and temp.isnumeric() != True:
        X.append(' '.join(words[0:-1]))  # it automatically removes spaces before and after the sentence
        y.append(temp)
        
print("X: ", X)
print("y: ", y)

X:  ['breakfast is the first meal of the day usually eaten in the', 'the word in english refers to breaking the fasting period of the previous', 'and around 15th century "breakfast" came into use in written english to describe a morning', 'and onions before they left for work in the fields or work commanded by the', "the opening prose of the 16th book of the odyssey mentions breakfast as the meal being prepared in the morning before attending to one's", 'a meal called akratisma was typically consumed immediately after rising in the', 'the earliest attested references on tagēnias are in the works of the 5th century bc poets cratinus and', '1st century latin poet martial said that ientaculum was eaten at 3:00 or 4:00 in the', 'it seems unlikely that any fixed time was truly assigned for this', 'made from roasted spelt wheat or barley that was then pounded and cooked in a cauldron of', 'monarchs and their entourages would spend a lot of time around a table for', 'only two formal meals wer

## Create train and test datasets

In [54]:
X = np.array(X)
y = np.array(y)
print("X.shape: ", X.shape)
print("y.shape: ", y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_train.shape: ", y_train.shape)
print("y_test.shape: ", y_test.shape)

X.shape:  (70,)
y.shape:  (70,)
X_train.shape:  (56,)
X_test.shape:  (14,)
y_train.shape:  (56,)
y_test.shape:  (14,)


## Encode sentences

In [55]:
vocab_size = 7000  # Desired number of vocabolary words that you want in the "word dictionary". Tokenize assigns 
                    # a number to each new word up to this value.
oov_tok = "<OOV>"   # Text to replace 'out of vocabulary' words

# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary
# tokenizer.fit_on_texts(X_train)
tokenizer.fit_on_texts(X) #  (BEST)

# Print the length of the word index. This is the number of vocabulary words in the dictionary.
word_index = tokenizer.word_index
print(f'number of words in word_index: {len(word_index)}')

# Print the word index
print(f'word_index: {word_index}')

number of words in word_index: 441
word_index: {'<OOV>': 1, 'the': 2, 'of': 3, 'and': 4, 'in': 5, 'breakfast': 6, 'a': 7, 'to': 8, 'was': 9, 'that': 10, 'for': 11, 'is': 12, 'it': 13, 'meal': 14, 'were': 15, 'first': 16, 'as': 17, 'by': 18, 'before': 19, 'have': 20, 'with': 21, 'eaten': 22, 'morning': 23, 'or': 24, 'century': 25, 'consumed': 26, 'time': 27, 'who': 28, 'bread': 29, 'an': 30, 'coffee': 31, 'during': 32, 'eat': 33, 'most': 34, 'cereals': 35, 'usually': 36, 'into': 37, 'they': 38, 'said': 39, 'this': 40, 'from': 41, 'their': 42, 'would': 43, 'not': 44, 'energy': 45, 'been': 46, 'had': 47, 'common': 48, 'consume': 49, 'cold': 50, 'chocolate': 51, 'became': 52, 'maple': 53, 'processed': 54, 'health': 55, 'cereal': 56, 'children': 57, 'juice': 58, 'history': 59, 'drink': 60, 'orange': 61, 'day': 62, 'english': 63, 'refers': 64, 'around': 65, 'came': 66, 'use': 67, 'work': 68, '16th': 69, 'being': 70, 'after': 71, 'are': 72, 'at': 73, '00': 74, 'truly': 75, 'made': 76, 'cooked

In [56]:
# Desired length of sequences
MAXLEN = 120  # Pick something small 
# OU
# MAXLEN = len(word_index)  # Total length of sequences

PADDING = 'post'  # OR 'pre'

# -----------------

#  (BEST)
X_seq = tokenizer.texts_to_sequences(X)

# Pad the sequences using the post padding strategy
X_seq = pad_sequences(X_seq, maxlen=MAXLEN, padding=PADDING, truncating=PADDING)

print(f"First padded sequence looks like this: \n\n{X_seq[0]}\n")
print(f"Numpy array of all sequences has shape: {X_seq.shape}\n")
print(f"This means there are {X_seq.shape[0]} sequences in total and each one has a size of {X_seq.shape[1]}")

# -----------------

# X_train_seq = tokenizer.texts_to_sequences(X_train)

# Pad the sequences using the post padding strategy
# X_train_seq = pad_sequences(X_train_seq, maxlen=MAXLEN, padding=PADDING, truncating=PADDING)

# print(f"First padded sequence looks like this: \n\n{X_train_seq[0]}\n")
# print(f"Numpy array of all sequences has shape: {X_train_seq.shape}\n")
# print(f"This means there are {X_train_seq.shape[0]} sequences in total and each one has a size of {X_train_seq.shape[1]}")

# -----------------

X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences using the post padding strategy
X_test_seq = pad_sequences(X_test_seq, maxlen=MAXLEN, padding=PADDING, truncating=PADDING)

print(f"First padded sequence looks like this: \n\n{X_test_seq[0]}\n")
print(f"Numpy array of all sequences has shape: {X_test_seq.shape}\n")
print(f"This means there are {X_test_seq.shape[0]} sequences in total and each one has a size of {X_test_seq.shape[1]}")


First padded sequence looks like this: 

[ 6 12  2 16 14  3  2 62 36 22  5  2  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]

Numpy array of all sequences has shape: (70, 120)

This means there are 70 sequences in total and each one has a size of 120
First padded sequence looks like this: 

[ 51   4  31  15 102   8 270 271   5   2   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 

In [57]:
# Tokenize_labels : ONLY if labels are text 


# -----------------
# Way 0: using y_train
# -----------------
# Instantiate the Tokenizer (no additional arguments needed)
# num_words = len(y_train)
# label_tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")

# Fit the tokenizer on all the labels
# label_tokenizer.fit_on_texts(y_train)  # I used all of y, so it can learn words from both y_train and y_test

# -----------------
# Way 1: using y
# -----------------
# Instantiate the Tokenizer (no additional arguments needed)  (BEST)
num_words = len(y)
label_tokenizer = Tokenizer(num_words=num_words)

# Fit the tokenizer on all the labels
label_tokenizer.fit_on_texts(y)  # I used all of y, so it can learn words from both y_train and y_test

# -----------------

# Convert labels to sequences (BEST)
Y_seq = label_tokenizer.texts_to_sequences(y)

# Convert sequences to a numpy array. Don't forget to substact 1 from every entry in the array!
Y_seq = np.array([i-1 for i in np.array(Y_seq)])

# -----------------

# Convert labels to sequences
# Y_train_seq = label_tokenizer.texts_to_sequences(y_train)

# Convert sequences to a numpy array. Don't forget to substact 1 from every entry in the array!
# Y_train_seq = np.array([i-1 for i in np.array(Y_train_seq)])

# -----------------

# Convert labels to sequences
Y_test_seq = label_tokenizer.texts_to_sequences(y_test)

# Convert sequences to a numpy array. Don't forget to substact 1 from every entry in the array!
Y_test_seq = np.array([i-1 for i in np.array(Y_test_seq)])

In [58]:
# Review size of data
# -----------------

#  (BEST)
X_seq = np.array(X_seq)
print('X_seq.shape: ', X_seq.shape)

Y_seq = np.reshape(Y_seq, (len(Y_seq),))
print('Y_seq.shape: ', Y_seq.shape)
print('max(Y_seq): ', max(Y_seq))

# -----------------

# X_train_seq = np.array(X_train_seq)
# print('X_train_seq.shape: ', X_train_seq.shape)

# Y_train_seq = np.reshape(Y_train_seq, (len(Y_train_seq),))
# print('Y_train_seq.shape: ', Y_train_seq.shape)
# print('max(Y_train_seq): ', max(Y_train_seq))

# -----------------

# X_test_seq = np.array(X_test_seq)
# print('X_test_seq.shape: ', X_test_seq.shape)

# Y_test_seq = np.reshape(Y_test_seq, (len(Y_test_seq),))
# print('Y_test_seq.shape: ', Y_test_seq.shape)
# print('max(Y_test_seq): ', max(Y_test_seq))

X_seq.shape:  (70, 120)
Y_seq.shape:  (70,)
max(Y_seq):  54


## Build the model

In [59]:
def text_model(n_a, input_dim, output_dim, input_length, return_sequences, n_outputs, loss_function_type):

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(n_a, return_sequences=return_sequences)))
    # model.add(LSTM(n_a, input_shape=(timesteps_train, feature), return_sequences=return_sequences, return_state=return_state))
    
    model.add(tf.keras.layers.Flatten())
    
    
    # Types of W initializer :
    initializer = tf.keras.initializers.HeUniform()

    model.add(tf.keras.layers.Dense(n_outputs, activation='softmax', kernel_initializer=initializer))

    # Compile the model for training
    opt = tf.keras.optimizers.Adam(learning_rate=0.01)
    # opt = keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)
    # opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)

    # Select a loss function
    if loss_function_type == 'binary':
        # Number of input are 
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()])  # optimizer='adam'
    elif loss_function_type == 'categorical':  
        # When to use categorical_crossentropy?
        # says categorical_crossentropy work when y is in one-hot form - but this gave an error
        model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tf.keras.metrics.AUC()])  # optimizer='adam'
    else:
        model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['acc'])
    
    model.summary()

    return model


## Specify model compile settings

In [60]:
input_dim=vocab_size     # this is the Size of the vocabulary. You put this instead of word_index, because
# you specify this as the max size for the Tokenizer.

# ---------------

embedding_dim = 16  # Desired output length of the model layer 
output_dim=embedding_dim      #  Dimension of the dense embedding, the size that you want the output layer to be
                              # or the dimension of the vector space for each word

# ---------------

# Length of maximum sentence sequences (subject to text)
input_length=MAXLEN  # Desired length of sequences

# ---------------

n_a = 32  # hidden layer size
return_sequences = True  # obtain an output for each timestep for each batch

# ---------------

# n_outputs = len(Y_train_seq)
n_outputs = len(Y_seq)

# ---------------

# 2D tensor with shape: (batch_size, input_length)
# 3D tensor with shape: (batch_size, input_length, output_dim)

## Specify callbacks

In [61]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# Well stop if there is no improvement after 5 epochs, OR if the accuracy reaches 0.9
early_stoping_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', baseline=0.90)

## Compile the model

In [62]:
loss_function_type = "sparse"
model = text_model(n_a, input_dim, output_dim, input_length, return_sequences, n_outputs, loss_function_type)

# Train the model
# Need to compile the model everytime one wants to retrain the model, other wise it will train the 
# model starting with the final weights

# -----------------
# Way 0: using X_train and y_train
# -----------------
# model.fit(X_train_seq, Y_train_seq, validation_data=(X_test_seq, Y_test_seq), epochs=60, callbacks=[tensorboard_callback])

# -----------------
# Way 1: using X and y
# -----------------
model.fit(X_seq, Y_seq, validation_data=(X_test_seq, Y_test_seq), epochs=60, callbacks=[tensorboard_callback])


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           112000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 120, 64)           12544     
 onal)                                                           
                                                                 
 flatten_1 (Flatten)         (None, 7680)              0         
                                                                 
 dense_1 (Dense)             (None, 70)                537670    
                                                                 
Total params: 662214 (2.53 MB)
Trainable params: 662214 (2.53 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/60


  output, from_logits = _get_logits(


Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.src.callbacks.History at 0x7f39bdc692d0>

## Test the model

In [79]:
which_test = 1

# --------------------------------

if which_test == 0:
    # Test 0: test with made-up sentences
    txt_input = 'for breakfast I drink'
    txt_input = 'for breakfast I eat'
    # txt_input = 'I eat breakfast at'

else:
    # Test 1: test with sentences from X_train
    # Decode the first sequence using the Tokenizer class
    num = 4
    out = tokenizer.sequences_to_texts(X_seq[num:num+1])
    print('out: ', out)
    
    a = out[0].split()
    txt_input = [i for i in a if i != '<OOV>']
    txt_input = ' '.join(txt_input)
    print('txt_input: ', txt_input)
    
    # Real answer
    y_real = decode_y(num, Y_seq, y)
    print('Real sentence: ', txt_input + ' ' + y_real)

# --------------------------------

y_word, y_word_list, probabilities = decodetext2seq_predict_decodeseq2text(txt_input, tokenizer, MAXLEN, PADDING, model, label_tokenizer, Y_seq, y)

# --------------------------------

# Predicted answer
for i in y_word_list:
    print('Predicted sentence: ', txt_input + ' ' + i)

# --------------------------------

df = evaluate_word_selection(probabilities, Y_seq, y)
df


token_list:  [11, 6, 1, 33]
sorted_prob:  [0.1702227  0.12745717 0.1014429  0.10117909 0.08956861 0.08351177
 0.07953458 0.06113645 0.05223711 0.03478472]
selected_y_list:  [52  1 37 23  2 48 53 33 54 35]
Predicted sentence:  for breakfast I eat movement
Predicted sentence:  for breakfast I eat night
Predicted sentence:  for breakfast I eat region
Predicted sentence:  for breakfast I eat ramadan
Predicted sentence:  for breakfast I eat meal
Predicted sentence:  for breakfast I eat eggs
Predicted sentence:  for breakfast I eat states
Predicted sentence:  for breakfast I eat mid1600s
Predicted sentence:  for breakfast I eat vitamins
Predicted sentence:  for breakfast I eat drink


Unnamed: 0,y_word_tflookup,y_word_ylookup,Y_seq,probabilities
0,,movement,42,0.170223
1,,night,8,0.127457
2,,region,30,0.101443
3,,ramadan,20,0.101179
4,,meal,2,0.089569
...,...,...,...,...
65,,hours,21,0.000023
66,,"breakfast""",1,0.000023
67,,beans,34,0.000022
68,,breakfast,1,0.000017


In [77]:
# y_real = 'juice'
# y_real = 'milk'
df[(df['y_word_ylookup'] == y_real)]


Unnamed: 0,y_word_tflookup,y_word_ylookup,Y_seq,probabilities
61,,pharaohs,9,2e-06


In [84]:
df[(df['y_word_ylookup'] == 'night')]

Unnamed: 0,y_word_tflookup,y_word_ylookup,Y_seq,probabilities
1,,night,8,0.127457


In [None]:
df[(df['y_word_ylookup'] == 'eggs')]

## These results are not bad, but they are not perfect. The best grammatical word is always in the top 10!

# There are several ways to make the result better :

- Option 0: I could train the model with more breakfast data OR text data
- Option 1: I could use a pre-trained model (fine tuning)
- Option 2: I could use a pre-trained model (transfer learning)