# Neural Network Train Predict

In [None]:
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import GRU
from keras.layers import Embedding
import pickle
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.constraints import max_norm
import numpy as np

Import the processed data, along with the supporting data needed for model training.

In [None]:
# Pickle file names follow the pattern:
#   date-time-word frequency minimum-sequence length-number of posts
pickle_file = "../processed_data/20200620-230848-50-4-5000"

# The pickle unpacks into five values: the feature and target arrays, the
# fitted tokenizer, the vocabulary size, and the n-gram length.
with open(pickle_file, "rb") as pickle_handle:
    X, y, tokenizer, len_vocab, ngram_len = pickle.load(pickle_handle)

# Quick sanity check on how many distinct token ids appear on each side.
print("Number of unique features", np.unique(X).size)
print("Number of unique targets", np.unique(y).size)

Dropout helps prevent overfitting, as does (in theory) adding a max-norm constraint to the kernel of the final dense layer.

In [None]:
# Hyperparameters for the network defined below.
number_of_embeddings = 100  # second argument to Embedding (embedding vector size)
GRU_units = 256  # number of units in the GRU layer
dropout = 0.4  # dropout rate applied after the embedding; 0 skips the layer

# define model: Embedding -> (optional Dropout) -> GRU -> softmax over the vocab
model = Sequential()
# Inputs are ngram_len-1 tokens long (presumably the last token of each
# n-gram is the prediction target -- confirm against the preprocessing step).
model.add(Embedding(len_vocab, number_of_embeddings, input_length=ngram_len-1))
if dropout > 0:
    model.add(Dropout(dropout))
model.add(GRU(GRU_units))
# The max-norm constraint on the output kernel is a further regulariser.
model.add(Dense(len_vocab, activation='softmax', kernel_constraint=max_norm(2.)))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Compile the network: sparse categorical cross-entropy loss with the Adam
# optimizer, tracking accuracy as an additional metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Write callback section

In [None]:
# Destination for the best weights seen during training.
filepath = "../best_callbacks/weights.best.hdf5"

# Both callbacks must monitor the same logged key. With metrics=['accuracy']
# current Keras logs validation accuracy as "val_accuracy"; the checkpoint
# previously watched 'val_acc', which is never logged there, so no "best"
# weights were ever saved.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max')
# Stop as soon as validation accuracy fails to improve by at least 0.005
# for one epoch.
early_stopping = EarlyStopping(monitor="val_accuracy", patience=1, min_delta=0.005)
callbacks_list = [checkpoint, early_stopping]

In [None]:
# Number of training samples (rows of X).
len_seq = X.shape[0]


def build_model(X, y, v_split=0.2, number_of_epochs=10,
                verbose=1, cbs=callbacks_list):
    """
    Fit the module-level ``model`` on ``X`` / ``y``.

    Despite the name, nothing is constructed here: the already compiled
    global ``model`` is trained in place. Using default parameters allows
    for simpler calling.

    Parameters
    ----------
    X, y : training features and targets, passed straight to ``model.fit``.
    v_split : fraction forwarded as ``validation_split``.
    number_of_epochs : maximum number of epochs to train for.
    verbose : verbosity level forwarded to ``model.fit``.
    cbs : list of Keras callbacks; defaults to ``callbacks_list``.

    Returns
    -------
    None
    """
    # Use the function's own parameter; the previous code read the global
    # ``num_epochs``, which ignored the argument and raised NameError when
    # that global had not been defined yet.
    model.fit(X, y, validation_split=v_split, epochs=number_of_epochs,
              verbose=verbose, callbacks=cbs)


print("Total samples: ", len_seq)

Train the model. With callbacks, training ends far before the maximum number of epochs (for small data) given the monitoring of val_loss. It runs for longer when monitoring val_acc, because the distribution of the final output changes, increasing accuracy but also increasing loss (discrete vs. continuous metrics). Trying both val_loss and val_acc.


# Model training


In [None]:
# Training configuration.
validation_split = 0.33  # fraction forwarded to build_model as v_split
num_epochs = 20          # maximum number of epochs to train for
#batch_n = 50

# fit network
build_model(X, y, number_of_epochs=num_epochs, v_split=validation_split)

Function that generates text from a seed: it repeatedly predicts the next word as an integer index and converts it back to text, for a given number of predictions.

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Extend ``seed_text`` with ``n_words`` greedily predicted words.

    Parameters
    ----------
    model : fitted model whose ``predict`` returns one probability per
        vocabulary index for each input row.
    tokenizer : fitted tokenizer providing ``texts_to_sequences`` and
        ``word_index`` (word -> integer index).
    max_length : length the encoded text is pre-padded to before prediction
        (with Keras defaults, longer inputs are also truncated from the
        front, keeping the most recent tokens).
    seed_text : starting text.
    n_words : number of words to append; zero or negative returns
        ``seed_text`` unchanged.

    Returns
    -------
    str
        ``seed_text`` followed by the generated words, space separated. An
        index with no matching word (e.g. the padding index) contributes an
        empty word.
    """
    # Invert word -> index once, so each lookup is O(1) instead of scanning
    # the whole vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict_classes() has been removed from recent TensorFlow/Keras
        # releases; taking the argmax of the predicted probabilities is the
        # documented replacement for a softmax output.
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        # append to input
        in_text += ' ' + out_word
    return in_text

In [None]:

# The fitted tokenizer, under the name used by the generation cells.
keras_embedder = tokenizer

# Seed text for a one-off generation.
test = "Reddit please help my wife has"

# Append 15 predicted words to the seed; inputs are padded to ngram_len-1.
generate_seq(model=model, tokenizer=keras_embedder, max_length=ngram_len-1,
             seed_text=test, n_words=15)

In [None]:
from datetime import date

# Set to True to append the generated outputs to the results file as well
# as printing them.
write = False
today = str(date.today()) + "\n"

# Seed phrases used to sample the trained model.
tests = ["My wife is not", "My husband has a", "My friend can", "My fiance did", 
         "My gf has not", "My girlfriend wants to",
         "My girlfriend did not", "My boyfriend can", "My partner will", 
         "My wife (23F)", "My spouse is", "He does not", 
         "Help I", "Is there no", "I don't know", "My (18F) boyfriend", "There is a",
         "Help, my (21F) boyfriend has", "My colleague keeps making posts",
         "Me (52M) not sure about my", "I need help with",
         "Does anyone know how I (41M)",
         "Can I get", "My in laws can",
         "Help reddit, my husband is",
         "I can't keep going on", "aita for wanting"]

# How long should the generated sequence be
length = 25

# Generate (and print) every output once, regardless of whether it is also
# written to disk; previously this loop was duplicated in both branches.
test_outputs = []
for each_test in tests:
    test_output = generate_seq(model, keras_embedder,
                               ngram_len-1, each_test, length) + "\n"
    print(test_output)
    test_outputs.append(test_output)

if write:
    # The context manager closes the file even if a write fails.
    with open("../tests/GRU_test_outputs_final.txt", "a") as f:
        # Header: the date on one line, then the run tag (presumably the
        # sequence length and post count of the data set) on its own line.
        # The trailing newline was missing, which glued the first output
        # onto the tag.
        f.write(today + "4-5000\n")
        f.writelines(test_outputs)
        
        

In [None]:
# Encode the n-gram length, GRU width and number of training sequences in
# the file name so saved models from different runs can be told apart.
saved_model_path = "../saved_models/GRU-model-ngram{}-units{}-nseq{}.h5".format(
    ngram_len,
    GRU_units,
    X.shape[0],
)
model.save(saved_model_path)