In [57]:
"""
This file should contain CNN-type neural networks that process sentences as frame + variable.
Presently, they are implemented with Keras.
"""
import warnings # suppress some warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [74]:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dropout, Flatten, Input, Reshape, Embedding
from tensorflow.keras.layers import Conv1D, Conv2D, Dense, MaxPooling1D, MaxPooling2D
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import concatenate
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy, MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay, PiecewiseConstantDecay, PolynomialDecay, InverseTimeDecay
from tensorflow.keras.utils import to_categorical, plot_model

In [56]:
import numpy as np

In [55]:
import spacy
nlp = spacy.load("en_core_web_md")

In [5]:
# pretend_X = []
# pretend_sentences = ["I have one dream .", "The plane landed outside of the airport .", "testing testing"]
# pretend_max_sent_len = 10
# for s in pretend_sentences:
#     vecs = np.zeros((pretend_max_sent_len, 300))
#     tokens = nlp(s)
#     for i, t in enumerate(tokens):
#         vecs[i] = t.vector
#     pretend_X.append(vecs)
# pretend_X = np.array(pretend_X)

# pretend_Y = [3, 5, 0]
# pretend_Y = to_categorical(pretend_Y, num_classes=pretend_max_sent_len)

In [6]:
# TODO: can add attention? can use recurrent structure?
class textToVarNN:
    """Keras model that scores each token position of a sentence as the variable slot.

    A sentence is fed in as a ``(max_sentence_len, word_vec_len)`` matrix of word
    vectors; the network emits a softmax over the ``max_sentence_len`` positions.
    """
    # can increase if computer has GPU (may want to check if GPU is in use by tensorflow)
    batch_size = 16
    # at least 200 for actual training, use less epochs for tweaking when you can see patterns quickly
    epochs = 20
    max_sentence_len = 10 # EDITable value; make sure same for both networks
    word_vec_len = 300 # shape of spacy word vec

    def __init__(self):
        """Build and compile the Sequential model, then print its layer summary."""
        model = Sequential()

        # input indicates shape of X for a single entry
        model.add(Input(shape=(self.max_sentence_len, self.word_vec_len)))

        # the middle layers can be TWEAKed
        # to find a configuration that gives good results
        # candidate layers: Conv1D, Conv2D, Dense, Dropout, Flatten, MaxPooling1D, MaxPooling2D
        # candidate activations: relu, softmax, sigmoid, None
        # an advanced activation is also available as a layer: LeakyReLU()
        # Note: it is not usually useful to have more than 10 layers
        # Note: first dimension of a layer is most efficient at 2^n for some n
        model.add(Dense(128, activation='relu'))
        model.add(Dense(32))

        # output has to be within sentence len,
        # since training Y is going to be one-hot vectors
        # indicating which one of the input words is the variable.
        # alternatively, look into the CategoryEncoding layer
        model.add(Flatten())
        model.add(Dense(self.max_sentence_len, activation='softmax'))

        # the learning rate can be TWEAKed
        # see https://keras.io/api/optimizers/
        # see https://keras.io/api/optimizers/learning_rate_schedules/
        lr_schedule = ExponentialDecay(
            initial_learning_rate=1e-2,
            decay_steps=10000,
            decay_rate=0.9)
        optimizer = Adam(learning_rate=lr_schedule)

        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer)
        self.model = model

        # Note: if it runs very slow, or stalls the computer,
        # the reason might be too many trainable parameters.
        # remember a batch of X, Y, and the parameters are all
        # held in memory when this is being trained
        # summary() prints the table itself and returns None, so wrapping it in
        # print() would only add a stray "None" line to the output.
        self.model.summary()

    def train(self, X, Y, batch_size=batch_size, epochs=epochs):
        """Fit the model on ndarrays ``X`` (inputs) and ``Y`` (one-hot targets).

        ``batch_size`` and ``epochs`` default to the class-level values captured
        when the class body was executed. 20% of the data is held out for
        validation (``validation_split=0.2``).
        """
        # when it trains, the validation error should be trending down
        self.model.fit(X, Y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

    def predict(self, X):
        """Sample one position per input row, weighted by the predicted probabilities.

        Args:
            X: ndarray of inputs accepted by ``self.model.predict``.

        Returns:
            A list with one sampled index per row of the model output.

        Raises:
            ValueError: if a predicted row does not have a positive, finite sum.
        """
        pred_Y = self.model.predict(X)
        return [self._sample_index(row) for row in pred_Y]

    @staticmethod
    def _sample_index(probs):
        """Draw one index from ``probs`` after renormalising it in float64."""
        # Softmax output is float32; its rounding drift can make
        # np.random.choice reject the row ("probabilities do not sum to 1"),
        # so the row is upcast and renormalised before sampling.
        probs = np.asarray(probs, dtype=np.float64)
        total = probs.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError("predicted probabilities must have a positive, finite sum")
        return np.random.choice(np.arange(len(probs)), p=probs / total)

In [None]:
# test = textToVarNN()
# test.train(pretend_X, pretend_Y, epochs=4)
# testpred = test.predict(pretend_X)

In [10]:
# plot_model(test.model, "temp.png", show_shapes=True)

In [78]:
# Toy dataset for smoke-testing frameToVarNN.
# For every sentence, the token at the given variable index is pulled out as the
# target word vector, and its row in the frame matrix is left as zeros.
pretend_sentences = ["I have one dream .", "The plane landed outside of the airport .", "testing testing"]
pretend_max_sent_len = 10
pretend_X_varindex = np.array([2, 6, 1])  # variable position, one per sentence

pretend_X_frame = []
pretend_Y_var = []
for sent_no, sentence in enumerate(pretend_sentences):
    frame = np.zeros((pretend_max_sent_len, 300))  # zero rows pad short sentences
    var_pos = pretend_X_varindex[sent_no]
    for tok_no, token in enumerate(nlp(sentence)):
        if tok_no != var_pos:
            frame[tok_no] = token.vector
        else:
            # the variable word is the prediction target, so it stays blank in the frame
            pretend_Y_var.append(token.vector)
    pretend_X_frame.append(frame)

# stack the per-sentence results into ndarrays
pretend_X_frame = np.array(pretend_X_frame)
pretend_Y_var = np.array(pretend_Y_var)

# integer positions -> one-hot rows of width pretend_max_sent_len
pretend_X_varindex = to_categorical(pretend_X_varindex, num_classes=pretend_max_sent_len)

# the frame and varindex targets are the very same arrays as the inputs
pretend_Y_frame = pretend_X_frame
pretend_Y_varindex = pretend_X_varindex

In [103]:
# TODO: use GloVe embedding directly from keras, rather than through SpaCy. 
class frameToVarNN:
    """Two-input, three-output Keras model over a sentence frame and a variable index.

    Inputs are a ``(max_sentence_len, word_vec_len)`` frame of word vectors and a
    ``(max_sentence_len,)`` variable-index vector. Outputs are ``frame_pred``
    (per-position word vectors), ``varindex_pred`` (softmax over positions) and
    ``var_pred`` (a single word vector).
    """
    # can increase if computer has GPU (may want to check if GPU is in use by tensorflow)
    batch_size = 16
    # at least 200 for actual training, use less epochs for tweaking when you can see patterns quickly
    epochs = 20
    max_sentence_len = 10 # EDITable value; make sure same for both networks
    word_vec_len = 300 # shape of spacy word vec

    def __init__(self):
        """Build and compile the functional model and store it in ``self.model``."""
        # here, it probably makes the job for the nn easier if we leave a blank at varindex for frame input
        frame_input = Input(shape=(self.max_sentence_len, self.word_vec_len), name='frame_input')
        varindex_input = Input(shape=(self.max_sentence_len, ), name='varindex_input') # can use embedding layer here

        # the middle layers can be TWEAKed

        frame_features = Dense(128)(frame_input)
        frame_features = Flatten()(frame_features)

        varindex_features = varindex_input

        # shared trunk over both inputs, reshaped back to one 16-wide row per position
        features = concatenate([frame_features, varindex_features])
        features = Dense(128)(features)
        features = Dense(self.max_sentence_len * 16)(features)
        features = Reshape((self.max_sentence_len, 16))(features)

        # per-position head: one word vector for every sentence position
        frame_pred = Dense(self.word_vec_len, name='frame_pred')(features)

        # sentence-level heads work on the flattened trunk
        features = Flatten()(features)
        varindex_pred = Dense(self.max_sentence_len, name='varindex_pred', activation='softmax')(features)
        var_pred = Dense(self.word_vec_len, name='var_pred')(features)

        model = Model(inputs=[frame_input, varindex_input], outputs=[frame_pred, varindex_pred, var_pred])

        # the learning rate can be TWEAKed
        # see https://keras.io/api/optimizers/
        # see https://keras.io/api/optimizers/learning_rate_schedules/
        lr_schedule = ExponentialDecay(
            initial_learning_rate=1e-2,
            decay_steps=10000,
            decay_rate=0.9)
        optimizer = Adam(learning_rate=lr_schedule)

        # losses are listed in the same order as the model outputs
        model.compile(loss=[
            MeanSquaredError(),
            CategoricalCrossentropy(),
            MeanSquaredError()
        ], optimizer=optimizer)

        self.model = model
        # model.summary()  # uncomment to inspect the layer shapes

    def train(self, X_frame, X_varindex, Y_frame, Y_varindex, Y_var, batch_size=batch_size, epochs=epochs):
        """Fit the model; inputs and targets are routed to the layers by name.

        ``batch_size`` and ``epochs`` default to the class-level values captured
        when the class body was executed. 20% of the data is held out for
        validation (``validation_split=0.2``).
        """
        # when it trains, the validation error should be trending down
        self.model.fit(
            {'frame_input': X_frame, 'varindex_input': X_varindex},
            {'frame_pred': Y_frame, 'varindex_pred': Y_varindex, 'var_pred': Y_var},
            batch_size=batch_size,
            epochs=epochs,
            validation_split=0.2
        )

    def predict(self, X_frame, X_varindex):
        """Predict a variable index (sampled) and a variable word vector per input row.

        Args:
            X_frame: frame input batch passed to ``self.model.predict``.
            X_varindex: variable-index input batch passed to ``self.model.predict``.

        Returns:
            ``(pred_index, pred_var)``: a list with one sampled index per row of
            the ``varindex_pred`` output, and the ``var_pred`` output unchanged.
            The ``frame_pred`` output is discarded.

        Raises:
            ValueError: if a predicted index row does not have a positive, finite sum.
        """
        # TODO: return one variable from predictions using weighted chance
        # TODO requires predicting categorical var rather than word vector
        _, pred_index, pred_var = self.model.predict([X_frame, X_varindex])
        # picks a random index, using probabilities weighted by pred_index[i]
        pred_index = [self._sample_index(row) for row in pred_index]
        return (pred_index, pred_var)

    @staticmethod
    def _sample_index(probs):
        """Draw one index from ``probs`` after renormalising it in float64."""
        # Softmax output is float32; its rounding drift can make
        # np.random.choice reject the row ("probabilities do not sum to 1"),
        # so the row is upcast and renormalised before sampling.
        probs = np.asarray(probs, dtype=np.float64)
        total = probs.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError("predicted probabilities must have a positive, finite sum")
        return np.random.choice(np.arange(len(probs)), p=probs / total)

In [None]:
# test = frameToVarNN()
# test.train(pretend_X_frame[:2], pretend_X_varindex[:2], pretend_Y_frame[:2], pretend_Y_varindex[:2], pretend_Y_var[:2],
#            epochs=6)
# testpred = test.predict(pretend_X_frame, pretend_X_varindex)
# print(testpred)


In [None]:
# plot_model(test.model, "temp.png", show_shapes=True)