The steps of the approach are outlined below:
1. Convert abstracts from list of strings into list of lists of integers (sequences)
2. Create feature and labels from sequences
3. Build LSTM model with Embedding, LSTM, and Dense layers
4. Load in pre-trained embeddings
5. Train model to predict next work in sequence
6. Make predictions by passing in starting sequence

In [1]:
import json
import numpy as np

In [2]:
features = []
labels = []

with open("../../data/domain2_train.json") as f:
    for line in f:
        # read line by line
        data = json.loads(line)
        
        # add values
        features.append(data["text"])
        labels.append(data["label"])

In [3]:
h_len = 1000
v_len = len(features)

In [4]:
training_matrix = np.zeros((v_len, h_len), dtype=int)
result_matrix = np.zeros(v_len, dtype=int)

In [5]:
i = 0
for f, l in zip(features, labels):
    if len(f) < h_len:
        elements_to_add = h_len - len(f)
        training_matrix[i] = np.pad(f, (0, elements_to_add), 'constant')
    else:
        training_matrix[i] = np.array(f[:h_len])
    result_matrix[i] = l
    
    i = i + 1

In [6]:
X = training_matrix
y = result_matrix

In [7]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [8]:
# X = None
# y = None
# start = True
# size = 5000

# count = 0

# with open("../../data/domain1_train.json") as f:
#     for line in f:
#         # read line by line
#         data = json.loads(line)
        
#         # Bag of Words
#         new_line = np.zeros((1, size))
#         for i in data["text"]:
#             new_line[0][i] = new_line[0][i] + 1

#         if start:
#             X = np.array(new_line)
#             y = np.array(data["label"])
#             start = False
#         else:
#             X = np.append(X, new_line, axis=0)
#             y = np.append(y, data["label"])

#         count = count + 1
#         if count == 1000:
#             break

In [9]:
X.shape

(14900, 1000)

In [10]:
y.shape

(14900,)

In [11]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

num_features = 5000
sequence_length = X.shape[1]
# embedding_matrix = []

model = Sequential()

# Embedding layer
model.add(
    Embedding(input_dim=num_features,
              output_dim=100,
              input_length = sequence_length,
              # weights=[embedding_matrix],
              # trainable=False,
              mask_zero=True))

# Masking layer for pre-trained embeddings
model.add(Masking(mask_value=0.0))

# Recurrent layer
model.add(LSTM(64, return_sequences=False, 
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(1, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         500000    
                                                                 
 masking (Masking)           (None, 1000, 100)         0         
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 546465 (2.08 MB)
Trainable params: 546465 

In [13]:
EPOCHS = 10
BATCH_SIZE = 2048

history = model.fit(
    X,
    y,
    epochs=10,
    batch_size=BATCH_SIZE
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
result = model.predict(X)



In [20]:
result

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)

In [16]:
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from keras.optimizers import Adam

from keras.utils import plot_model

In [17]:
def make_word_level_model(num_words,
                          embedding_matrix,
                          lstm_cells=64,
                          trainable=False,
                          lstm_layers=1,
                          bi_direc=False):
    """Make a word level recurrent neural network with option for pretrained embeddings
       and varying numbers of LSTM cell layers."""

    model = Sequential()

    # Map words to an embedding
    if not trainable:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=False,
                mask_zero=True))
        model.add(Masking())
    else:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=True))

    # If want to add multiple LSTM layers
    if lstm_layers > 1:
        for i in range(lstm_layers - 1):
            model.add(
                LSTM(
                    lstm_cells,
                    return_sequences=True,
                    dropout=0.1,
                    recurrent_dropout=0.1))

    # Add final LSTM cell layer
    if bi_direc:
        model.add(
            Bidirectional(
                LSTM(
                    lstm_cells,
                    return_sequences=False,
                    dropout=0.1,
                    recurrent_dropout=0.1)))
    else:
        model.add(
            LSTM(
                lstm_cells,
                return_sequences=False,
                dropout=0.1,
                recurrent_dropout=0.1))
    model.add(Dense(128, activation='relu'))
    # Dropout for regularization
    model.add(Dropout(0.5))

    # Output layer
    model.add(Dense(num_words, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model


In [18]:
num_words = 50
embedding_matrix = np.zeros((num_words, vectors.shape[1]))
LSTM_CELLS = 64


RANDOM_STATE = 50
EPOCHS = 150
BATCH_SIZE = 2048
TRAINING_LENGTH = 50
TRAIN_FRACTION = 0.7
LSTM_CELLS = 64
VERBOSE = 0
SAVE_MODEL = True

NameError: name 'vectors' is not defined

In [None]:
model = make_word_level_model(
    num_words,
    embedding_matrix=embedding_matrix,
    lstm_cells=LSTM_CELLS,
    trainable=False,
    lstm_layers=1)
model.summary()

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, Dense


In [None]:
sequence_length = 100
input_dim = 5000
output_dim = 1

num_epochs = 100
batch_size = 1000

In [None]:
model = tf.keras.Sequential([
    SimpleRNN(units=64, activation='tanh', return_sequences=True, input_shape=(sequence_length, input_dim)),
    Dense(units=output_dim, activation='softmax')
])


In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
X.shape

(100, 5000)

In [None]:
model.fit(X, y, epochs=num_epochs, batch_size=batch_size)


Epoch 1/100


ValueError: in user code:

    File "/Users/jonghopark/miniconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "/Users/jonghopark/miniconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/jonghopark/miniconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "/Users/jonghopark/miniconda3/lib/python3.9/site-packages/keras/src/engine/training.py", line 1080, in train_step
        y_pred = self(x, training=True)
    File "/Users/jonghopark/miniconda3/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/jonghopark/miniconda3/lib/python3.9/site-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_6" is incompatible with the layer: expected shape=(None, 100, 5000), found shape=(1000, 5000)


In [None]:
loss, accuracy = model.evaluate(X_val, y_val)
predictions = model.predict(X_test)


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(units=64, return_sequences=True))
model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
