In [None]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np
import string
import os
import gensim
import keras
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint

In [None]:
# data source: https://www.kaggle.com/datasets/adhok93/presidentialaddress/

# Read the speeches and discard the "Unnamed: 0" column in a single chained
# expression, then preview the first rows.
df_speech = (
    pd.read_csv('/content/inaug_speeches.csv', encoding='latin1')
    .drop(columns="Unnamed: 0")
)
df_speech.head()


Unnamed: 0,Name,Inaugural Address,Date,text
0,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and o...
1,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again cal...
2,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in ..."
3,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801",Friends and Fellow-Citizens: CA...
4,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to ..."


In [None]:
# Clean Data
import nltk
nltk.download('punkt')

# Translation table that deletes every character in string.punctuation.
# It does not depend on the speech, so it is built once up front.
punct_table = str.maketrans("", "", string.punctuation)

lines = df_speech["text"].values.tolist()
speech_lines = []

for raw_text in lines:
    # tokenize, then lower-case each token
    lowered = (tok.lower() for tok in word_tokenize(raw_text))

    # strip punctuation characters from inside each token
    stripped = (tok.translate(punct_table) for tok in lowered)

    # keep only purely alphabetic tokens (drops numbers and emptied tokens)
    speech_lines.append([tok for tok in stripped if tok.isalpha()])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Turn words into Word2Vec embeddings
EMBEDDING_DIM = 20
model = gensim.models.Word2Vec(
    sentences=speech_lines,
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=1,
    workers=2,
)

# keys of the trained model's key_to_index mapping, as a list
words = list(model.wv.key_to_index.keys())

In [None]:
# Write the trained vectors to disk in word2vec's non-binary (text) format.
file_name = "inaug_speech_word2vec.txt"
word_vectors = model.wv
word_vectors.save_word2vec_format(file_name, binary=False)

In [None]:
# Load the saved word2vec text file into a {word: vector} lookup.
embeddings_index = {}

# `with` guarantees the file is closed even if a line fails to parse.
with open("inaug_speech_word2vec.txt", encoding="utf-8") as f:
    # The word2vec text format starts with a "<vocab_size> <vector_size>"
    # header line; it is not an embedding, so consume it before parsing.
    next(f, None)

    for line in f:
        values = line.split()
        if len(values) < 2:
            # blank or malformed line: nothing to store
            continue
        word = values[0]
        # Parse the coefficients as numbers rather than keeping them as strings.
        coefs = np.array(values[1:], dtype="float32")
        embeddings_index[word] = coefs

In [None]:
# Fit a Keras tokenizer on the cleaned token lists.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(speech_lines)

# word -> integer id dictionary built by the tokenizer
word_index = tokenizer_obj.word_index

# integer-encode every speech
speech_tokens = tokenizer_obj.texts_to_sequences(speech_lines)

# vocab size: number of distinct words plus one
vocab_size = len(word_index) + 1

In [None]:
# Build the embedding matrix: row i holds the word2vec vector of the word
# whose tokenizer id is i. Rows without a vector stay all-zero.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid row indices are 0 .. num_words - 1, so i == num_words must be
    # skipped too (the previous `>` comparison let it through).
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    # A vector of the wrong length would be broadcast across the row (or
    # raise); only accept vectors of exactly EMBEDDING_DIM entries.
    if len(embedding_vector) != EMBEDDING_DIM:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Build the training pairs: each sample is a window of `seq_len` consecutive
# token ids and its target is the id of the token that follows the window.
# (No separate test set is created here.)

seq_len = 10
dataX = []
dataY = []

for tokens in speech_tokens:
    # Speeches with seq_len tokens or fewer yield no windows.
    for start in range(len(tokens) - seq_len):
        dataX.append(tokens[start:start + seq_len])
        dataY.append(tokens[start + seq_len])

# Inputs as a (num_samples, seq_len) integer array.
X = np.reshape(dataX, (len(dataX), seq_len))

# One-hot targets. Fix the width to `num_words` explicitly: without
# `num_classes` the width is max(dataY) + 1, which only matches the model's
# Dense(num_words) output if the highest token id happens to occur as a target.
y = keras.utils.to_categorical(dataY, num_classes=num_words)

In [None]:
from keras.layers import Input  # Import Input function from Keras

# Uses `seq_len`, `num_words`, `EMBEDDING_DIM`, and `embedding_matrix` from the cells above.
# NOTE: this rebinds `model`, which previously referred to the Word2Vec model.

model = Sequential()
model.add(Input(shape=(seq_len,)))  # Input layer for sequences of length seq_len

# Embedding initialised from the word2vec matrix and frozen (trainable=False).
# The deprecated `input_length` argument is dropped: the sequence length is
# already fixed by the Input layer above.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)
model.add(embedding_layer)

# Two stacked LSTMs; the first returns the full sequence so the second can consume it.
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))

# One softmax unit per token id (0 .. num_words - 1).
model.add(Dense(num_words, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [None]:
# Checkpoint callback: monitors the training "loss" in "min" mode and, with
# save_best_only=True, writes the model to `filepath` only on improvement.
filepath = "weights.keras"
checkpoint = ModelCheckpoint(
    filepath,
    monitor="loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Training hyper-parameters
EPOCHS = 5
BATCH_SIZE = 128

model.fit(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=callbacks_list)

Epoch 1/5
Epoch 1: loss improved from inf to 6.49599, saving model to weights.keras
Epoch 2/5
Epoch 2: loss improved from 6.49599 to 6.13400, saving model to weights.keras
Epoch 3/5
Epoch 3: loss improved from 6.13400 to 5.95888, saving model to weights.keras
Epoch 4/5
Epoch 4: loss improved from 5.95888 to 5.83783, saving model to weights.keras
Epoch 5/5
Epoch 5: loss improved from 5.83783 to 5.73462, saving model to weights.keras


<keras.src.callbacks.History at 0x7e8db82a6110>

In [None]:
def generate_seq(model, tokenizer, enter_text, n_pred, seq_len=None):
    """Extend ``enter_text`` with ``n_pred`` words predicted by ``model``.

    At every step the last ``seq_len`` token ids (seed plus words generated
    so far) are fed to ``model.predict`` and the arg-max id is appended, so
    the model always sees a full sliding window rather than only the most
    recently predicted word.

    Parameters
    ----------
    model : object with ``predict(array, verbose=0)`` returning an array of
        shape ``(1, vocab)`` with one score per token id.
    tokenizer : object exposing ``word_index`` (word -> id dict) and
        ``texts_to_sequences``.
    enter_text : str
        Whitespace-separated seed text.
    n_pred : int
        Maximum number of words to generate.
    seq_len : int, optional
        Window length fed to the model. When omitted it is taken from
        ``model.input_shape[1]`` if available, otherwise from the number of
        seed tokens the tokenizer recognised.

    Returns
    -------
    str
        ``enter_text`` followed by the generated words, space-separated.
        Generation stops early if the model predicts an id that has no word.

    Raises
    ------
    ValueError
        If no positive window length can be determined.
    """
    # Build the id -> word lookup once instead of scanning word_index per step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # Encode the seed a single time; generation then works on integer ids.
    context = list(tokenizer.texts_to_sequences([enter_text.split()])[0])

    if seq_len is None:
        input_shape = getattr(model, "input_shape", None)
        if isinstance(input_shape, tuple) and len(input_shape) == 2 and input_shape[1]:
            seq_len = int(input_shape[1])
        else:
            seq_len = len(context)
    if seq_len <= 0:
        raise ValueError("seq_len must be positive; the seed text has no known words")

    generated = []
    for _ in range(n_pred):
        # Sliding window over the most recent ids, left-padded with 0 when the
        # context is shorter than seq_len (0 is assumed unused by word_index).
        window = context[-seq_len:]
        window = [0] * (seq_len - len(window)) + window
        encoded = np.array([window])

        # predict a word in the vocabulary
        predict_val = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(predict_val, axis=-1)[0])

        # map predicted word index to word
        out_word = index_to_word.get(yhat)
        if out_word is None:
            # No word carries this id (e.g. the padding id); stop instead of
            # appending blanks and feeding an empty context to the model.
            break

        context.append(yhat)
        generated.append(out_word)

    return " ".join([enter_text] + generated)


In [None]:
# Seed text for generation; ask the model for three more words and show them.
input_seed = "my fellow americans it is time to stand together and"
generated_text = generate_seq(model, tokenizer_obj, input_seed, 3)
print(generated_text)

(1, 9272)
(1, 9272)
(1, 9272)
my fellow americans it is time to stand together and the people the
