In [5]:
import re, string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# load and clean the text for the neural network
# NOTE(review): this is an absolute, machine-specific path; point it at a
# configurable data directory before sharing the notebook.
# The encoding is given explicitly because the corpus contains non-ASCII
# punctuation (curly quotes/apostrophes); without it, open() falls back to the
# platform default encoding, which is not UTF-8 everywhere.
with open("/Users/hasancan/Desktop/Projects/carpe_diem.txt", "r", encoding="utf-8") as f:
    raw_text = f.read().lower()  # lowercase once so the vocabulary is case-insensitive

#print(raw_text) # uncomment this to see the dataset

In [8]:
def clean_text(text):
    """Return `text` with apostrophes and digits removed and whitespace collapsed.

    The substitutions run in this order:
      1. delete curly, straight and back-tick apostrophes (so "can't" -> "cant");
      2. delete every run of digits;
      3. squeeze each run of whitespace (spaces, tabs, newlines) into one space.
    Leading and trailing whitespace is trimmed from the final result.
    """
    substitutions = (
        (r"[’'`]", ""),   # apostrophes
        (r"\d+", ""),     # digits
        (r"\s+", " "),    # whitespace runs -> single space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Run the cleaning step once over the lowercased corpus; later cells work from `text`.
text = clean_text(raw_text)
print(text[:500])  # peek at the first 500 characters of the cleaned text

i always knew i was a competitive person and that i was motivated by the desire to be better than anyone else in my field. i never figured out what caused this motivation. my parents used to compare me with my peer, neighbors kid all the time, was that the reason? i always hated that neighbors kid. when we were kids, we fought often. i cant recall the reasons, though. however, i distinctly remember how smart he was and how successful he was at his classes. he never studied hard enough; he never 


In [15]:
# Initialize the tokenizer and count the vocab size:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 leaves room for index 0, which the tokenizer never assigns to a word
# and which pad_sequences uses as its fill value below.
total_words = 1 + len(tokenizer.word_index)
print("Vocabulary size is as follows: ", total_words)

# Convert to sequences: each sentence-ish chunk (split on ".") contributes
# every prefix of length >= 2, so the last id of a prefix can serve as the
# label and the ids before it as the context.
input_sequences = []
for chunk in text.split("."):
    chunk_ids = tokenizer.texts_to_sequences([chunk])[0]
    input_sequences.extend(chunk_ids[:end] for end in range(2, len(chunk_ids) + 1))

print("Total sequences:", len(input_sequences))

# Pad sequences (same length); zeros go on the left so the label stays in the last column
max_seq_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding="pre")

# Split predictors and label: all but the last column is context, the last column is the target
X = input_sequences[:, :-1]
y = to_categorical(input_sequences[:, -1], num_classes=total_words)
print("X shape:", X.shape, "y shape:", y.shape)

Vocabulary size is as follows:  1361
Total sequences: 6315
X shape: (6315, 57) y shape: (6315, 1361)


In [22]:
# Build LSTM LM model

# NOTE(review): Dropout is imported here but not used by any visible cell;
# kept in case a later cell relies on it.
from tensorflow.keras.layers import Dropout

model = Sequential()
# Declare the input shape with an explicit Input instead of the deprecated
# `Embedding(input_length=...)` argument. This also builds the model
# immediately, so model.summary() reports output shapes and parameter counts.
model.add(tf.keras.Input(shape=(max_seq_len - 1,)))   # one context window of token ids
model.add(Embedding(total_words, 64))                 # token id -> 64-dim vector
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))  # dropout on inputs and on the recurrent state
model.add(Dense(total_words, activation="softmax"))   # distribution over the vocabulary

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [23]:
# Train the model
# No validation data is passed, so the accuracy/loss printed per epoch are
# measured on the training sequences themselves.
history = model.fit(
    x=X,
    y=y,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 94ms/step - accuracy: 0.0391 - loss: 6.4598
Epoch 2/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 102ms/step - accuracy: 0.0444 - loss: 5.8137
Epoch 3/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 102ms/step - accuracy: 0.0758 - loss: 5.6092
Epoch 4/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 104ms/step - accuracy: 0.0921 - loss: 5.2820
Epoch 5/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 102ms/step - accuracy: 0.1069 - loss: 4.9963
Epoch 6/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 111ms/step - accuracy: 0.1176 - loss: 4.7197
Epoch 7/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 109ms/step - accuracy: 0.1429 - loss: 4.4161
Epoch 8/50
[1m198/198[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 107ms/step - accuracy: 0.1659 - loss: 4.1196
Epoch 9/50
[1m19

In [24]:
import random

def _sample_next_index(preds, temperature):
    """Pick the next token id from a softmax output, never the padding id 0.

    Args:
        preds: 1-D sequence of probabilities, one per token id.
        temperature: 0 selects the most likely token (greedy); larger values
            flatten the distribution before sampling.

    Returns:
        The chosen token id as a Python int, or None when there is no
        non-padding id to choose from.
    """
    logits = np.log(np.asarray(preds, dtype="float64") + 1e-8)
    if logits.size < 2:
        return None
    # Index 0 is the padding id and has no word; give it zero probability so a
    # draw is never wasted on it.
    logits[0] = -np.inf
    if temperature == 0:
        return int(np.argmax(logits))
    scaled = logits / temperature
    # Subtract the max before exponentiating so small temperatures cannot
    # underflow every weight to 0 (which would yield NaN probabilities).
    scaled -= np.max(scaled)
    weights = np.exp(scaled)
    return int(np.random.choice(len(weights), p=weights / np.sum(weights)))


def generate_text(seed_text, next_words=20, temperature=1.0, stop_words=("end", "</s>")):
    """Generate up to `next_words` words that continue `seed_text`.

    Uses the module-level `tokenizer`, `model`, `max_seq_len` and
    `pad_sequences`. At each step the running text is tokenized, left-padded to
    the model's context length, and the next word is sampled from the model's
    output distribution.

    Args:
        seed_text: Text used as the initial context.
        next_words: Maximum number of words to generate.
        temperature: Sampling temperature. 0 means greedy decoding; values
            below 1 sharpen the distribution, values above 1 flatten it.
        stop_words: Words that end generation as soon as they are produced
            (the stop word itself is kept in the output). The default keeps the
            historical behaviour; note that "end" is also an ordinary word, so
            pass `stop_words=()` to disable early stopping.

    Returns:
        The generated words joined by single spaces (the seed is not included).

    Raises:
        ValueError: If `temperature` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    result = []
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding="pre")
        preds = model.predict(token_list, verbose=0)[0]

        next_index = _sample_next_index(preds, temperature)
        if next_index is None:
            break
        next_word = tokenizer.index_word.get(next_index, "")
        if next_word == "":
            # Defensive: an id without a vocabulary entry contributes no word.
            continue
        seed_text += " " + next_word
        result.append(next_word)
        if next_word in stop_words:
            break
    return " ".join(result)

# Try generating: one 15-word continuation per seed, sampled at temperature 0.8
for seed in ("i", "my parents", "we"):
    print(generate_text(seed, next_words=15, temperature=0.8))

wasnt expecting him to be impressed by it he said “its so beautiful that you
used to compare me with my peer neighbors kid all the time was that the
were free to write on whatever we desired for whatever we felt comfortable with romance
