In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


## EDA

In [2]:
data = pd.read_csv("train.csv")
print(data.head())
print("Data shape:", data.shape)
print("Missing values:", data.isnull().sum())
print(data.describe())

                                              Quotes
0  Embrace the beauty of every sunrise; it's a fr...
1  Embrace challenges; they are the stepping ston...
2  Embrace the rhythm of life and let it dance th...
3  Embrace kindness, for it has the power to chan...
4  Embrace the journey, for it leads to the desti...
Data shape: (1000, 1)
Missing values: Quotes    0
dtype: int64
                                                   Quotes
count                                                1000
unique                                                890
top     Radiate acceptance, and find peace in embracin...
freq                                                    5


In [3]:
data = data.drop_duplicates()
data = list(data['Quotes'])

In [None]:
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(data)
tokenizer.word_index

In [5]:
print(data[0])
print(tokenizer.texts_to_sequences([data[1]]))

Embrace the beauty of every sunrise; it's a fresh chance to paint your world with joy.
[[17, 148, 33, 20, 1, 373, 487, 10, 3, 374, 687]]


In [6]:
sequences = tokenizer.texts_to_sequences(data)
padded_sequences = pad_sequences(sequences, padding="pre")

In [7]:
print(len(padded_sequences[0]))

35


In [8]:
word_freq = tokenizer.word_counts
sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
print(sorted_word_freq[:10])

num_unique_words = len(tokenizer.word_index)
print(f"There are {num_unique_words} unique words.")

[('the', 869), ('of', 663), ('your', 350), ('and', 322), ('a', 307), ('is', 253), ('in', 249), ('for', 201), ('let', 187), ('to', 180)]
There are 1198 unique words.


There's 890 Quotes and 1198 Unique Words. In those 890 quotes we have 869 'the' and 663 'of' it's very likely that the model we build will often predict those 2 phrases very often.

### How to deal with 'the' and 'of'
We can create new data, that is less biased towards the 'the' and 'of'.  
Slice the quotes into different sizes and then train the model on that

In [9]:
# Split the data into 2, 3 words
phrases = []
for quote in sequences:
    for i in range(len(quote) - 1):
        phrases.append(quote[i : i + 2])  # create 2-word phrases
    for i in range(len(quote) - 2):
        phrases.append(quote[i : i + 3])  # create 3-word phrases

total_data = phrases + sequences

# pad the sequences
max_sequence_len = max([len(x) for x in total_data])
padded_sequences = pad_sequences(total_data, maxlen=max_sequence_len, padding="pre")

In [10]:
# Split the data into X and Y
X = padded_sequences[:, :-1]
labels = padded_sequences[:, -1]

In [11]:
labels_encoded = tf.keras.utils.to_categorical(labels, num_classes=num_unique_words + 1)
print(X.shape)
print(labels_encoded.shape)

(19458, 34)
(19458, 1199)


### Train the inital model

In [None]:
vocab_size = len(tokenizer.word_index) + 1


def first_model():
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=max_sequence_len - 1))
    model.add(LSTM(100, return_sequences=False))
    model.add(Dense(vocab_size, activation="softmax"))
    model.compile(
        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return model


model = first_model()
history = model.fit(
    X, labels_encoded, batch_size=128, epochs=100, verbose=1, shuffle=True
)

In [None]:
def tune_model(units, batch_size, number_layers):
    model = Sequential()

    model.add(Embedding(vocab_size, 50, input_length=max_sequence_len - 1))
    for i in range(number_layers - 1):
        model.add(LSTM(units, return_sequences=True))
        model.add(tf.keras.layers.LayerNormalization())

    model.add(LSTM(units, return_sequences=False))
    model.add(tf.keras.layers.LayerNormalization())

    model.add(Dense(vocab_size, activation="softmax"))
    model.compile(
        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )

    history = model.fit(
        X,
        labels_encoded,
        batch_size=batch_size,
        epochs=50,
        verbose=1,
        shuffle=True,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(
                monitor="loss", patience=3, min_delta=0.0001, restore_best_weights=True
            )
        ],
    )
    return history


def grid_search(units, batch_size, number_layers):
    results = []
    for batch_size in batch_sizes:
        for number_layer in number_layers:
            for unit in units:
                history = tune_128_model(unit, batch_size, number_layer)
                # Store the results
                results.append(
                    {
                        "batch_size": batch_size,
                        "dropout": dropout,
                        "dense_size": dense_size,
                        "loss": history.history["loss"],
                        "accuracy": history.history["accuracy"],
                    }
                )
    return results


units = [32, 64, 128]
batch_sizes = [32, 64, 128]
number_layers = [1, 2, 3]

In [None]:
results = grid_search(units, batch_sizes, number_layers)

In [None]:
def predict_next_N_words_unique(seed_texts, top_p=1, N_words=10):
    generated_texts = []

    for seed_text in seed_texts:
        current_generated_text = seed_text
        for _ in range(N_words):
            # Tokenize the input sequence
            seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]
            # Pad the sequence if needed
            padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
                [seed_sequence], maxlen=max_sequence_len - 1
            )
            # Get the model's prediction for the next word
            predictions = final_model.predict(padded_sequence, verbose=0)[0]

            # Apply top-p sampling
            sorted_indices = np.argsort(predictions)[::-1]
            cumulative_probs = np.cumsum(predictions[sorted_indices])
            selected_indices = sorted_indices[cumulative_probs <= top_p]

            # Normalize probabilities
            selected_probs = predictions[selected_indices] / np.sum(
                predictions[selected_indices]
            )

            # Sample from the selected indices based on the normalized probabilities
            next_index = np.random.choice(selected_indices, p=selected_probs)

            # Convert the index back to a word
            next_word = tokenizer.index_word[next_index]
            # print(next_word)
            # Break if the generated text is too long or if an end token is predicted
            if (
                next_word is None
                or next_word == "end_token"
                or len(current_generated_text.split()) >= N_words + len(seed_text)
            ):
                break

            # Update the generated text and seed_text for the next iteration
            current_generated_text += " " + next_word
            seed_text += " " + next_word

        generated_texts.append(current_generated_text)

    return generated_texts

In [None]:
seed_texts = [
    "embrace each day",
    "radiate some",
    "believe that",
    "life's actual purpose is",
    "dance through each and every",
    "let your time and energy",
    "every person is",
    "our country Singapore is",
    "planet earth is",
    "morning and evening would make it",
]

In [None]:
predicted_texts = predict_next_N_words_unique(seed_texts)