In [None]:
#load text file
#filepath = '/kaggle/input/shakespeare/shakespeare.txt'
filepath = '/kaggle/input/orwell1984/Orwell1984.txt'
# Explicit encoding so the decoded text does not depend on the platform's default locale
with open(filepath, encoding="utf-8") as f:
    document_text = f.read()


In [None]:
#check data: show the first 80 characters to confirm the file was read
print(document_text[:80])

In [None]:
#Encode characters
from tensorflow import keras

# split="character" tokenizes per character; standardize="lower" lowercases the text first
text_vec_layer = keras.layers.TextVectorization(split="character", 
                                                standardize="lower")
# Build the vocabulary from the document
text_vec_layer.adapt([document_text])
# The document is passed as a one-item batch; [0] selects its sequence of token ids
encoded = text_vec_layer([document_text])[0]

In [None]:
# TextVectorization reserves id 0 (padding) and id 1 (unknown characters); they are
# not needed here, so all ids are shifted down by 2.
# The ids are re-derived from the vectorizer output (instead of `encoded = encoded - 2`)
# so that re-running this cell cannot shift them a second time.
encoded = text_vec_layer([document_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)
dataset_size

In [None]:
# Display the vocabulary size after removing the two reserved ids
n_tokens

In [None]:
#function to prepare dataset
import tensorflow as tf

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a token-id sequence into batched (inputs, targets) training pairs.

    Every window holds ``length + 1`` consecutive ids and consecutive windows
    are shifted by one position; trailing windows that would be shorter are
    dropped. Within each window the inputs are all ids except the last and
    the targets are all ids except the first.

    Args:
        sequence: Sequence of token ids to slice into windows.
        length: Number of ids in each input (and target) sequence.
        shuffle: When true, shuffle the windows with a 100_000-element buffer.
        seed: Seed forwarded to the shuffle step.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` of ``(inputs, targets)`` batches, prefetching one batch.
    """
    window_size = length + 1
    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        # window() yields nested datasets; batching each one flattens it to a tensor
        .flat_map(lambda nested: nested.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(buffer_size=100_000, seed=seed)

    def split_inputs_targets(batch):
        # Targets are the inputs shifted one step to the left
        return batch[:, :-1], batch[:, 1:]

    return windows.batch(batch_size).map(split_inputs_targets).prefetch(1)

In [None]:
#create training, validation and test datasets
tf.random.set_seed(42)

length = 100
# Split boundaries are derived from the corpus size (90% / 5% / 5%) instead of
# fixed offsets, which leave the validation and test sets empty whenever the
# loaded text is shorter than the hard-coded positions. Each slice must still
# hold at least length + 1 ids to produce a window.
n_encoded = len(encoded)
train_end = int(n_encoded * 0.90)
valid_end = int(n_encoded * 0.95)
train_set = to_dataset(encoded[:train_end], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[train_end:valid_end], length=length)
test_set = to_dataset(encoded[valid_end:], length=length)

In [None]:
#create model: character embedding -> GRU -> per-step softmax over the vocabulary
model = tf.keras.Sequential()
# One 16-dimensional vector per character id
model.add(keras.layers.Embedding(input_dim=n_tokens, output_dim=16))
# return_sequences=True keeps an output for every time step, not only the last
model.add(keras.layers.GRU(128, return_sequences=True))
# One probability per vocabulary entry at every time step
model.add(keras.layers.Dense(n_tokens, activation="softmax"))

In [None]:
#compile model
# Sparse categorical cross-entropy: the targets are integer character ids, not one-hot vectors
model.compile(loss="sparse_categorical_crossentropy", 
              optimizer="nadam", 
              metrics=["accuracy"])

In [None]:
#set callbacks
# Keep only the checkpoint with the best validation accuracy seen so far.
# Note: this writes "my_document_model.keras", which is a different file from
# the "my_model.keras" used by the save/load cells further down.
model_ckpt = keras.callbacks.ModelCheckpoint("my_document_model.keras", 
                                                monitor="val_accuracy", 
                                                save_best_only=True)

In [None]:
#train model
# `history` keeps the object returned by fit() for later inspection
history = model.fit(train_set, 
                    validation_data=valid_set, 
                    epochs=5, 
                    callbacks=[model_ckpt])

In [None]:
#save model
# save() returns None, so its result must not be assigned back to `model`
model.save('my_model.keras')

In [None]:
#load model
# Relative path, matching the file name written by the save cell, so loading
# does not depend on the working directory being /kaggle/working
model = keras.models.load_model("my_model.keras")

In [None]:
#Wrap the trained model so it accepts raw text: vectorize the characters,
#shift the ids down by 2 (the same offset applied to the training data), then predict
# NOTE(review): a character outside the adapted vocabulary presumably maps to the
# reserved unknown id and becomes negative after the shift - confirm how the
# Embedding layer handles that.
document_model = keras.Sequential([text_vec_layer,
                                         keras.layers.Lambda(lambda X: X - 2),
                                         model
                                        ])

In [None]:
#new input text

#input_text = "To be or not to b"
input_text = "To obey the law is "
# [0, -1]: first (only) item of the batch, last time step
y_proba = document_model.predict(tf.constant([input_text]) )[0, -1]

In [None]:
#predict new character
# Index of the highest probability, shifted back by 2 to index the vectorizer's vocabulary
y_pred = tf.argmax(y_proba)
text_vec_layer.get_vocabulary()[y_pred + 2]

In [None]:
# Demo: draw 8 samples from a 3-class distribution given as log-probabilities
log_probas = tf.math.log([[0.5, 0.4, 0.1]])
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)

In [None]:
#function to generate next character
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's predicted distribution.

    Args:
        text: Seed string; it is sent through `document_model` as a one-item batch.
        temperature: Strictly positive divisor applied to the log-probabilities
            before sampling.

    Returns:
        The entry of the vectorizer's vocabulary selected by the sampled id.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    # verbose=0: this runs once per generated character, so the default progress
    # output would otherwise be printed on every call.
    # [0, -1:] keeps a leading axis of size 1 for tf.random.categorical.
    y_proba = document_model.predict(tf.constant([text]), verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    # Undo the shift of 2 applied to the ids before indexing the vocabulary
    return text_vec_layer.get_vocabulary()[char_id + 2]

In [None]:
#function to generate several characters
def extend_text(text, n_chars=50, temperature=1):
    """Append `n_chars` sampled characters to `text`, one at a time.

    Each new character is sampled by `next_char` from the text generated so
    far, using the given `temperature`.

    Returns:
        The original text followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        following = next_char(generated, temperature)
        generated = generated + following
    return generated

In [None]:
# Fix TensorFlow's global random seed before the sampling cells below
tf.random.set_seed(42)

In [None]:
#testing
# Very low temperature: the log-probabilities are divided by 0.01 before sampling
print(extend_text(input_text, temperature=0.01))

In [None]:
# temperature=1 leaves the log-probabilities unscaled
print(extend_text(input_text, temperature=1))

In [None]:
# temperature=1.5: the log-probabilities are divided by 1.5 before sampling
print(extend_text(input_text, temperature=1.5))

In [None]:
# temperature=10: the log-probabilities are divided by 10 before sampling
print(extend_text(input_text, temperature=10))

In [None]:
#================= Chatbot ================
#The following has been tested in Google Colab

In [None]:
%%writefile app.py

import streamlit as st
import pickle
import tensorflow as tf
from tensorflow import keras

filepath = '/content/Orwell1984.txt'


@st.cache_resource
def load_resources(text_path):
    """Build the vectorizer and the text-in / probabilities-out model once.

    Streamlit re-executes this script on every interaction; caching keeps the
    corpus read, the vocabulary adaptation and the model load from being
    repeated each time.

    Args:
        text_path: Path of the text file the vectorizer is adapted on.

    Returns:
        Tuple ``(document_text, text_vec_layer, model, document_model)``.
    """
    # Explicit encoding so the decoded text does not depend on the platform's default locale
    with open(text_path, encoding="utf-8") as f:
        corpus = f.read()

    vectorizer = keras.layers.TextVectorization(split="character", standardize="lower")
    vectorizer.adapt([corpus])
    # TODO: how to save vocabulary after adapt, so the corpus need not be shipped with the app

    trained = keras.models.load_model('/content/my_model.keras')

    #take input and lead to model: vectorize, shift ids down by 2, then predict
    pipeline = keras.Sequential([vectorizer,
                                 keras.layers.Lambda(lambda X: X - 2),
                                 trained])
    return corpus, vectorizer, trained, pipeline


document_text, text_vec_layer, model, document_model = load_resources(filepath)

#function to generate next character
def next_char(text, temperature=1):
    """Sample the character that follows `text` from the model's predicted distribution.

    Args:
        text: Seed string; it is sent through `document_model` as a one-item batch.
        temperature: Strictly positive divisor applied to the log-probabilities
            before sampling.

    Returns:
        The entry of the vectorizer's vocabulary selected by the sampled id.

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    # verbose=0: this runs once per generated character, so the default progress
    # output would otherwise be emitted on every call.
    # [0, -1:] keeps a leading axis of size 1 for tf.random.categorical.
    y_proba = document_model.predict(tf.constant([text]), verbose=0)[0, -1:]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]
    # Undo the shift of 2 applied to the ids before indexing the vocabulary
    return text_vec_layer.get_vocabulary()[char_id + 2]

#function to generate several characters
def extend_text(text, n_chars=50, temperature=1):
    """Append `n_chars` sampled characters to `text`, one at a time.

    Each new character is sampled by `next_char` from the text generated so
    far, using the given `temperature`.

    Returns:
        The original text followed by the generated characters.
    """
    generated = text
    for _step in range(n_chars):
        following = next_char(generated, temperature)
        generated = generated + following
    return generated

# Page header
st.title('GRU Text generator')
st.write('Using Orwell 1984')

# Only generate when a non-empty prompt is available
user_input = st.chat_input(placeholder="User")
if user_input:
    # Extend the prompt by extend_text's default number of characters
    pred = extend_text(user_input, temperature=1)
    st.write(pred)

In [None]:
!pip install streamlit
!npm install localtunnel

In [None]:
# NOTE(review): --force lets npm apply fixes even when they need breaking (semver-major) upgrades - confirm this is intended
!npm audit fix --force