<a href="https://colab.research.google.com/github/linkvarun/Jupyter_Notebook/blob/master/Text_generation_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word-level text generation with Keras in <50 lines of code

## Get the IMDB movie review dataset

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2023-07-10 02:42:48--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-07-10 02:42:49 (70.8 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



## Create a Dataset (one text file = one sample)

In [None]:
from tensorflow import keras
import tensorflow as tf
def _strip_html_breaks(text_batch):
    """Replace every literal ``<br />`` tag in the batch with a single space."""
    return tf.strings.regex_replace(text_batch, "<br />", " ")


# Unlabelled batches of 256 raw text files read from the extracted archive,
# with the HTML line breaks removed.
dataset = keras.preprocessing.text_dataset_from_directory(
    'aclImdb',
    label_mode=None,
    batch_size=256,
).map(_strip_html_breaks)

Found 100006 files belonging to 1 classes.


## Prepare a text vectorization layer and compute the vocabulary

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Number of tokens the model sees per sample. The layer emits one extra token
# (sequence_length + 1) so inputs and targets can be offset by one step.
sequence_length = 100

vectorize_layer = TextVectorization(
    output_sequence_length=sequence_length + 1,
    output_mode="int",
    max_tokens=15000,
)

# Learn the vocabulary from the raw text dataset.
vectorize_layer.adapt(dataset)

# vocab is the list of tokens; tokens_index maps token id -> token string.
vocab = vectorize_layer.get_vocabulary()
tokens_index = {token_id: token for token_id, token in enumerate(vocab)}

In [None]:
# Display the vocabulary returned by vectorize_layer.get_vocabulary();
# a token's position in this list is its integer id.
vocab

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'you',
 'not',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'from',
 'who',
 'so',
 'like',
 'or',
 'just',
 'her',
 'about',
 'has',
 'if',
 'out',
 'some',
 'there',
 'what',
 'good',
 'when',
 'more',
 'very',
 'up',
 'no',
 'even',
 'my',
 'would',
 'she',
 'time',
 'their',
 'which',
 'only',
 'really',
 'story',
 'see',
 'were',
 'can',
 'had',
 'me',
 'than',
 'well',
 'much',
 'we',
 'been',
 'get',
 'bad',
 'into',
 'also',
 'great',
 'do',
 'other',
 'will',
 'people',
 'because',
 'him',
 'first',
 'how',
 'most',
 'dont',
 '0',
 'them',
 'films',
 'movies',
 'then',
 'make',
 'made',
 'way',
 'could',
 'too',
 'characters',
 'after',
 'any',
 'think',
 'watch',
 'many',
 'being',
 'two',
 'character',
 'seen',
 'never',
 'plot',
 'love',
 'where',
 'little',
 'actin

## Create a dataset of sequences of vectorized words, with targets offset by one step

In [None]:
def make_lm_dataset(text_batch):
    """Vectorize a batch of raw text and split it into (inputs, targets).

    Both outputs are slices of the same token-id matrix: the inputs drop the
    last column and the targets drop the first, so each input position is
    paired with the token that follows it.
    """
    token_ids = vectorize_layer(text_batch)
    context_tokens = token_ids[:, :-1]
    next_tokens = token_ids[:, 1:]
    return context_tokens, next_tokens

# Turn each text batch into an (inputs, targets) pair, then prefetch batches
# using the AUTOTUNE setting.
lm_dataset = dataset.map(make_lm_dataset)
lm_dataset = lm_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Display the dataset's element spec (an inputs/targets pair of int64 tensors).
lm_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(None, 100), dtype=tf.int64, name=None))>

## Our model: a stacked LSTM that returns output sequences the same length as its input

In [None]:
from tensorflow import keras
from keras import layers

# Sizes shared by the layers below.
vocab_size = len(vocab)
hidden_units = 256
num_lstm_layers = 2

# Variable-length sequences of int64 token ids.
inputs = keras.Input(shape=(None,), dtype='int64')
x = layers.Embedding(vocab_size, hidden_units)(inputs)
# Every LSTM returns its full output sequence, so the model emits one
# prediction per input position.
for _lstm_index in range(num_lstm_layers):
    x = layers.LSTM(hidden_units, return_sequences=True)(x)
# Softmax over the vocabulary at every time step.
outputs = layers.Dense(vocab_size, activation='softmax')(x)
model = keras.Model(inputs, outputs)

## Prepare a text generation callback with variable-temperature sampling

In [None]:
import numpy as np

def decode_token_indices(indices):
    """Map token ids to their vocabulary strings and join them with spaces."""
    words = (tokens_index[token_id] for token_id in indices)
    return ' '.join(words)

def sample_next(preds, temperature=1.0):
    """Sample one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Non-negative scaling factor applied to the log
            probabilities. Values below 1 sharpen the distribution, values
            above 1 flatten it, and 0 selects the most likely index
            deterministically.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError('temperature must be non-negative')

    # Work in float64 regardless of the input dtype: float32 rounding can make
    # the renormalized probabilities sum to slightly more than 1, which
    # np.random.multinomial rejects.
    probs = np.asarray(preds, dtype='float64')

    if temperature == 0:
        # Limit of temperature -> 0: greedy decoding, no division by zero.
        return np.argmax(probs)

    # log(0) = -inf is expected for zero-probability entries; silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift so the largest logit is 0. Otherwise, at very low temperatures,
    # every exp() underflows to 0 and the normalization becomes 0 / 0 = NaN.
    logits -= np.max(logits)
    exp_logits = np.exp(logits)
    probs = exp_logits / np.sum(exp_logits)
    return np.argmax(np.random.multinomial(1, probs, 1))


class TextGenerator(keras.callbacks.Callback):
    """Callback that prints text sampled from the model after every epoch.

    For each configured temperature the callback starts from the prompt and
    repeatedly asks the model for the next-token distribution, samples one
    token with ``sample_next`` and appends it, until ``generate_length`` new
    tokens have been produced. The decoded text is printed.

    Args:
        prompt: Sequence of token ids used to seed generation.
        generate_length: Number of tokens to generate after the prompt.
        model_input_length: Maximum number of most recent tokens fed to the
            model at each step. A falsy value disables truncation.
        temperatures: Iterable of sampling temperatures to generate with.
    """

    def __init__(self,
                 prompt,
                 generate_length,
                 model_input_length,
                 temperatures=(1.,)):
        # Let the Keras base class initialise its own bookkeeping attributes.
        super().__init__()
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print one sample per temperature."""
        for temperature in self.temperatures:
            print('== Generating with temperature', temperature)
            # Copy so the stored prompt is never mutated; list() also accepts
            # tuples and arrays.
            token_sequence = list(self.prompt)
            for _step in range(self.generate_length):
                # Cap the context at the configured input length so the model
                # is not fed an ever-growing sequence.
                if self.model_input_length:
                    context = token_sequence[-self.model_input_length:]
                else:
                    context = token_sequence
                model_input = tf.convert_to_tensor([context])
                # verbose=0: avoid one progress bar per generated token.
                preds = self.model.predict(model_input, verbose=0).astype('float64')
                # Only the distribution at the last position predicts the next token.
                next_token = sample_next(preds[0, -1], temperature=temperature)
                token_sequence.append(next_token)
            print(decode_token_indices(token_sequence))


text_prompt = "This movie"
# The vectorizer pads its output to a fixed length with token id 0 (the ''
# entry of the vocabulary). Dropping the padding keeps exactly the prompt's
# own tokens, whatever its word count, instead of a hard-coded first two.
prompt_ids = vectorize_layer([text_prompt]).numpy()[0]
prompt = list(prompt_ids[prompt_ids != 0])
text_gen_callback = TextGenerator(
    prompt,
    generate_length=50,
    model_input_length=sequence_length,
    temperatures=(0.1, 0.2, 0.5, 0.7, 1., 1.5))

## Train the model, generating text at different temperatures after each epoch

In [None]:
# Targets are integer token ids, hence the sparse categorical cross-entropy.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
)

# Train on the (inputs, targets) dataset; the callback prints generated text
# at the end of each epoch.
model.fit(
    lm_dataset,
    epochs=200,
    callbacks=[text_gen_callback],
)

Epoch 1/200
this movie the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
== Generating with temperature 0.2
this movie [UNK] the the the the the the the the the the the the the the the the the the the the the a the the the the the the the the the the the the the the the  the the the the the the the the a the the
== Generating with temperature 0.5
this movie [UNK] was [UNK] this the the  you [UNK] the the [UNK] the the  the the a i a  in the [UNK] that the with of   the the a the [UNK] the and the [UNK] [UNK] and the the and the this [UNK] a the the
== Generating with temperature 0.7
this movie only [UNK] me the the  only work from not the and the the title the like a them like the a like the the of the nightmare the a you a a the the i version a the my  plot in the the can was i [UNK] of
== Generating with temperature 1.0
this movie work 

## Some example results after 200 epochs


With `temperature=0.2`:
* "this movie is a [UNK] of the original movie and the first half hour of the
movie is pretty good but it is a very good movie it is a good movie for the time period"
* "this movie is a [UNK] of the movie it is a movie that is so bad that it is a
[UNK] movie it is a movie that is so bad that it makes you laugh and cry at
the same time it is not a movie i dont think ive ever seen"

With `temperature=0.5`:
* "this movie is a [UNK] of the best genre movies of all time and it is not a
good movie it is the only good thing about this movie i have seen it for the first
time and i still remember it being a [UNK] movie i saw a lot of years"
* "this movie is a waste of time and money i have to say that this movie was a
complete waste of time i was surprised to see that the movie was made up of
a good movie and the movie was not very good but it was a waste of time and"

With `temperature=0.7`:
* "this movie is fun to watch and it is really funny to watch all the characters
are extremely hilarious also the cat is a bit like a [UNK] [UNK] and a hat [UNK]
the rules of the movie can be told in another scene saves it from being in the back of"
* "this movie is about [UNK] and a couple of young people up on a small boat in the middle
of nowhere one might find themselves being exposed to a [UNK] dentist they are killed by
[UNK] i was a huge fan of the book and i havent seen the original so it"

With `temperature=1.0`:
* "this movie was entertaining i felt the plot line was loud and touching but
on a whole watch a stark contrast to the artistic of the original we watched
the original version of england however whereas arc was a bit of a little too
ordinary the [UNK] were the present parent [UNK]"
* "this movie was a masterpiece away from the storyline but this movie was
simply exciting and frustrating it really entertains friends like this the actors
in this movie try to go straight from the sub thats image and they make it a
really good tv show"

With `temperature=1.5`:
* "this movie was possibly the worst film about that 80 women its as weird
insightful actors like barker movies but in great buddies yes no decorated
shield even [UNK] land dinosaur ralph ian was must make a play happened falls
after miscast [UNK] bach not really not wrestlemania seriously sam didnt exist"
* "this movie could be so unbelievably lucas himself bringing our country
wildly funny things has is for the garish serious and strong performances
colin writing more detailed dominated but before and that images gears burning
the plate patriotism we you expected dyan bosses devotion to must do your own duty and another"