<a href="https://colab.research.google.com/github/iam-pattan/DL-lab/blob/main/Lab_6(Text_gen_RNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Code-1

In [None]:
# This is more for fun than a puzzle
# 
# Find some text: you could download a book from Project Gutenberg, or you could dump
# all of the code in this project into one text file with 'cat ../../**/*.py > code.txt'
# 
# Next run this character-based GRU with char-gen.py some-text-file.txt
# If you are on a GPU you should use CuDNNGRU in place of GRU
# 
# See if you can get interesting results!  Play with the number of hidden nodes
# and try other RNN structures.  Modifying the diversity number doesn't affect
# the model but can lead to different output.
#
# This model loads all of the data into memory, and that will be huge (why?).
# Another fun project would be to use fit_generator to process a larger dataset.

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM, SimpleRNN
from keras.layers import CuDNNGRU, GRU
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
import wandb
from wandb.keras import WandbCallback
import argparse

In [None]:
# Command-line interface: a single required positional argument, the path of
# the text file to train on.
parser = argparse.ArgumentParser()
parser.add_argument("text", type=str)

# NOTE(review): parse_args() reads sys.argv, which matches the script-style
# invocation described in the header comment ("char-gen.py some-text-file.txt").
# Inside a notebook kernel sys.argv presumably holds the kernel's own launch
# arguments rather than a text path — confirm how this cell is meant to be run.
args = parser.parse_args()

# Start a Weights & Biases run and store the hyperparameters on its config
# object; later cells read them back from `config`.
run = wandb.init()
config = run.config
config.hidden_nodes = 128  # number of units in the GRU layer
config.batch_size = 256  # batch size passed to model.fit
config.file = args.text  # path of the training text file
config.maxlen = 200  # length, in characters, of each input window
config.step = 3  # stride, in characters, between consecutive windows

In [None]:
# Only load the first 100k characters because we're not using memory efficiently.
# The context manager closes the file handle, and read(100000) stops after
# 100k characters instead of reading the whole file and slicing it afterwards.
with io.open(config.file, encoding='utf-8') as text_file:
    text = text_file.read(100000)

# Vocabulary: every distinct character in the text, in sorted order.
chars = sorted(set(text))

# Lookup tables between a character and its position in `chars`.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [None]:
# Slide a window of config.maxlen characters over the text, advancing
# config.step characters each time. Each window becomes one training
# "sentence" and is paired with the single character that follows it.
window_starts = range(0, len(text) - config.maxlen, config.step)
sentences = [text[start: start + config.maxlen] for start in window_starts]
next_chars = [text[start + config.maxlen] for start in window_starts]

In [None]:
# Build the one-hot encoded input x and target y:
#   x[i, t, c] is True when character t of sentence i is chars[c]
#   y[i, c]    is True when the character following sentence i is chars[c]
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
x = np.zeros((len(sentences), config.maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

# Character-level model: a single GRU layer reads the one-hot window and a
# softmax layer scores every character of the vocabulary as the next one.
vocab_size = len(chars)
model = Sequential([
    GRU(config.hidden_nodes, input_shape=(config.maxlen, vocab_size)),
    Dense(vocab_size, activation='softmax'),
])
model.compile(optimizer="rmsprop", loss='categorical_crossentropy')


In [None]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reweighted by a temperature.

    The probabilities are raised to the power ``1 / temperature`` (computed in
    log space) and renormalised before a single multinomial draw is taken.

    Args:
        preds: array-like of probabilities, e.g. one row of a softmax output.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. A value <= 0 disables sampling and returns the index
            of the largest probability (greedy choice).

    Returns:
        The sampled index into ``preds``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature <= 0:
        # Dividing by a zero temperature would produce NaNs; the limit of the
        # reweighted distribution as temperature -> 0 is the arg-max.
        return np.argmax(preds)

    # log(0) is a legitimate -inf here (a probability that stays zero), so the
    # divide-by-zero warning is silenced for this step only.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0: the normalised result is unchanged, but
    # exp() can no longer underflow every entry to 0 for small temperatures.
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [None]:
class SampleText(keras.callbacks.Callback):
    """Keras callback that prints text generated by the model after each epoch.

    A random window of ``config.maxlen`` characters is taken from ``text`` as
    the seed, and 200 further characters are generated from it once for each
    diversity (sampling temperature) in ``[0.5, 1.2]``.
    """

    def on_epoch_end(self, batch, logs=None):
        """Generate and print sample text.

        Args:
            batch: first positional argument supplied by Keras to this hook
                (the epoch index); not used.
            logs: metrics dictionary supplied by Keras; not used. Defaults to
                None rather than a shared mutable ``{}``.
        """
        start_index = random.randint(0, len(text) - config.maxlen - 1)
        seed = text[start_index: start_index + config.maxlen]

        for diversity in [0.5, 1.2]:
            print()
            print('----- diversity:', diversity)

            # Every diversity starts again from the same seed window.
            sentence = seed
            print('----- Generating with seed: "' + sentence + '"')
            sys.stdout.write(sentence)

            for _ in range(200):
                # One-hot encode the current window as a batch of size 1.
                x_pred = np.zeros((1, config.maxlen, len(chars)))
                for t, char in enumerate(sentence):
                    x_pred[0, t, char_indices[char]] = 1.

                # Use the model this callback is attached to, not whichever
                # object the global name `model` currently refers to.
                preds = self.model.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                # Slide the window: drop the oldest character, add the new one.
                sentence = sentence[1:] + next_char

                sys.stdout.write(next_char)
                sys.stdout.flush()
            print()


In [None]:
# Train for 100 epochs; after every epoch SampleText prints generated text
# and WandbCallback reports the run to Weights & Biases.
training_callbacks = [SampleText(), WandbCallback()]
model.fit(
    x,
    y,
    batch_size=config.batch_size,
    epochs=100,
    callbacks=training_callbacks,
)

## Code-2


In [None]:
# Small LSTM Network to Generate Text in the style of Shakespeare
import glob
import sys

import numpy
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [None]:
# Download the Shakespeare corpus (get_file returns the local path of the file).
shakespeare_url = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
path_to_file = tf.keras.utils.get_file('shakespeare.txt', shakespeare_url)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [None]:
# Read the corpus as bytes and decode it as UTF-8; the context manager makes
# sure the file handle is closed once the data has been read.
with open(path_to_file, 'rb') as corpus_file:
    raw_text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(raw_text)} characters')

Length of text: 1115394 characters


In [None]:
# Preview the opening 250 characters of the corpus
preview = raw_text[:250]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [None]:
# Convert the corpus to lowercase so upper- and lower-case letters share a symbol
raw_text = raw_text.lower()
# Sorted list of the distinct characters, plus lookup tables in both directions
chars = sorted(set(raw_text))
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = dict(enumerate(chars))

In [None]:
# Corpus size: total number of characters and number of distinct characters
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  1115394
Total Vocab:  39


In [None]:
# Build integer-encoded (input, target) pairs: every window of seq_length
# consecutive characters is an input, and the character right after the
# window is its target. Windows start at every position (stride 1).
seq_length = 100
window_starts = range(n_chars - seq_length)
dataX = [
    [char_to_int[ch] for ch in raw_text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [char_to_int[raw_text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  1115294


In [None]:
# Arrange the inputs as [samples, time steps, features] and scale the integer
# codes by the vocabulary size in one expression
X = numpy.asarray(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)
# One-hot encode the target characters
y = np_utils.to_categorical(dataY)

In [None]:
# LSTM model: 256 recurrent units, 20% dropout, then a softmax layer with one
# output per column of the one-hot targets
input_shape = (X.shape[1], X.shape[2])
model = Sequential([
    LSTM(256, input_shape=input_shape),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Checkpointing: write the weights to a file named after the epoch and loss,
# but only for epochs where the training loss improves
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train on the first 100,000 patterns only, for 5 epochs
train_subset = slice(0, 100000)
model.fit(
    X[train_subset],
    y[train_subset],
    epochs=5,
    batch_size=128,
    callbacks=callbacks_list,
)

Epoch 1/5

Epoch 00001: loss improved from inf to 3.02751, saving model to weights-improvement-01-3.0275.hdf5
Epoch 2/5

Epoch 00002: loss improved from 3.02751 to 2.79993, saving model to weights-improvement-02-2.7999.hdf5
Epoch 3/5

Epoch 00003: loss improved from 2.79993 to 2.69333, saving model to weights-improvement-03-2.6933.hdf5
Epoch 4/5

Epoch 00004: loss improved from 2.69333 to 2.62835, saving model to weights-improvement-04-2.6283.hdf5
Epoch 5/5

Epoch 00005: loss improved from 2.62835 to 2.58072, saving model to weights-improvement-05-2.5807.hdf5


<tensorflow.python.keras.callbacks.History at 0x7fe4cc2b9f10>

In [None]:
# load the network weights
# The checkpoint file names embed the epoch and the loss
# ("weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"), so they change from run
# to run. Instead of hard-coding one name, find the checkpoints in the working
# directory and restore the one with the lowest recorded loss.
# NOTE(review): checkpoints left over from earlier runs in the same directory
# are considered too — clear them first if that is not wanted.
checkpoint_paths = glob.glob('weights-improvement-*.hdf5')
if not checkpoint_paths:
    raise FileNotFoundError(
        'no weights-improvement-*.hdf5 checkpoint found; run the training cell first')
# The loss is the last "-"-separated field of the name, before the extension.
filename = min(
    checkpoint_paths,
    key=lambda path: float(path[:-len('.hdf5')].rsplit('-', 1)[1]),
)
print('Loading weights from', filename)
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# pick a random seed
# numpy's randint excludes the upper bound, so len(dataX) lets every pattern
# (including the last one) be chosen.
start = numpy.random.randint(0, len(dataX))
# Copy the row: `pattern` is modified during generation and must not alias
# (and thereby change) the entry stored in dataX.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed:
" eard it;
but, since it serves my purpose, i will venture
to stale 't a little more.

first citizen:
 "


In [None]:
# generate characters
# Greedy decoding: at every step the most probable next character is written
# out and fed back in as the newest element of the input window.
for i in range(1000):
    # Same preprocessing as the training data: [samples, time steps, features],
    # scaled by the vocabulary size.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Plain int so the window keeps holding ordinary Python integers.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window by building a new list; appending in place would also
    # modify the dataX row that `pattern` may still refer to.
    pattern = pattern[1:] + [index]
print("\nDone.")

to toe to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to to 

In [None]:
!cp '/content/weights-improvement-01-3.0275.hdf5' '/content/drive/MyDrive/Colab Notebooks'