# Bigram NLP (Next character) Generation


In [16]:
import numpy as np
import tensorflow as tf

In [2]:
# Get Shakespeares work from Andrej Karpathy's website

url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = tf.keras.utils.get_file('shakespeare.txt', url)

with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 21us/step


In [3]:
# Print the first few characters
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [11]:
unique_chars = sorted(set(shakespeare_text))
char_to_int = {char: idx for idx, char in enumerate(unique_chars)}

In [13]:
# char_to_init = {char: ord(char) for char in set(unique_chars)} # for ASCII code

In [23]:
# How many number of distinct characters has the vocabulary:
tokens = len(unique_chars)
print(f'Number of tokens in vocabulary: {tokens}')

# How many characters has the dataset:
text_length = len(shakespeare_text)
print(f'Total length of text dataset: {text_length}')

Number of tokens in vocabulary: 65
Total length of text dataset: 1115394


### Bigram model


In [27]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(tokens, 10))

input_array = np.random.randint(tokens, size=(1, 1))
model.compile('rmsprop', 'sparse_categorical_crossentropy')

output_array = model.predict(input_array)
print(output_array.shape)

model.summary()
# (1, 1, 10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
(1, 1, 10)


In [28]:
# We need to break the total sequence into smaller chunks for our prediction model
# These chunks are of size "length" and are shifted by one character
# between input and output.
def get_dataset(sequence, length, shuffle=False, batch_size=128):

    dataset = tf.data.Dataset.from_tensor_slices(sequence)
    dataset = dataset.window(length + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(length + 1))
    if shuffle:
        dataset = dataset.shuffle(len(sequence))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:])).prefetch(1)
    return dataset

In [None]:
# Prediction size for the model
length = 1

# Split the dataset into training, validation and testing
TRAIN_SPLIT = int(0.9 * text_length)
TEST_SPLIT = int(0.95 * text_length)

train_dataset = get_dataset(encoded_text[:TRAIN_SPLIT], length, shuffle=True)
val_dataset = get_dataset(encoded_text[TRAIN_SPLIT:TEST_SPLIT], length, shuffle=False)
test_dataset = get_dataset(encoded_text[TEST_SPLIT:], length, shuffle=False)