In [None]:
import tensorflow as tf
import numpy as np
import time
import matplotlib.pyplot as plt
import os
import re

In [None]:
# Download and load the Shakespeare text
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = tf.keras.utils.get_file('shakespeare.txt', url)
with open(filepath) as f:
    shakespeare_text = f.read()

In [None]:
# Character-level tokenization
vocab = sorted(set(shakespeare_text))
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for idx, char in enumerate(vocab)}
vocab_size = len(vocab)
print(f'Vocabulary size: {vocab_size}')
print(f'First 100 characters of text: {shakespeare_text[:100]}')

### Hyperparameters

In [None]:
# Hyperparameters
d_model = 256  # Embedding dimension
num_heads = 8  # Number of attention heads
dff = 512  # Feed-forward network dimension
max_seq_length = 100  # Maximum sequence length
batch_size = 64  # Batch size
dropout_rate = 0.1  # Dropout rate
num_layers = 4  # Number of encoder and decoder layers
learning_rate = 0.0001  # Learning rate
epochs = 30  # Number of training epochs

### Create dataset

In [None]:
# Prepare the dataset
def text_to_indices(text):
    return [char_to_idx[char] for char in text]

def indices_to_text(indices):
    return ''.join([idx_to_char[idx] for idx in indices])

def create_training_data(text, seq_length=100, batch_size=64):
    text_as_indices = text_to_indices(text)
    total_seq = len(text_as_indices) - seq_length
    
    # Convert to numpy array for efficient indexing
    text_as_indices_np = np.array(text_as_indices)
    
    # Create input and target sequences
    input_seqs = []
    target_seqs = []
    for i in range(0, total_seq, seq_length // 4):  # Use stride for more efficient data usage
        if i + seq_length >= total_seq:
            break
        input_seqs.append(text_as_indices_np[i:i+seq_length])
        target_seqs.append(text_as_indices_np[i+1:i+seq_length+1])
    
    # Convert to TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((input_seqs, target_seqs))
    
    # Batch and shuffle
    dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Create dataset
dataset = create_training_data(shakespeare_text, seq_length=max_seq_length, batch_size=batch_size)

### Positional Encoding

In [None]:
# Positional Encoding
def get_positional_encoding(max_seq_len, d_model):
    position = np.arange(max_seq_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    
    pos_encoding = np.zeros((max_seq_len, d_model))
    pos_encoding[:, 0::2] = np.sin(position * div_term)
    pos_encoding[:, 1::2] = np.cos(position * div_term)
    
    return tf.cast(pos_encoding[np.newaxis, ...], dtype=tf.float32)

### Scaled Dot-Product Attention

In [None]:
# Scaled Dot-Product Attention
def scaled_dot_product_attention(q, k, v, mask=None):
    # Calculate attention scores
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    
    # Scale attention scores
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    
    # Apply mask (if provided)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    
    # Calculate attention weights
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    
    # Apply attention weights to value vectors
    output = tf.matmul(attention_weights, v)
    
    return output, attention_weights

### Multi-Head Attention

In [None]:
# Multi-Head Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name="multi_head_attention"):
        super(MultiHeadAttention, self).__init__(name=name)
        self.num_heads = num_heads
        self.d_model = d_model
        
        assert d_model % self.num_heads == 0
        
        self.depth = d_model // self.num_heads
        
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        
        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]
        
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        
        # Scaled dot-product attention
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
        
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)
        
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
        return output, attention_weights

### Point-wise Feed-Forward Network

In [None]:
# Point-wise Feed-Forward Network
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

### Encoder Layer

In [None]:
# Encoder Layer
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()
        
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask=None):
        attn_output, _ = self.mha(v=x, k=x, q=x, mask=mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
        
        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
        
        return out2

### Decoder Layer

In [None]:
# Decoder Layer
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
    
    def call(self, x, enc_output, training, 
            look_ahead_mask=None, padding_mask=None):
        # Masked multi-head attention (self-attention)
        attn1, attn_weights_block1 = self.mha1(v=x, k=x, q=x, mask=look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)
        
        # Multi-head attention (encoder-decoder attention)
        attn2, attn_weights_block2 = self.mha2(
            v=enc_output, k=enc_output, q=out1, mask=padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
        
        # Feed-forward network
        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
        
        return out3, attn_weights_block1, attn_weights_block2

### Encoder

In [None]:
# Encoder
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                maximum_position_encoding, rate=0.1):
        super().__init__()
        
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = get_positional_encoding(maximum_position_encoding, d_model)
        
        self.enc_layers = [
            EncoderLayer(d_model, num_heads, dff, rate) 
            for _ in range(num_layers)
        ]
        
        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask=None):
        seq_len = tf.shape(x)[1]
        
        # Embedding
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        
        # Positional encoding
        x += self.pos_encoding[:, :seq_len, :]
        
        x = self.dropout(x, training=training)
        
        # Encoder layers
        for i in range(self.num_layers):
            x = self.enc_layers[i](x=x, training=training, mask=mask)
        
        return x  # (batch_size, input_seq_len, d_model)

### Decoder

In [None]:
# Decoder
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                maximum_position_encoding, rate=0.1):
        super().__init__()
        
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = get_positional_encoding(maximum_position_encoding, d_model)
        
        self.dec_layers = [
            DecoderLayer(d_model, num_heads, dff, rate) 
            for _ in range(num_layers)
        ]
        
        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, enc_output, training, 
            look_ahead_mask=None, padding_mask=None):
        seq_len = tf.shape(x)[1]
        attention_weights = {}
        
        # Embedding
        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        
        # Positional encoding
        x += self.pos_encoding[:, :seq_len, :]
        
        x = self.dropout(x, training=training)
        
        # Decoder layers
        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](
                x=x, 
                enc_output=enc_output, 
                training=training,
                look_ahead_mask=look_ahead_mask, 
                padding_mask=padding_mask)
            
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2
        
        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [None]:
# Create look-ahead mask for decoder
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

### Transformer

In [None]:
# Transformer
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
                target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()
        
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                                input_vocab_size, pe_input, rate)
        
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                                target_vocab_size, pe_target, rate)
        
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
        
    def call(self, inp, tar, training, enc_padding_mask=None, 
            look_ahead_mask=None, dec_padding_mask=None):
        # Encoder
        enc_output = self.encoder(x=inp, training=training, mask=enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        
        # Decoder
        dec_output, attention_weights = self.decoder(
            x=tar, 
            enc_output=enc_output, 
            training=training, 
            look_ahead_mask=look_ahead_mask, 
            padding_mask=dec_padding_mask)
        
        # Final linear layer
        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
        
        return final_output, attention_weights

### Define Transformer

In [None]:
# Create the transformer model
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    pe_input=max_seq_length,
    pe_target=max_seq_length,
    rate=dropout_rate)

In [None]:
# Optimizer
optimizer = tf.keras.optimizers.Adam(0.001)

# Loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

# Initialize metrics for tracking loss and accuracy
class TrainingMetrics:
    def __init__(self):
        self._loss_values = []
        self._accuracy_values = []
        
    def update_loss(self, loss):
        self._loss_values.append(float(loss))
        
    def update_accuracy(self, real, pred):
        # Calculate accuracy from sparse categorical data
        predictions = tf.argmax(pred, axis=-1)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(real, predictions), tf.float32))
        self._accuracy_values.append(float(accuracy))
        
    def result_loss(self):
        return sum(self._loss_values) / max(len(self._loss_values), 1)
        
    def result_accuracy(self):
        return sum(self._accuracy_values) / max(len(self._accuracy_values), 1)
        
    def reset(self):
        self._loss_values = []
        self._accuracy_values = []

# Create training metrics
train_metrics = TrainingMetrics()

# Training step
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar_inp)[1])
    
    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp=inp, 
            tar=tar_inp, 
            training=True, 
            enc_padding_mask=None, 
            look_ahead_mask=look_ahead_mask, 
            dec_padding_mask=None
        )
        loss = loss_function(tar_real, predictions)
    
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    
    return loss, tar_real, predictions

### Function to generate text

In [None]:
# Generate text
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    # Convert start string to indices
    input_indices = text_to_indices(start_string)
    input_tensor = tf.expand_dims(input_indices, 0)
    
    # Empty result string
    result = start_string
    
    for i in range(num_generate):
        # Create look-ahead mask
        look_ahead_mask = create_look_ahead_mask(tf.shape(input_tensor)[1])
        
        # Call model with keyword arguments
        output, _ = model(
            inp=input_tensor, 
            tar=input_tensor, 
            training=False, 
            enc_padding_mask=None,
            look_ahead_mask=look_ahead_mask,
            dec_padding_mask=None
        )
        
        # Select the last token from the output
        output = output[:, -1, :]  # (batch_size, vocab_size)
        
        # Apply temperature
        if temperature != 1.0:
            output = output / temperature
        
        # Sample from the output distribution
        predicted_id = tf.random.categorical(output, num_samples=1)[-1, 0].numpy()
        
        # Concatenate the predicted character to the output text
        result += idx_to_char[predicted_id]
        
        # Update the input tensor to the decoder
        input_indices.append(predicted_id)
        input_tensor = tf.expand_dims(input_indices[-max_seq_length:], 0)
    
    return result

In [None]:
# Generate text before training
print("\n----- Text generated before training -----\n")
start_string = "ROMEO: "
generated_text = generate_text(transformer, start_string, num_generate=250, temperature=1.0)
print(generated_text)

### Training

In [3]:
# Training
epochs = 30
for epoch in range(epochs):
    start = time.time()
    
    # Reset metrics for each epoch
    train_metrics.reset()
    
    for (batch, (inp, tar)) in enumerate(dataset):
        loss, real, pred = train_step(inp, tar)
        
        # Update metrics (outside the graph for better compatibility)
        train_metrics.update_loss(loss)
        train_metrics.update_accuracy(real, pred)
        
        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_metrics.result_loss():.4f} Accuracy {train_metrics.result_accuracy():.4f}')
    
    print(f'Epoch {epoch + 1} Loss {train_metrics.result_loss():.4f} Accuracy {train_metrics.result_accuracy():.4f}')
    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs')
    
    # Generate text every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f"\n----- Text generated after epoch {epoch + 1} -----\n")
        start_string = "ROMEO: "
        generated_text = generate_text(transformer, start_string, num_generate=250, temperature=0.8)
        print(generated_text)
        print("\n")

# Save the model
transformer.save_weights('shakespeare_transformer/model')

# Generate final text
print("\n----- Text generated after training -----\n")
start_string = "ROMEO: "
generated_text = generate_text(transformer, start_string, num_generate=500, temperature=0.7)
print(generated_text)

Vocabulary size: 65
First 100 characters of text: First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You

----- Text generated before training -----

ROMEO: ttuPgaoPQ?.SD'bZIuhnh3OQZIZhh?LLg.FlchhPZghqYd;YbA;uQTM!.YgYtXgwgbFgQL DwgqgSgRh lCI.Clhhc!tPg?a;Hc::PijVgqBBuSQgcZZX!,gcc$V gMIehBSIZ'SZZZgZ-ggahgPZRacuOlxDQBZFqZZKHBZFt:MkxKcPStZ;tBq;ZBhVchgsSQ;SgIgIZ:iB:Lg.;uh:CLBZzQZhbVIghZZ B:;yXgclZhzhq
;!lxhq$
Epoch 1 Batch 0 Loss 4.6906 Accuracy 0.0074
Epoch 1 Batch 50 Loss 3.4642 Accuracy 0.1284
Epoch 1 Batch 100 Loss 3.3923 Accuracy 0.1373
Epoch 1 Batch 150 Loss 3.3656 Accuracy 0.1415
Epoch 1 Batch 200 Loss 3.3491 Accuracy 0.1443
Epoch 1 Batch 250 Loss 3.3378 Accuracy 0.1461
Epoch 1 Batch 300 Loss 3.3305 Accuracy 0.1474
Epoch 1 Batch 350 Loss 3.3242 Accuracy 0.1482
Epoch 1 Batch 400 Loss 3.3184 Accuracy 0.1487
Epoch 1 Batch 450 Loss 3.3142 Accuracy 0.1492
Epoch 1 Batch 500 Loss 3.3102 Accuracy 0.1496
Epoch 1 Batch 550 Loss 3.3074 Accuracy 0.14

2025-03-28 18:11:09.298152: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1 Loss 3.3032 Accuracy 0.1502
Time taken for 1 epoch: 1181.80 secs
Epoch 2 Batch 0 Loss 3.3221 Accuracy 0.1461
Epoch 2 Batch 50 Loss 3.2900 Accuracy 0.1492
Epoch 2 Batch 100 Loss 3.2872 Accuracy 0.1499
Epoch 2 Batch 150 Loss 3.2871 Accuracy 0.1506
Epoch 2 Batch 200 Loss 3.2833 Accuracy 0.1515
Epoch 2 Batch 250 Loss 3.2747 Accuracy 0.1532
Epoch 2 Batch 300 Loss 3.2720 Accuracy 0.1541
Epoch 2 Batch 350 Loss 3.2729 Accuracy 0.1538
Epoch 2 Batch 400 Loss 3.2717 Accuracy 0.1538
Epoch 2 Batch 450 Loss 3.2714 Accuracy 0.1537
Epoch 2 Batch 500 Loss 3.2716 Accuracy 0.1536
Epoch 2 Batch 550 Loss 3.2725 Accuracy 0.1534
Epoch 2 Batch 600 Loss 3.2733 Accuracy 0.1533
Epoch 2 Batch 650 Loss 3.2739 Accuracy 0.1532


2025-03-28 18:30:30.865804: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 2 Loss 3.2742 Accuracy 0.1531
Time taken for 1 epoch: 1161.57 secs
Epoch 3 Batch 0 Loss 3.2253 Accuracy 0.1547
Epoch 3 Batch 50 Loss 3.2816 Accuracy 0.1496
Epoch 3 Batch 100 Loss 3.2817 Accuracy 0.1502
Epoch 3 Batch 150 Loss 3.2814 Accuracy 0.1507
Epoch 3 Batch 200 Loss 3.2792 Accuracy 0.1514
Epoch 3 Batch 250 Loss 3.2777 Accuracy 0.1519
Epoch 3 Batch 300 Loss 3.2784 Accuracy 0.1522
Epoch 3 Batch 350 Loss 3.2789 Accuracy 0.1523
Epoch 3 Batch 400 Loss 3.2773 Accuracy 0.1524
Epoch 3 Batch 450 Loss 3.2755 Accuracy 0.1525
Epoch 3 Batch 500 Loss 3.2743 Accuracy 0.1526
Epoch 3 Batch 550 Loss 3.2751 Accuracy 0.1525
Epoch 3 Batch 600 Loss 3.2758 Accuracy 0.1524
Epoch 3 Batch 650 Loss 3.2763 Accuracy 0.1524
Epoch 3 Loss 3.2767 Accuracy 0.1523
Time taken for 1 epoch: 1147.59 secs
Epoch 4 Batch 0 Loss 3.2692 Accuracy 0.1504
Epoch 4 Batch 50 Loss 3.2850 Accuracy 0.1494
Epoch 4 Batch 100 Loss 3.2830 Accuracy 0.1502
Epoch 4 Batch 150 Loss 3.2815 Accuracy 0.1508
Epoch 4 Batch 200 Loss 3.2802 Ac

2025-03-28 19:08:55.180180: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 4 Loss 3.2763 Accuracy 0.1523
Time taken for 1 epoch: 1156.72 secs
Epoch 5 Batch 0 Loss 3.2701 Accuracy 0.1449
Epoch 5 Batch 50 Loss 3.2822 Accuracy 0.1498
Epoch 5 Batch 100 Loss 3.2818 Accuracy 0.1504
Epoch 5 Batch 150 Loss 3.2840 Accuracy 0.1509
Epoch 5 Batch 200 Loss 3.2819 Accuracy 0.1514
Epoch 5 Batch 250 Loss 3.2788 Accuracy 0.1518
Epoch 5 Batch 300 Loss 3.2792 Accuracy 0.1520
Epoch 5 Batch 350 Loss 3.2787 Accuracy 0.1522
Epoch 5 Batch 400 Loss 3.2774 Accuracy 0.1523
Epoch 5 Batch 450 Loss 3.2753 Accuracy 0.1524
Epoch 5 Batch 500 Loss 3.2752 Accuracy 0.1524
Epoch 5 Batch 550 Loss 3.2749 Accuracy 0.1525
Epoch 5 Batch 600 Loss 3.2750 Accuracy 0.1524
Epoch 5 Batch 650 Loss 3.2756 Accuracy 0.1524
Epoch 5 Loss 3.2759 Accuracy 0.1523
Time taken for 1 epoch: 1163.28 secs

----- Text generated after epoch 5 -----

ROMEO: tsble  f ekl wrr hrrhi o e sudaehtlkea : ehf ceouet:oeflos   n o  n ently dd eITO tr oo erras e  o  mr erwt s aene e aa  l eitooiotlnyreldldta ormi  mOE oan e.easw

KeyboardInterrupt: 