In [1]:
## implementation & testing --> v1

import torch
import torch.nn as nn
import numpy as np
from torch.nn import functional as F
torch.manual_seed(1337)

print('IMPORTS DONE')

IMPORTS DONE


In [2]:
# Select the compute device: CUDA first, then Apple MPS, otherwise the CPU.
# `device` is always assigned, so the cells below also run on CPU-only machines.
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    # `torch.backends.mps` is missing on older PyTorch builds, hence the getattr guard
    device = 'mps'
else:
    device = 'cpu'

# True when an accelerator (CUDA or MPS) was selected
train_on_gpu = device != 'cpu'

print(device)

cuda


## LSTM w/ Embedding Layer
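##### Note: the script version has no embedding layer (it uses one-hot encoding); this notebook feeds integer-encoded characters through an `nn.Embedding` layer instead.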

In [3]:
## helper functions ##
def get_data(path, encoding='utf-8'):
    '''Read a text file and return its full contents as a single string.

       Arguments
       ---------
       path: Path of the text file to read
       encoding: Text encoding used to decode the file. Defaults to UTF-8 so
                 the decoded text does not depend on the platform locale.

       Returns
       -------
       The contents of the file as a `str`.
    '''
    # the context manager closes the file even if reading fails
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

def encode(text):
    '''Map every character of `text` to an integer id.

       Arguments
       ---------
       text: String to encode

       Returns
       -------
       encoded: 1-D int64 numpy array holding one id per character of `text`
       chars: Tuple of the distinct characters of `text` in sorted order;
              the id of a character is its index in this tuple
    '''
    # Sort the vocabulary: iterating a bare set of strings follows hash order,
    # which differs between interpreter runs, so an unsorted vocabulary would
    # assign different ids to the same characters on every run.
    chars = tuple(sorted(set(text)))
    char2int = {ch: ii for ii, ch in enumerate(chars)}

    # explicit dtype keeps the result integer-typed even for empty text
    encoded = np.array([char2int[ch] for ch in text], dtype=np.int64)

    return encoded, chars
    
def get_batches(arr, batch_size, seq_length):
    '''Generator yielding (inputs, targets) mini-batches, each of shape
       batch_size x seq_length, cut from the encoded array `arr`.

       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence

       Yields
       ------
       x: Window of `seq_length` columns taken from the reshaped array
       y: Same window shifted left by one position (the next-char targets)
    '''
    chars_per_batch = batch_size * seq_length

    # Only whole batches are used; trailing characters are dropped
    usable = (len(arr) // chars_per_batch) * chars_per_batch

    # One row per sequence in the batch
    grid = arr[:usable].reshape((batch_size, -1))
    width = grid.shape[1]

    for start in range(0, width, seq_length):
        stop = start + seq_length

        # The features
        x = grid[:, start:stop]

        # The targets: every position predicts the character that follows it
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        if stop < width:
            y[:, -1] = grid[:, stop]
        else:
            # the final window has no following column, so wrap to the first one
            y[:, -1] = grid[:, 0]

        yield x, y

In [4]:
## network ##
class CharRNN(nn.Module):
    '''Character-level language model:
       embedding -> stacked LSTM -> dropout -> fully-connected layer.

       Arguments
       ---------
       tokens: Sequence of distinct characters; the index of a character in
               this sequence is its integer id
       n_hidden: Size of the LSTM hidden state
       n_layers: Number of stacked LSTM layers
       drop_prob: Dropout probability (between LSTM layers and on the LSTM output)
       lr: Learning rate, stored on the instance (not used inside this class)
       device: Device name, stored on the instance as `self.device`
       train_on_gpu: Flag stored on the instance as `self.train_on_gpu`
       embedding_dim: Size of the character embedding vectors
    '''

    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001,
                               device='cpu',
                               train_on_gpu=False,
                               embedding_dim=32):
        super().__init__()
        self.train_on_gpu = train_on_gpu
        # keep the requested device on the instance instead of relying on a
        # module-level global of the same name
        self.device = device
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        ## Define embedding
        self.embedding = nn.Embedding(len(self.chars), embedding_dim)

        ## Define LSTM
        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer just triggers a warning
        self.lstm = nn.LSTM(embedding_dim, n_hidden, n_layers,
                            dropout=drop_prob if n_layers > 1 else 0.0,
                            batch_first=True)

        ## Define dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## Define fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))


    def forward(self, x, hidden):
        ''' Forward pass through the network.

            Arguments
            ---------
            x: Integer character ids, batch dimension first (the LSTM is
               built with batch_first=True)
            hidden: Tuple (hidden state, cell state) for the LSTM

            Returns
            -------
            out: Scores with one row per (sequence, time step) pair and one
                 column per character
            hidden: Updated (hidden state, cell state) tuple
        '''

        x = self.embedding(x)

        ## Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)

        ## Pass through dropout layer
        out = self.dropout(r_output)

        ## Flatten batch and time dimensions so each row is one time step
        out = out.reshape(-1, self.n_hidden)

        ## Put the flattened outputs through the fully-connected layer
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden


    def init_hidden(self, batch_size):
        ''' Create a zero-initialised (hidden state, cell state) tuple.

            Both tensors have size n_layers x batch_size x n_hidden and are
            allocated with the dtype and on the device of the model's own
            parameters, so they always match the LSTM weights.
        '''
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)

        hidden = (torch.zeros(shape, dtype=weight.dtype, device=weight.device),
                  torch.zeros(shape, dtype=weight.dtype, device=weight.device))

        return hidden

In [5]:
def train(net, data, epochs=10, batch_size=10, 
            seq_length=50, lr=0.001, 
            clip=5, val_frac=0.1, 
            print_every=10,
            train_on_gpu=False,
            device='cpu'):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: encoded (integer) text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
        train_on_gpu: If True, move the network and every batch to `device`
        device: Device used when `train_on_gpu` is True
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data (the tail of `data` is held out)
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if train_on_gpu:
        net.to(device)
    
    counter = 0
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # Wrap the integer-encoded batch in Torch tensors
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if train_on_gpu:
                inputs, targets = inputs.to(device), targets.to(device)

            # Detach the hidden state from its history, otherwise
            # we'd backprop through the entire training history
            h = tuple(each.detach() for each in h)

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.reshape(-1).long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                # no gradients are needed for validation; skipping the
                # autograd graph saves memory and time
                with torch.no_grad():
                    for val_x, val_y in get_batches(val_data, batch_size, seq_length):
                        inputs = torch.from_numpy(val_x)
                        targets = torch.from_numpy(val_y)
                        if train_on_gpu:
                            inputs, targets = inputs.to(device), targets.to(device)

                        output, val_h = net(inputs, val_h)
                        val_loss = criterion(output, targets.reshape(-1).long())

                        val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterating through validation data
                
                # the validation split may be too small to form a single batch;
                # report NaN in that case instead of averaging an empty list
                mean_val_loss = np.mean(val_losses) if val_losses else float('nan')
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(mean_val_loss))


In [6]:
## hyper-params ##
# batch_size: number of sequences per mini-batch
# seq_length: number of characters per sequence
# n_epochs:   number of passes over the training split
# n_hidden:   size of the LSTM hidden state
# n_layers:   number of stacked LSTM layers
batch_size = 128
seq_length = 100
n_epochs = 20
n_hidden=512
n_layers=2

# Load the raw text and turn it into integer ids plus the character vocabulary.
# NOTE: the path is absolute; adjust it to where input.txt lives in your environment.
text = get_data('/content/input.txt')
encoded, chars = encode(text)

# Build the network; every constructor argument not listed here keeps its default.
net = CharRNN(tokens=chars, n_hidden=n_hidden, n_layers=n_layers, 
                train_on_gpu=train_on_gpu, device=device)

# train the model; `train` holds out the last val_frac of `encoded` for
# validation and prints training/validation loss every `print_every` steps
train(net, encoded, 
    epochs=n_epochs, 
    batch_size=batch_size, 
    seq_length=seq_length, 
    lr=0.001, print_every=10,
    train_on_gpu=train_on_gpu, 
    device=device)

Epoch: 1/20... Step: 10... Loss: 3.3458... Val Loss: 3.3501
Epoch: 1/20... Step: 20... Loss: 3.2579... Val Loss: 3.2852
Epoch: 1/20... Step: 30... Loss: 3.1619... Val Loss: 3.1290
Epoch: 1/20... Step: 40... Loss: 3.0204... Val Loss: 2.9455
Epoch: 1/20... Step: 50... Loss: 2.8338... Val Loss: 2.7739
Epoch: 1/20... Step: 60... Loss: 2.6944... Val Loss: 2.6284
Epoch: 1/20... Step: 70... Loss: 2.5432... Val Loss: 2.5173
Epoch: 2/20... Step: 80... Loss: 2.4872... Val Loss: 2.4183
Epoch: 2/20... Step: 90... Loss: 2.4275... Val Loss: 2.3512
Epoch: 2/20... Step: 100... Loss: 2.3643... Val Loss: 2.2948
Epoch: 2/20... Step: 110... Loss: 2.3097... Val Loss: 2.2415
Epoch: 2/20... Step: 120... Loss: 2.2735... Val Loss: 2.2150
Epoch: 2/20... Step: 130... Loss: 2.2362... Val Loss: 2.1763
Epoch: 2/20... Step: 140... Loss: 2.1955... Val Loss: 2.1394
Epoch: 2/20... Step: 150... Loss: 2.1452... Val Loss: 2.1126
Epoch: 3/20... Step: 160... Loss: 2.1271... Val Loss: 2.0719
Epoch: 3/20... Step: 170... Loss: