RNN for calculating perplexity (based on https://github.com/LeanManager/NLP-PyTorch/blob/master/Character-Level%20LSTM%20with%20PyTorch.ipynb)

In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F

# Module-level table of training results; starts empty with one column per
# recorded field (epoch number, corpus name, validation loss, validation perplexity).
_RESULT_COLUMNS = ['epoch', 'corpus', 'val_loss', 'val_perplexity']
rnn_results_df = pd.DataFrame(columns=_RESULT_COLUMNS)

In [None]:
# One hot encode
def one_hot_encode(arr, n_labels):
    '''One-hot encode an array of integer labels along a new trailing axis.

        Args
        ----
        arr: Integer array of label indices, of any shape. Values are used
             as indices into the one-hot axis, so they are expected to lie
             in [0, n_labels).
        n_labels: Length of the one-hot axis (number of distinct labels)

        Returns
        -------
        float32 array of shape (*arr.shape, n_labels) with a single 1. per
        input element.
    '''
    arr = np.asarray(arr)

    # arr.size works for any number of dimensions; multiplying the shape
    # entries pairwise only worked for exactly 2-D input.
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)

    # One row per input element: set the column selected by its label
    one_hot[np.arange(arr.size), arr.ravel()] = 1.

    # Restore the input layout, with the one-hot axis appended
    return one_hot.reshape((*arr.shape, n_labels))

# Batches
def get_batches(arr, n_seqs, n_steps):
    '''Yield (x, y) training batches of shape n_seqs x n_steps cut from arr.

        y holds the same symbols as x shifted one position to the left, so
        y[i, j] is the symbol that follows x[i, j]. The final target column
        of the last batch wraps around to the first column of the data.

        Args
        ----
        arr: 1-D array to cut into batches
        n_seqs: Batch size, number of sequences (rows) per batch
        n_steps: Number of sequence steps (columns) per batch
    '''
    chars_per_batch = n_seqs * n_steps
    full_batches = len(arr) // chars_per_batch

    # Drop the tail that does not fill a whole batch, then lay the data
    # out as n_seqs parallel rows.
    grid = arr[:full_batches * chars_per_batch].reshape((n_seqs, -1))
    width = grid.shape[1]

    for start in range(0, width, n_steps):
        stop = start + n_steps
        x = grid[:, start:stop]

        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The column right after this window supplies the last target;
        # past the end of the data, wrap to column 0.
        follow = stop if stop < width else 0
        y[:, -1] = grid[:, follow]

        yield x, y

In [None]:
class CharRNN(nn.Module):
    ''' Character-level LSTM language model.

        One-hot encoded characters go through a stacked LSTM, dropout and a
        linear layer that produces one logit per character in the vocabulary.

        Args
        ----
        tokens: Sequence of distinct characters forming the vocabulary
        n_steps: Accepted for interface compatibility; not used by the model
        n_hidden: Size of the LSTM hidden state
        n_layers: Number of stacked LSTM layers
        drop_prob: Dropout probability (between LSTM layers and on the LSTM output)
        lr: Learning rate, stored on the instance only
    '''
    def __init__(self, tokens, n_steps=100, n_hidden=256, n_layers=2, 
                 drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        # Character <-> integer index lookup tables
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        # batch_first=True: inputs are (batch, seq, n_chars)
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, len(self.chars))
        self.init_weights()

    def forward(self, x, hc):
        ''' Forward pass. Inputs are `x` and hidden/cell state are `hc`.

            Returns the logits flattened to (batch * seq, n_chars) and the
            new hidden state (h, c).
        '''
        # Get the lstm outputs and the new hidden state (h, c)
        x, (h, c) = self.lstm(x, hc)
        # Pass x through the dropout layer
        x = self.dropout(x)
        # Stack all time steps of all sequences as rows for the linear layer
        x = x.reshape(-1, self.n_hidden)
        # Put x through the fully-connected layer
        x = self.fc(x)

        return x, (h, c)

    def predict(self, device, char, h=None, top_k=None):
        ''' Given a character, sample the next character.

            Args
            ----
            device: Device to run the model on
            char: Current character; must be in the vocabulary (KeyError otherwise)
            h: Hidden state (h, c) from the previous call, or None to start from zeros
            top_k: If given, sample only among the top_k most likely characters

            Returns the sampled character and the new hidden state.
        '''
        self.to(device)
        if h is None:
            h = self.init_hidden(1)

        # One-hot input of shape (batch=1, seq=1, n_chars), built on the target device
        inputs = torch.zeros((1, 1, len(self.chars)), dtype=torch.float32, device=device)
        inputs[0, 0, self.char2int[char]] = 1.0

        # Cut the hidden state off from any earlier graph
        h = tuple(each.detach() for each in h)
        # Pure inference: no autograd graph is needed
        with torch.no_grad():
            out, h = self(inputs, h)
            p = F.softmax(out, dim=1).cpu()

        if top_k is None:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            # reshape(-1) keeps a 1-D array even when top_k == 1; squeeze()
            # would collapse it to 0-d, which np.random.choice rejects.
            top_ch = top_ch.numpy().reshape(-1)

        p = p.numpy().reshape(-1)
        char = np.random.choice(top_ch, p=p/p.sum())
        return self.int2char[int(char)], h

    def init_weights(self):
        ''' Initialize weights for fc layer '''
        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform in [-1, 1)
        self.fc.weight.data.uniform_(-1, 1)

    def init_hidden(self, n_seqs):
        ''' Initializes hidden state

            Returns a tuple (h, c) of zero tensors, each of size
            n_layers x n_seqs x n_hidden, with the dtype and device of the
            model parameters.
        '''
        weight = next(self.parameters())
        state_shape = (self.n_layers, n_seqs, self.n_hidden)
        return (weight.new_zeros(state_shape),
                weight.new_zeros(state_shape))
        

class EarlyStopper:
    ''' Tracks the best validation loss and signals when to stop training.

        Args
        ----
        patience: Number of counted bad checks after which to stop
        min_delta: Margin above the best loss that a check must exceed to
                   count as bad
    '''
    def __init__(self, patience=1, min_delta=0):
        self.min_validation_loss = np.inf
        self.counter = 0
        self.patience = patience
        self.min_delta = min_delta

    def early_stop(self, validation_loss):
        ''' Record one validation loss; return True when training should stop. '''
        # A new best loss resets the bad-check counter
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        # Losses within min_delta of the best leave the counter untouched
        threshold = self.min_validation_loss + self.min_delta
        if not validation_loss > threshold:
            return False

        self.counter += 1
        if self.counter < self.patience:
            return False
        return True

In [None]:
# Train the model (without Cross Validation)
def _run_validation(net, device, val_data, n_seqs, n_steps, criterion):
    ''' Compute mean loss and perplexity of `net` on `val_data`.

        Runs in eval mode (dropout disabled) and without gradient tracking,
        then restores the training mode the network had on entry.

        Returns (val_loss, val_perplexity), where val_perplexity is
        exp(val_loss). Both are NaN when val_data is too short to form a
        single batch.
    '''
    n_chars = len(net.chars)
    was_training = net.training
    net.eval()
    batch_losses = []
    try:
        with torch.no_grad():
            val_h = net.init_hidden(n_seqs)
            for x, y in get_batches(val_data, n_seqs, n_steps):
                inputs = torch.from_numpy(one_hot_encode(x, n_chars)).to(device)
                targets = torch.from_numpy(y).to(device).reshape(-1).long()
                output, val_h = net(inputs, val_h)
                batch_losses.append(criterion(output, targets).item())
    finally:
        net.train(was_training)

    if not batch_losses:
        return float('nan'), float('nan')

    # All batches have the same number of targets, so the mean of the batch
    # losses is the mean per-character loss, and its exponential is the
    # perplexity over the whole validation split.
    val_loss = float(np.mean(batch_losses))
    return val_loss, float(np.exp(val_loss))


def train(lang, net, device, data, epochs=10, n_seqs=128, n_steps=100, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network

        The last `val_frac` of `data` is held out for validation. Once per
        epoch, at the first step whose global count is a multiple of
        `print_every`, validation loss and perplexity are printed and
        appended as a row to the module-level `rnn_results_df`.

        Args
        ----
        lang: Current corpus
        net: CharRNN network
        device: CPU or GPU (cuda)
        data: text data to train on, as an array of integer character indices
        epochs: Number of epochs to train
        n_seqs: Number of sequences per batch (batch size)
        n_steps: Number of character steps per batch
        lr: learning rate
        clip: gradient clipping (maximum gradient norm)
        val_frac: Validation data as fraction
        print_every: Number of steps for printing training/validation loss
    '''
    global rnn_results_df

    # Move the model first so the optimizer is built on the final parameters
    net.to(device)
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Hold out the tail of the data for validation
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        h = net.init_hidden(n_seqs)
        logged_this_epoch = False
        for x, y in get_batches(data, n_seqs, n_steps):
            counter += 1
            inputs = torch.from_numpy(one_hot_encode(x, n_chars)).to(device)
            targets = torch.from_numpy(y).to(device).reshape(-1).long()

            # Detach the hidden state so backprop stops at the batch
            # boundary instead of running through the whole history
            h = tuple(each.detach() for each in h)

            net.zero_grad()
            output, h = net(inputs, h)
            loss = criterion(output, targets)
            loss.backward()
            # Clip the gradient norm to avoid exploding gradients
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # Only the first validation result of each epoch is reported,
            # so skip the validation pass entirely once it has been logged.
            if logged_this_epoch or counter % print_every != 0:
                continue

            val_loss, val_perplexity = _run_validation(
                net, device, val_data, n_seqs, n_steps, criterion)

            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Loss: {:.4f}...".format(loss.item()),
                  "Val Loss: {:.4f}...".format(val_loss),
                  "Val Perplexity: {:.4f}".format(val_perplexity))

            temp = pd.DataFrame({
                'epoch': e+1,
                'corpus': lang,
                'val_loss': val_loss,
                'val_perplexity': val_perplexity
            }, index=[0])
            rnn_results_df = pd.concat([rnn_results_df, temp], ignore_index=True)
            logged_this_epoch = True


# Hyperparams
n_seqs = 128
n_steps = 100
epochs = 20
lr = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
maindir = './corpora/current_corpora/'

# Train one fresh model per .txt corpus in maindir. The directory listing is
# sorted so corpora are processed in the same order on every run.
for file in sorted(os.listdir(maindir)):
    if not file.endswith('.txt'):
        continue

    # Corpus label: the file name up to the first underscore
    lang = os.path.splitext(os.path.basename(file))[0].split('_')[0]
    with open(os.path.join(maindir, file), 'r', encoding='utf-8') as f:
        text = f.read()

    print(f'Starting {file}...')
    # Sorted vocabulary gives the same char <-> index mapping on every run;
    # the iteration order of a bare set of strings varies between interpreter runs.
    chars = tuple(sorted(set(text)))
    int2char = dict(enumerate(chars))
    char2int = {ch: ii for ii, ch in int2char.items()}
    encoded = np.array([char2int[ch] for ch in text])
    net = CharRNN(chars, n_hidden=512, n_layers=2)
    train(lang, net, device, encoded, epochs=epochs, n_seqs=n_seqs, n_steps=n_steps, lr=lr, clip=5)

In [None]:
# Read the tab-separated results table from disk into a DataFrame
results_path = './rnn_results'
df = pd.read_csv(results_path, sep='\t')

In [45]:
# # Randomized search (RandomizedSearchCV) for optimal params
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import make_scorer
# from sklearn.metrics import accuracy_score

# param_dist = {
#     'hidden_size': [64, 128, 256],
#     'num_layers': [1, 2],
#     'dropout': [0.2, 0.5],
#     'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2]
# }


# with open('./current_corpora/en_wiki_extractor.txt', 'r', encoding='utf-8') as f:
#     content = f.read()
# chars = tuple(set(content))
# int2char = dict(enumerate(chars))
# char2int = {ch: ii for ii, ch in int2char.items()}
# data = np.array([char2int[ch] for ch in content])
# LSTMModel = CharRNN(chars, n_hidden=512, n_layers=2)

# X_batches = []
# y_batches = []

# for x, y in get_batches(data, n_seqs=128, n_steps=100):
#     x = one_hot_encode(x, len(chars))
#     X_batches.append(x)
#     y_batches.append(y)
    
# X_train = np.concatenate(X_batches)
# y_train = np.concatenate(y_batches)
# y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)
# print(X_train.shape, y_train.shape)

# # scorer = make_scorer(accuracy_score)

# # random_search = RandomizedSearchCV(
# #     LSTMModel, param_distributions=param_dist, n_iter=100, scoring=scorer, cv=3
# # )

# # random_search.fit(X_train, y_train)
# # best_params = random_search.best_params_

(39808, 100, 28) (39808, 100, 1)


In [None]:
# # Train the model (with Cross Validation)        
# def train(net, device, data, n_folds=5, epochs=10, n_seqs=10, n_steps=50, lr=0.001, clip=5):
#     ''' Training a network
    
#         Args
#         ----
#         net: CharRNN network
#         device: CPU or GPU (cuda)
#         data: text data to train on
#         n_folds: Number of folds for cross-validation
#         epochs: Number of epochs to train
#         n_seqs: Number of sequences per batch (batch size)
#         n_steps: Number of character steps per batch
#         lr: learning rate
#         clip: gradient clipping
#     '''
    
#     net.train()
    
#     opt = torch.optim.Adam(net.parameters(), lr=lr)
#     criterion = nn.CrossEntropyLoss()
#     kf = KFold(n_splits=n_folds, shuffle=True)
#     n_chars = len(net.chars)
    
#     net.to(device)
        
#     for fold, (train_idx, val_idx) in enumerate(kf.split(data)):
        
#         train_data, val_data = data[train_idx], data[val_idx]
    
#         for e in range(epochs):
#             h = net.init_hidden(n_seqs)
#             flag = False
          
#             for x, y in get_batches(train_data, n_seqs, n_steps):
              
#                 # One hot encode and convert to Torch tensors
#                 x = one_hot_encode(x, n_chars)
#                 inputs, targets = torch.from_numpy(x).to(device), torch.from_numpy(y).to(device)

#                 # Create new variables for the hidden state to avoid backprop
#                 # through the entire training history
#                 h = tuple([each.data for each in h])

#                 net.zero_grad()
#                 output, h = net.forward(inputs, h)
#                 loss = criterion(output, targets.view(n_seqs*n_steps).type(torch.LongTensor))

#                 loss.backward()
                
#                 # `clip_grad_norm` to avoid exploding gradient
#                 nn.utils.clip_grad_norm_(net.parameters(), clip)

#                 opt.step()
              
#                 if not flag:
#                     print("Epoch: {}/{}...".format(e+1, epochs),
#                         "Loss: {:.4f}...".format(loss.item()))
#                     flag = True
          
#         # Evaluate on validation data
#         with torch.no_grad():
#             val_losses = []
#             val_h = net.init_hidden(n_seqs)
#             net.eval()
            
#             for x, y in get_batches(val_data, n_seqs, n_steps):
#                 x = one_hot_encode(x, len(net.chars))
#                 x, y = torch.from_numpy(x), torch.from_numpy(y)
#                 val_h = tuple([each.data for each in val_h])
#                 inputs, targets = x.to(device), y.to(device)
                
#                 output, val_h = net.forward(inputs, val_h)
#                 val_loss = criterion(output, targets.view(n_seqs * n_steps).type(torch.LongTensor))
#                 val_losses.append(val_loss.item())
                
#             avg_val_loss = np.mean(val_losses)
            
#             print("Fold: {}/{}...".format(fold + 1, n_folds), 
#                 "Validation Loss: {:.4f}...".format(avg_val_loss))        


# # Train with Cross Validation

# # Hyperparams
# n_seqs = 64
# n_steps = 50
# n_folds = 5
# epochs = 10
# lr = 0.001
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# net = CharRNN(chars, n_hidden=512, n_layers=2)

# train(net, device, encoded, n_folds=n_folds, epochs=epochs, n_seqs=n_seqs, n_steps=n_steps, lr=lr, clip=5)

In [None]:
# # Train with Cross Validation

# # Hyperparams
# n_seqs = 64
# n_steps = 50
# n_folds = 5
# epochs = 10
# lr = 0.001
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# train(net, device, encoded, n_folds=n_folds, epochs=epochs, n_seqs=n_seqs, n_steps=n_steps, lr=lr, clip=5)

In [None]:
# # Save the model

# model_name = 'char_rnn.net'

# checkpoint = {'n_hidden': net.n_hidden,
#               'n_layers': net.n_layers,
#               'state_dict': net.state_dict(),
#               'tokens': net.chars}

# with open(model_name, 'wb') as f:
#     torch.save(checkpoint, f)

In [None]:
# Test/Sample model

# def sample(net, device, size, prime='The', top_k=None):
     
#     net.to(device)
#     net.eval()
    
#     # Run through the prime chars
#     chars = [ch for ch in prime]
    
#     h = net.init_hidden(1)
#     for ch in prime:
#         char, h = net.predict(device, ch, h, top_k=top_k)

#     chars.append(char)
    
#     # Pass prev char and get new one
#     for ii in range(size):
#         char, h = net.predict(device, chars[-1], h, top_k=top_k)
#         chars.append(char)

#     return ''.join(chars)

In [None]:
# Load a trained model

# with open('modelname.net', 'rb') as f:
#     checkpoint = torch.load(f)
    
# loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
# loaded.load_state_dict(checkpoint['state_dict'])

# print(sample(loaded, device, 2000, top_k=5, prime="Example text"))