I assume that you have run the following scripys, all explained in the notebook `01_Prepare_Data.ipynb`.

```bash
bash get_data.sh
```

and then 

```python
python prepare_input_files.py data/austen 'austen.txt' data/austen_clean
python prepare_input_files.py data/shakespeare/ 'shakespeare.txt' data/shakespeare_clean
python prepare_input_files.py data/scikit-learn '*.py' data/sklearn_clean
python prepare_input_files.py data/scalaz '*.scala' data/scalaz_clean

python split_ebooks.py

python train_test_split.py data/austen_clean/ 0.25
python train_test_split.py data/shakespeare_clean/ 0.25
python train_test_split.py data/sklearn_clean/ 0.25
python train_test_split.py data/scalaz_clean/ 0.25
```

If you want to run the code in this notebook from your terminal: 

```python
python train_pytorch.py models/model_pytorch data/sklearn_clean/ data/scalaz_clean
```

with all options one can pass (e.g. `--bidirectional`, `--epochs`, etc...)

Remember that we will have opened a few `txt` files (will pass 1024 sequences from the files, so at least 1024 files will be opened). Set `ulimit -n` high, for example, on terminal:

```bash
ulimit -n 10000
```

And finally remember, our objective is: given a sequence of characters, finding whether the characters correspond to python code or scala (or to Austen's books or Shakespeare). Here is where the interesting things begin and where Nadbor designed a very interesting way to feed the network, using an "infinite" sequence of characters.

Let's go with the details. Let's start with a series of helpers that will be useful to generate batches:

In [1]:
import os
import sys
import numpy as np

from joblib import dump
from text_utils import char2vec, n_chars
from random import choice
from glob import glob
from tqdm import tqdm,trange

import torch
import torch.nn  as nn
import torch.optim as optim
from torch.autograd import Variable
from torch_utils import AverageMeter, Accuracy, RNNCharTagger, BiRNNCharTagger

use_cuda = torch.cuda.is_available()

In [2]:
from random import choice

def chars_from_files(list_of_files):
    """
    open a file from list_of_files and yield the chars
    """
    while True:
        filename = choice(list_of_files)
        with open(filename, 'r') as f:
            chars = f.read()
            for c in chars:
                yield c


def splice_texts(files_a, jump_size_a, files_b, jump_size_b):
    """
    Pick code snippets from source A/B with at least length jump_size_a/b[0]
    and at most length jump_size_a/b[1] and splice them
    
    Params:
    -------
    files_a/b: list of files
    jump_size_a/b: list with two values [min_length, max_length]
    """    
    a_chars = chars_from_files(files_a)
    b_chars = chars_from_files(files_b)
    generators = [a_chars, b_chars]

    a_range = range(jump_size_a[0], jump_size_a[1])
    b_range = range(jump_size_b[0], jump_size_b[1])
    ranges = [a_range, b_range]

    source_ind = choice([0, 1])
    while True:
        jump_size = choice(ranges[source_ind])
        gen = generators[source_ind]
        for _ in range(jump_size):
            yield (gen.__next__(), source_ind)
        source_ind = 1 - source_ind


def generate_batches(files_a, jump_size_a, files_b, jump_size_b, batch_size, sample_len, return_text=False):
    """
    Bacth generator: given a batch_size, it will return a sequence of length sample_len 
    characters where characters from files_a and files_b will be spliced using splice_texts. 

    For example, we have n_chars=96, and say that we use sample_len=100. This generator will yield:
    1) X: an array of shape (1024, 100, 96) that is explained as follows:
        1024 -> batch size
        100  -> length of a sequence of characters
        96   -> one hot encoded char (96 different chars)
    2) y: an array of shape (1024, 100, 1) that is explained as follows:
        1024 -> batch size
        100  -> length of a sequence of characters
        1    -> the label of the corresponding character. In this example 0=python, 1=scala
    3) the text sequences of 100 characters (optional)
    """    

    gens = [splice_texts(files_a, jump_size_a, files_b, jump_size_b) for _ in range(batch_size)]
    while True:
        X = []
        y = []
        texts = []
        for g in gens:
            chars = []
            vecs = []
            labels = []
            for _ in range(sample_len):
                c, l = g.__next__()
                vecs.append(char2vec[c])
                labels.append([l])
                chars.append(c)
            X.append(vecs)
            y.append(labels)

            if return_text:
                texts.append(''.join(chars))

        if return_text:
            yield (np.array(X), np.array(y), texts)
        else:
            yield (np.array(X), np.array(y))

Let's go ahead and check what generate batches produces:

In [3]:
from glob import glob

dir_a = "../data/sklearn_clean/"
dir_b = "../data/scalaz_clean"
files_a = glob(os.path.join(dir_a, "train/*"))
files_b = glob(os.path.join(dir_b, "train/*"))

# using Nadbor's original settings
min_jump_size_a = 20
max_jump_size_a = 200
min_jump_size_b = 20
max_jump_size_b = 200
juma = [min_jump_size_a, max_jump_size_a]
jumb = [min_jump_size_b, max_jump_size_b]

batch_size = 1024
seq_len = 100

In [4]:
gen = generate_batches(files_a, juma, files_b, jumb, batch_size, seq_len, return_text=True)
X, y, texts = gen.__next__()

In [5]:
print(X.shape, y.shape)
print(texts[10])

(1024, 100, 96) (1024, 100, 1)
package scalaz

////
/** Abstraction over a container/context which may or may not provide a value.



Time to build the models.

I will first code a stateful, multi-layer RNN with a final `Timedistributed` linear layer. Remember that `stateful` simply means that the hidden state of the previous batch of sequences will be use as starting state for the current batch. 

Code below is overcommented for clarity. With the comments looks like a lot of coding, but if you remove them looks better :)

In [6]:
class TimeDistributed(nn.Module):
    def __init__(self, module):
        """
        No credit for me for this, I took it from here a while ago and been using it since then: 
        https://github.com/SeanNaren/deepspeech.pytorch/blob/master/model.py
        
        Collapses input of dim T*N*H to (T*N)*H, and applies to a module.
        :param module: Module to apply input to.
        """
        super(TimeDistributed, self).__init__()
        self.module = module

    def forward(self, x):
        t, n = x.size(0), x.size(1)

        # let's collapse the dimensions
        # contiguous simply returns a contiguous tensor containing the same data as self 
        x = x.contiguous().view(t * n, -1)

        # apply the module we pass when we initialize the class (in our case will be a Linear layer)
        x = self.module(x)

        # and expand dimensions
        x = x.contiguous().view(t, n, -1)
        return x

    def __repr__(self):
        # let's make it unambiguous and consistent with other pytorch layers
        tmpstr = self.__class__.__name__ + ' (\n'
        tmpstr += self.module.__repr__()
        tmpstr += ')'
        return tmpstr 
    
    
class RNNCharTagger(nn.Module):
    """
    Here is where the fun begins!

    Parameters are self explanatory
    """
    def __init__(self, lstm_layers, input_dim, out_dim, batch_size, dropout, batch_first=True):
        super(RNNCharTagger, self).__init__()
        
        
        self.lstm_layers = lstm_layers
        self.input_dim = input_dim
        self.out_dim = out_dim
        self.dropout = dropout
        self.batch_first = batch_first
        self.batch_size = batch_size


        # LSTM layers: because we have to made every layer stateful, we need to define them one by one. 
        # Note that the dropout option in pytorch adds dropout after all but last recurrent layer. 
        # This means that if you define a one layer LSTM and add dropout, will do nothing. 
        # The solution is easy, let's manually add it after every recurrent layer
        self.lstm1 =  nn.LSTM(self.input_dim, self.out_dim, batch_first=self.batch_first)
        self.drop1 = nn.Dropout(self.dropout)

        # for every layer after the 1st one, we define the RNN and add dropout for all but last one
        for i in range(1,self.lstm_layers):
            if (i+1) < self.lstm_layers:
                setattr(self, 'lstm'+str(i+1), nn.LSTM(self.out_dim, self.out_dim, batch_first=self.batch_first))
                setattr(self, 'drop'+str(i+1), nn.Dropout(self.dropout))
            else:
                setattr(self, 'lstm'+str(i+1), nn.LSTM(self.out_dim, self.out_dim, batch_first=self.batch_first))

        # The Timedistributed layer after the RNN layers
        self.linear = TimeDistributed(nn.Linear(self.out_dim, 1))

        # Initialize cell states
        for i in range(self.lstm_layers):

            # one could also initialize as zeros.
            # setattr(self, 'h'+str(i+1), nn.Parameter(torch.zeros(1, self.batch_size, self.out_dim)))
            # setattr(self, 'c'+str(i+1), nn.Parameter(torch.zeros(1, self.batch_size, self.out_dim)))
            setattr(self, 'h'+str(i+1), nn.Parameter(nn.init.normal_(torch.Tensor(1, self.batch_size, self.out_dim))))
            setattr(self, 'c'+str(i+1), nn.Parameter(nn.init.normal_(torch.Tensor(1, self.batch_size, self.out_dim))))

    def forward(self, X):

        # in the first forward pass we will use the initialized cell state and store 
        # the output state per LSTM layer. If we used just oned layer, this would be
        # easier:  

        # output, self.hidden = self.lstm(X, (self.hidden))

        # Because we use multiple layers and we want the code to be readable, we store
        # the cell states in a list and update at the end
        output, (h1, c1) = self.lstm1(X, (self.h1, self.c1))
        output = self.drop1(output)
        hidden_states = [(h1,c1)]
        for i in range(1,self.lstm_layers):
            h,c = getattr(self, 'h'+str(i+1)), getattr(self, 'c'+str(i+1))
            output, (nh,nc) = getattr(self, 'lstm'+str(i+1))(output, (h,c))
            if (i+1) < self.lstm_layers:
                output = getattr(self, 'drop'+str(i+1))(output)
            hidden_states.append((nh,nc))

        for i in range(self.lstm_layers):
            setattr(self, 'h'+str(i+1), nn.Parameter(hidden_states[i][0].data))
            setattr(self, 'c'+str(i+1), nn.Parameter(hidden_states[i][1].data))

        # Finally, sigmoid on the output to classify as python or scala 
        output = torch.sigmoid(self.linear(output))

        return output   

During the process, Nadbor realised that some of the misclassifications raised from the fact that the RNN *"can only interpret a character in the context of characters that came before"*. In other words, a Bidirectional LSTM, where sequences are fed from both ends will potentially solve the issue. For more details around this discussion, I again recommend reading his post. When using bidirectional LSTMs it does not make much sense to use a stateful RNN, which is easy to understand. Simply explained, we are feeding the network from both ends, so half of the state is not directly related (meaning naturally following a sequence) to the previous state.

After the RNN code above, coding a `Bidirectional` LSTMs looks pretty straightforward.

In [7]:
class BiRNNCharTagger(nn.Module):
    def __init__(self, lstm_layers, input_dim, out_dim, batch_size, dropout, batch_first=True):
        super(BiRNNCharTagger, self).__init__()

        self.lstm_layers = lstm_layers
        self.input_dim = input_dim
        self.out_dim = out_dim
        self.dropout = dropout
        self.batch_first = batch_first
        self.batch_size = batch_size

        self.lstm =  nn.LSTM(
            self.input_dim,
            self.out_dim,
            batch_first=self.batch_first,
            dropout=self.dropout,
            num_layers = self.lstm_layers,
            bidirectional=True)
        self.linear = TimeDistributed(nn.Linear(2*self.out_dim, 1))

    def forward(self, X):
        lstm_output, hidden = self.lstm(X)
        output = torch.sigmoid(self.linear(lstm_output))
        return output

As simple as that! 

Let's define a train function

These can be simple functions as the ones in the cell below (again, overcommented)

In [8]:
def train(train_gen, model, criterion, optimizer, epoch, steps_per_epoch):
    """
    Params:
    -----------
    train_gen: train generator
    model    : pytorch model
    criterion: loss function
    optimizer: your favourite optimizer
    epoch    : integer indicating the current epoch
    steps_per_epoch: how many steps will define an epoch
    """
    
    # switch to train mode
    model.train()

    # we will use tqdm for pretty progressbars
    with trange(steps_per_epoch) as t:
        for i in t:
            t.set_description('epoch %i' % epoch)

            # 1. Generate X and y and move them to cuda if use_cuda
            X,y = train_gen.__next__()
            X = torch.from_numpy(X).float()
            y = torch.from_numpy(y).float()
            if use_cuda:
                X, y = X.cuda(), y.cuda()

            # 2. Pytorch accumulates gradients. We need to clear them after each step
            optimizer.zero_grad()

            # 3. Run the forward pass
            y_pred = model(X)
            
            # 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()            
            loss = criterion(y_pred, y)
            # if using previous torch versions
            # t.set_postfix(loss=loss.data[0])            
            t.set_postfix(loss=loss.item())
            loss.backward()
            optimizer.step()

Before defining the validation function, which is going to be nearly identical to `train`, let me define a couple of helpers. In pytorch there is not such a thing as an *"accuracy metric"*, or not that I know, but is very easy to code one. In addition, after the validation steps, we would like to see the mean of the validation metrics after each step. 

With that in mind, let's define two functions: 

In [9]:
class AverageMeter(object):
    """Computes and stores the average and current value
    from here: https://github.com/SeanNaren/deepspeech.pytorch/blob/master/model.py
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class Accuracy(nn.Module):
    """
    wrapper to compute accuracy
    """
    def __init__(self):
        super(Accuracy, self).__init__()

    def forward(self,y_pred,y):
        y_pred = (y_pred.view(-1, 1) > 0.5).data.float()
        y = y.view(-1, 1).data.float()
        # if using previous torch versions
        # acc = (y_pred == y).sum()/y.size(0)
        acc = (y_pred == y).sum().item()/y.size(0)
        return Variable(torch.FloatTensor([acc]))

    def __repr__(self):
        return self.__class__.__name__ + '(\n)'

And now, the validation stage, which is identical to `train`, with the exception we do not clean gradients or update parameters, we simple *"predict"* and compute validation metrics:

In [10]:
def validate(val_gen, model, metrics, validation_steps):

    # switch to evaluate mode
    model.eval()

    losses = []
    for i in range(len(metrics)):
        losses.append(AverageMeter())

    with torch.no_grad():
        with trange(validation_steps) as t:
            for i in t:
                t.set_description('validating')
                X,y = val_gen.__next__()
                X = torch.from_numpy(X).float()
                y = torch.from_numpy(y).float()
                if use_cuda:
                    X, y = X.cuda(), y.cuda()
                y_pred = model(X)
                for i in range(len(metrics)):
                    # if using previous torch versions
                    # losses[i].update(metrics[i](y_pred, y).data[0])
                    losses[i].update(metrics[i](y_pred, y).item())

            for metric,loss in zip(metrics, losses):
                print("val_{}: {}".format(metric.__repr__().split("(")[0], loss.val))

Now we can train. Let's 1st just define the model.

In [11]:
lstm_layers = 3
input_dim = n_chars
out_dim = 128
batch_size = 1024
dropout = 0.2

model = RNNCharTagger(lstm_layers, input_dim, out_dim, batch_size, dropout)
use_cuda = torch.cuda.is_available()
if use_cuda:
    model = model.cuda()
print(model)

RNNCharTagger(
  (lstm1): LSTM(96, 128, batch_first=True)
  (drop1): Dropout(p=0.2, inplace=False)
  (lstm2): LSTM(128, 128, batch_first=True)
  (drop2): Dropout(p=0.2, inplace=False)
  (lstm3): LSTM(128, 128, batch_first=True)
  (linear): TimeDistributed (
  Linear(in_features=128, out_features=1, bias=True))
)


In [12]:
dir_a = "../data/sklearn_clean/"
dir_b = "../data/scalaz_clean/"

# training and validation files
train_a = glob(os.path.join(dir_a, "train/*"))
train_b = glob(os.path.join(dir_b, "train/*"))
val_a = glob(os.path.join(dir_a, "test/*"))
val_b = glob(os.path.join(dir_b, "test/*"))

# sequences of less than 200 and more than 20 will be spliced together
min_jump_size_a = 20
min_jump_size_b = 20
max_jump_size_a = 200
max_jump_size_b = 200
juma = [min_jump_size_a, max_jump_size_a]
jumb = [min_jump_size_b, max_jump_size_b]

# length of the resulting sequence that will be passed to the RNN model
seq_len = 100

# start the generators
train_gen = generate_batches(train_a, juma, train_b, jumb, batch_size, seq_len)
val_gen = generate_batches(val_a, juma, val_b, jumb, batch_size, seq_len)

# set training and validation parameters
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
metrics = [nn.MSELoss(), nn.BCELoss(), Accuracy()]
epochs = 5
steps_per_epoch = 100
validation_steps = 50    
for epoch in range(1,epochs+1):
    train(train_gen, model, criterion, optimizer, epoch, steps_per_epoch)
    validate(val_gen, model, metrics, validation_steps)

epoch 1: 100%|██████████| 100/100 [00:40<00:00,  2.45it/s, loss=0.117]
validating: 100%|██████████| 50/50 [00:21<00:00,  2.38it/s]
epoch 2:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.1320791393518448
val_BCELoss: 0.4182758033275604
val_Accuracy: 0.8195605278015137


epoch 2: 100%|██████████| 100/100 [00:42<00:00,  2.37it/s, loss=0.0685]
validating: 100%|██████████| 50/50 [00:21<00:00,  2.35it/s]
epoch 3:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.0686672031879425
val_BCELoss: 0.24117805063724518
val_Accuracy: 0.9116015434265137


epoch 3: 100%|██████████| 100/100 [00:42<00:00,  2.36it/s, loss=0.0549]
validating: 100%|██████████| 50/50 [00:21<00:00,  2.34it/s]
epoch 4:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.05160655081272125
val_BCELoss: 0.1886126548051834
val_Accuracy: 0.9353417754173279


epoch 4: 100%|██████████| 100/100 [00:42<00:00,  2.35it/s, loss=0.0466]
validating: 100%|██████████| 50/50 [00:21<00:00,  2.34it/s]
epoch 5:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.04274223744869232
val_BCELoss: 0.15919990837574005
val_Accuracy: 0.9464550614356995


epoch 5: 100%|██████████| 100/100 [00:43<00:00,  2.28it/s, loss=0.0373]
validating: 100%|██████████| 50/50 [00:22<00:00,  2.25it/s]

val_MSELoss: 0.03736407682299614
val_BCELoss: 0.14223158359527588
val_Accuracy: 0.9540429711341858





Let's now try with Bidirectional LSTMs. 

In [13]:
del(model)
model = BiRNNCharTagger(lstm_layers,n_chars,out_dim,batch_size,dropout)
use_cuda = torch.cuda.is_available()
if use_cuda:
    model = model.cuda()
print(model)

BiRNNCharTagger(
  (lstm): LSTM(96, 128, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (linear): TimeDistributed (
  Linear(in_features=256, out_features=1, bias=True))
)


In [14]:
# start the generators
train_gen = generate_batches(train_a, juma, train_b, jumb, batch_size, seq_len)
val_gen = generate_batches(val_a, juma, val_b, jumb, batch_size, seq_len)

# set training and validation parameters
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 5
for epoch in range(1,epochs+1):
    train(train_gen, model, criterion, optimizer, epoch, steps_per_epoch)
    validate(val_gen, model, metrics, validation_steps)

epoch 1: 100%|██████████| 100/100 [00:56<00:00,  1.77it/s, loss=0.0815]
validating: 100%|██████████| 50/50 [00:22<00:00,  2.23it/s]
epoch 2:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.08015353977680206
val_BCELoss: 0.26529833674430847
val_Accuracy: 0.8909960985183716


epoch 2: 100%|██████████| 100/100 [00:56<00:00,  1.77it/s, loss=0.0458]
validating: 100%|██████████| 50/50 [00:22<00:00,  2.25it/s]
epoch 3:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.04603021591901779
val_BCELoss: 0.15985704958438873
val_Accuracy: 0.9394824504852295


epoch 3: 100%|██████████| 100/100 [00:56<00:00,  1.77it/s, loss=0.0359]
validating: 100%|██████████| 50/50 [00:23<00:00,  2.17it/s]
epoch 4:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.035017021000385284
val_BCELoss: 0.12092449516057968
val_Accuracy: 0.952832043170929


epoch 4: 100%|██████████| 100/100 [00:56<00:00,  1.76it/s, loss=0.0267]
validating: 100%|██████████| 50/50 [00:22<00:00,  2.18it/s]
epoch 5:   0%|          | 0/100 [00:00<?, ?it/s]

val_MSELoss: 0.02851518616080284
val_BCELoss: 0.10061416029930115
val_Accuracy: 0.9622167944908142


epoch 5: 100%|██████████| 100/100 [00:56<00:00,  1.76it/s, loss=0.0224]
validating: 100%|██████████| 50/50 [00:22<00:00,  2.25it/s]

val_MSELoss: 0.023322539404034615
val_BCELoss: 0.08188345283269882
val_Accuracy: 0.9694140553474426





Nice!

Let's save the model and plot the results

In [15]:
model_path = "../models/model_pytorch"
MODEL_DIR = model_path.split("/")[0]
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
torch.save(model.state_dict(), model_path)

The only thing left to do is tagging characters and plotting.

Tagging...

In [16]:
model = BiRNNCharTagger(lstm_layers,n_chars,out_dim,batch_size,dropout)
model.load_state_dict(torch.load(model_path))
model = model.cuda()
model.eval()

gen = generate_batches(val_a, juma, val_b, jumb, batch_size, seq_len, return_text=True)
steps = 50

# 1. Store the predictions, labels and corresponding text
predictions, labels, texts = [],[],[]
with trange(steps) as t:
    for i in t:
        X,y,text = gen.__next__()
        X_var = Variable(torch.from_numpy(X).float())
        y_var = Variable(torch.from_numpy(y).float())
        if use_cuda:
            X_var, y_var = X_var.cuda(), y_var.cuda()
        pr = model(X_var)
        predictions.append(pr.data)
        labels.append(y_var.data)
        texts.append(text)

preds = torch.cat(predictions,dim=1).reshape(batch_size,steps*seq_len)
preds = preds.cpu().numpy()
labs = torch.cat(labels,dim=1).reshape(batch_size,steps*seq_len)
labs = labs.cpu().numpy()
txts = []
for j in range(batch_size):
    txts.append("".join([texts[i][j] for i in range(steps)]))


output_dir = "output/sklearn_or_scala_preds_pytorch"    
try:
    os.makedirs(output_dir)
except os.error:
    pass
for i in range(batch_size):
    path = os.path.join(output_dir, 'part_' + str(i).zfill(5) + ".joblib")
    dump((txts[i], preds[i], labs[i]), path)

100%|██████████| 50/50 [00:18<00:00,  2.75it/s]


And finally plotting...

In [17]:
import matplotlib
import matplotlib.pyplot as plt

from joblib import load
%matplotlib inline

# just temporarily copied this module to the notebooks dir so it runs
from plot_predictions import prediction_to_html

predictions_dir = "output/sklearn_or_scala_preds_pytorch" 
output_dir = "output/sklearn_or_scala_preds_pytorch_html"

try:
    os.makedirs(output_dir)
except os.error:
    pass
files = glob(os.path.join(predictions_dir, "*"))
for i, f in enumerate(files[100:110]):
    text, prediction, labels = load(f)
    html = prediction_to_html(text, prediction, labels, cmap="Reds")
    out_path = os.path.join(output_dir, 'part-' + str(i).zfill(5) + ".html")
    with open(out_path, "w") as out:
        out.write(html)

In [18]:
from IPython.display import display, HTML
display(HTML(filename="output/sklearn_or_scala_preds_pytorch_html/part-00004.html"))