In [8]:
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
import random
import math
import time

from pathlib import Path
import dill as dpickle

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout).cuda()
        
        self.dropout = nn.Dropout(dropout).cuda()
        
    def forward(self, input):
        
        #src = [input_len, batch_size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [input_len, batch_size, emb_dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        return hidden, cell

In [3]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout).cuda()
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout).cuda()
        
    def forward(self, input, hidden, cell):
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input)).to(device)
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [4]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        print(input.shape)
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

In [5]:
def read_training_files(data_path:str):
    """
    Read data from directory
    """
    PATH = Path(data_path)

    with open(PATH/'train.function', 'r') as f:
        t_enc = f.readlines()

    with open(PATH/'valid.function', 'r') as f:
        v_enc = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_enc = t_enc + v_enc

    with open(PATH/'test.function', 'r') as f:
        h_enc = f.readlines()

    with open(PATH/'train.docstring', 'r') as f:
        t_dec = f.readlines()

    with open(PATH/'valid.docstring', 'r') as f:
        v_dec = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_dec = t_dec + v_dec

    with open(PATH/'test.docstring', 'r') as f:
        h_dec = f.readlines()

    return tv_enc, h_enc, tv_dec, h_dec

In [6]:
train_code, holdout_code, train_docstring, holdout_docstring = read_training_files('processed_data/')

In [7]:
#import sys
#!{sys.executable} -m pip install ktext

In [7]:
#from ktext.preprocess import processor

In [9]:
code_processor = processor(heuristic_pct_padding=.7, keep_n=20000)
train_code = code_processor.fit_transform(train_code)

docstring_processor = processor(append_indicators=True, heuristic_pct_padding=.7, keep_n=14000, padding ='post')
train_docstring = docstring_processor.fit_transform(train_docstring)

 See full histogram by insepecting the `document_length_stats` attribute.
 See full histogram by insepecting the `document_length_stats` attribute.


In [10]:
import dill as dpickle
import numpy as np

with open('seq2seq/py_code_processor_v2.dpkl', 'wb') as f:
    dpickle.dump(code_processor, f)

with open('seq2seq/py_docstring_processor_v2.dpkl', 'wb') as f:
    dpickle.dump(docstring_processor, f)

# Save the processed data
np.save('seq2seq/py_train_code_vecs_v2.npy', train_code)
np.save('seq2seq/py_train_docstring_vecs_v2.npy', train_docstring)

In [None]:
with open('seq2seq/py_code_processor_v2.dpkl', 'rb') as file:
    code_processor = pickle.load(file)
    
with open('seq2seq/py_docstring_processor_v2.dpkl', 'rb') as file:
    docstring_processor = pickle.load(file)

train_code = np.load('seq2seq/py_train_code_vecs_v2.npy')
docstring_code = np.load('seq2seq/py_docstring_code_vecs_v2.npy')

In [4]:
def load_text_processor(fname='title_pp.dpkl'):
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)
    num_tokens = max(pp.id2token.keys()) + 1
    return num_tokens, pp

In [5]:
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    vectorized_title = np.load(decoder_np_vecs)
    decoder_input_data = vectorized_title[:, :-1]
    decoder_target_data = vectorized_title[:, 1:]
    return decoder_input_data, decoder_target_data

In [6]:
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    vectorized_body = np.load(encoder_np_vecs)
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    return encoder_input_data, doc_length

In [9]:
encoder_input_data, encoder_seq_len = load_encoder_inputs('seq2seq/py_train_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('seq2seq/py_train_docstring_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor('seq2seq/py_code_processor_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor('seq2seq/py_docstring_processor_v2.dpkl')

Using TensorFlow backend.


In [11]:
INPUT_DIM = num_encoder_tokens
OUTPUT_DIM = num_decoder_tokens
ENC_EMB_DIM = 400
DEC_EMB_DIM = 400
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).cuda()

In [12]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(20002, 400)
    (rnn): LSTM(400, 256, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(14002, 400)
    (rnn): LSTM(400, 256, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=256, out_features=14002, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 19,600,370 trainable parameters


In [14]:
optimizer = optim.Adam(model.parameters())

In [15]:
criterion = nn.CrossEntropyLoss()

In [16]:
BATCH_SIZE = 1000

train_code_loader = torch.utils.data.DataLoader(encoder_input_data, batch_size=1000, shuffle=True, num_workers=4)
code_data_iter = iter(train_code_loader)
train_docstring_loader = torch.utils.data.DataLoader(decoder_input_data, batch_size=1000, shuffle=True, num_workers=4)
docstring_data_iter = iter(train_docstring_loader)

In [17]:
for i, code in enumerate(train_code_loader, 0):
    if i > 1:
        break
    print(code.shape)

torch.Size([1000, 55])
torch.Size([1000, 55])


In [22]:
model.train()

for epoch in range(12):

    running_loss = 0.0
    for i, code in enumerate(train_code_loader, 0):
        docstrings = decoder_input_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        code = code.type(torch.LongTensor).to(device)
        docstrings = torch.LongTensor(docstrings).to(device)
        
        code = torch.transpose(code, 0, 1)
        docstrings = torch.transpose(docstrings, 0, 1)
        
        outputs = model(code, docstrings)
        outputs = outputs.view(-1, outputs.shape[2])
        docstrings = docstrings.reshape(-1)
        loss = criterion(outputs, docstrings)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10 == 9:
            print("=", end='')
        if i % 300 == 299:    # print every 300 mini-batches
            print('[Epoch %d, minibatch %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 300))
            running_loss = 0.0

print('Finished Training')

==Finished Training


In [23]:
torch.save(model.state_dict(), 'tut1-model.pt')

In [23]:
model = Seq2Seq(enc, dec, device).cuda()

In [18]:
model.load_state_dict(torch.load('tut1-model.pt'))

<All keys matched successfully>

In [19]:
model.eval()
    
epoch_loss = 0

with torch.no_grad():

    for i, code in enumerate(train_code_loader, 0):

        docstrings = decoder_input_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]
        code = code.type(torch.LongTensor).to(device)
        code = torch.transpose(code, 0, 1)
        docstrings = torch.LongTensor(docstrings).to(device)
        docstrings = torch.transpose(docstrings, 0, 1)

        outputs = model(code, docstrings, 0) #turn off teacher forcing
        outputs = outputs.view(-1, outputs.shape[2])
        docstrings = docstrings.reshape(-1)
        loss = criterion(outputs, docstrings)
        epoch_loss += loss.item()
        print("=", end='')
        
print(epoch_loss / len(train_code_loader))

!!! torch.Size([55, 1000])
torch.Size([1000])
=!!! torch.Size([55, 1000])
torch.Size([1000])
=!!! torch.Size([55, 1000])
torch.Size([1000])
=!!! torch.Size([55, 1000])
torch.Size([1000])
=!!! torch.Size([55, 1000])
torch.Size([1000])
=!!! torch.Size([55, 1000])
torch.Size([1000])
=!!! torch.Size([55, 1000])
torch.Size([1000])
=!!! torch.Size([55, 1000])


Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/ubuntu/anaconda3/envs/pytorch_latest_p36/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: 

# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

In [35]:
def predict(model, code):
    code_tokens = enc_pp.transform([code])
    encoding = model.encoder(code_tokens)

In [26]:
code_tokens = enc_pp.transform(holdout_code)
code_tokens = torch.LongTensor(code_tokens).to(device)
code_tokens = torch.transpose(code_tokens, 0, 1)
code_tokens.shape

torch.Size([55, 177220])

In [32]:
code_tokens_sub = code_tokens[:,:1000]
hidden, cell = model.encoder(code_tokens_sub)

In [33]:
code_tokens_sub.shape

torch.Size([55, 1000])

In [34]:
max_len = 100
decoded_sentence = []
stop_condition = False
while not stop_condition:
    preds, hidden, cell = model.decoder(code_tokens_sub, hidden, cell)

    # We are going to ignore indices 0 (padding) and indices 1 (unknown)
    # Argmax will return the integer index corresponding to the
    #  prediction + 2 b/c we chopped off first two
    pred_idx = np.argmax(preds[:, :, 2:]) + 2

    # retrieve word from index prediction
    pred_word_str = self.dec_pp.id2token[pred_idx]

    if pred_word_str == '_end_' or len(decoded_sentence) >= max_len:
        stop_condition = True
        break
    decoded_sentence.append(pred_word_str)

    # update the decoder for the next word
    encoding = hidden, cell
    state_value = np.array(pred_idx).reshape(1, 1)

print(' '.join(decoded_sentence))

torch.Size([1, 55, 1000, 400])


RuntimeError: input must have 3 dimensions, got 4