In [1]:
# !pip install dill
# !pip install ktext

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
import random
import math
import time

import unicodedata
import string
import re

from pathlib import Path
import dill as dpickle

from ktext.preprocess import processor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

Using TensorFlow backend.


cuda


In [3]:
def read_training_files(data_path:str):
    """
    Read data from directory
    """
    PATH = Path(data_path)

    with open(PATH/'train.function', 'r') as f:
        t_enc = f.readlines()

    with open(PATH/'valid.function', 'r') as f:
        v_enc = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_enc = t_enc + v_enc

    with open(PATH/'test.function', 'r') as f:
        h_enc = f.readlines()

    with open(PATH/'train.docstring', 'r') as f:
        t_dec = f.readlines()

    with open(PATH/'valid.docstring', 'r') as f:
        v_dec = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_dec = t_dec + v_dec

    with open(PATH/'test.docstring', 'r') as f:
        h_dec = f.readlines()

    return tv_enc, h_enc, tv_dec, h_dec

In [4]:
train_code, holdout_code, train_docstring, holdout_docstring = read_training_files('processed_data/')

In [5]:
train_code[0]

'def batch_generator batch_size data labels None n_batches int np ceil len data float batch_size idx np random permutation len data data_shuffled data idx if labels is not None labels_shuffled labels idx for i in range n_batches start i batch_size end start batch_size if labels is not None yield data_shuffled start end labels_shuffled start end else yield data_shuffled start end\n'

In [6]:
train_docstring[0]

'"generates batches of samples : param data : array - like , shape = ( n_samples , n_features ) : param labels : array - like , shape = ( n_samples , ) : return :"\n'

In [7]:
class Encoder(nn.Module):
    def __init__(self, code_vocab_size, emb_dim, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(code_vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, dropout=0.5, num_layers=2).to(device)

    def forward(self, input):
        embedded = self.embedding(input)
        _, hidden_state = self.gru(embedded)
        return hidden_state

In [8]:
class Decoder(nn.Module):
    def __init__(self, docstring_vocab_size, emb_dim, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(docstring_vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, dropout=0.5, num_layers=2).to(device)
        self.linear = nn.Linear(hidden_size, docstring_vocab_size)

    def forward(self, input, initial_state):
        embedded = self.embedding(input)
        output, _ = self.gru(embedded, initial_state)
        output = self.linear(output)
        return F.softmax(output)

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, code, docstring):
        dec_initial_state = self.encoder(code)
        return self.decoder(docstring, dec_initial_state)

In [10]:
def load_text_processor(fname='title_pp.dpkl'):
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)
    num_tokens = max(pp.id2token.keys()) + 1
    return num_tokens, pp

In [11]:
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    vectorized_title = np.load(decoder_np_vecs)
    decoder_input_data = vectorized_title[:, :-1]
    decoder_target_data = vectorized_title[:, 1:]
    return decoder_input_data, decoder_target_data

In [12]:
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    vectorized_body = np.load(encoder_np_vecs)
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    return encoder_input_data, doc_length

In [13]:
encoder_input_data, encoder_seq_len = load_encoder_inputs('seq2seq/py_train_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('seq2seq/py_train_docstring_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor('seq2seq/py_code_processor_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor('seq2seq/py_docstring_processor_v2.dpkl')

In [14]:
model = Seq2Seq(Encoder(num_encoder_tokens, emb_dim=800, hidden_size=1024),
                Decoder(num_decoder_tokens, emb_dim=800, hidden_size=1024)).to(device)

In [15]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(20002, 800)
    (gru): GRU(800, 1024, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(14002, 800)
    (gru): GRU(800, 1024, num_layers=2, dropout=0.5)
    (linear): Linear(in_features=1024, out_features=14002, bias=True)
  )
)

In [16]:
encoder_input_data = torch.LongTensor(encoder_input_data).to(device)

In [17]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 65,369,394 trainable parameters


In [18]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [19]:
BATCH_SIZE = 1000

train_code_loader = torch.utils.data.DataLoader(encoder_input_data, batch_size=1000, shuffle=True)
code_data_iter = iter(train_code_loader)
train_docstring_loader = torch.utils.data.DataLoader(decoder_input_data, batch_size=1000, shuffle=True)
docstring_data_iter = iter(train_docstring_loader)

In [20]:
for i, code in enumerate(train_code_loader, 0):
    if i > 1:
        break
    docstrings = decoder_input_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]
    code = code.type(torch.LongTensor).to(device)
    docstrings = torch.LongTensor(docstrings).to(device)
    print(docstrings.shape)

torch.Size([1000, 14])
torch.Size([1000, 14])


In [22]:
#model.train()

for epoch in range(18):

    running_loss = 0.0
    for i, code in enumerate(train_code_loader, 0):
        dec_input = decoder_input_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]
        dec_target = decoder_target_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]

        optimizer.zero_grad()

        code = code.type(torch.LongTensor).to(device)
        dec_input = torch.LongTensor(dec_input).to(device)
        dec_target = torch.LongTensor(dec_target).to(device)
        
        code = torch.transpose(code, 0, 1)
        dec_input = torch.transpose(dec_input, 0, 1)
        dec_target = torch.transpose(dec_target, 0, 1)
        outputs = model(code, dec_input)
        outputs = outputs.view(-1, outputs.shape[2])
        dec_target = dec_target.reshape(-1)
        loss = criterion(outputs, dec_target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 10 == 9:
            print("=", end='')
        if i % 300 == 299:    # print every 300 mini-batches
            print('[Epoch %d, minibatch %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 300))
            running_loss = 0.0

print('Finished Training')

  if sys.path[0] == '':


==Finished Training


In [24]:
torch.save(model.state_dict(), 'tut2-model.pt')

In [None]:
model.load_state_dict(torch.load('tut2-model.pt'))

In [25]:
sub_code = holdout_code[:20]
sub_docstring = holdout_docstring[:20]

In [26]:
sub_code = enc_pp.transform(sub_code)
sub_docstring = dec_pp.transform(sub_docstring)

In [27]:
sub_code = torch.transpose(torch.LongTensor(sub_code).to(device), 0, 1)
sub_docstring = torch.transpose(torch.LongTensor(sub_docstring).to(device), 0, 1)

In [48]:
outputs = model(sub_code, sub_docstring)

  if sys.path[0] == '':


In [49]:
outputs = np.argmax(outputs.to('cpu').detach().numpy(), axis=2).T
outputs

array([[   13,     4,  4557,    16,    20,   167,     3,     0,     0,
            0,     0,     0,     0,     0,     0],
       [  760,    11,     4,  6280,   159,    17,   497,   467,   358,
           94,     3,     0,     0,     0,     0],
       [   31,     4,     1,   416,   340,     7,     5,   161,     3,
            0,     0,     0,     0,     0,     0],
       [   13,     5,  1859,  3963,  1317,    66,  2583,   948,    28,
            3,    14,     0,     0,     0,     0],
       [  293,    47,   233,    42,   721,     4,   561,   167,    88,
            3,     0,     0,     0,     0,     0],
       [   13,     4,  4557,    16,    20,   167,     3,     0,     0,
            0,     0,     0,     0,     0,     0],
       [  760,  1109,    65,    11,    34,   609,   111,     3,     0,
            0,     0,     0,     0,     0,     0],
       [  293,     4,   233,   308,     6,    20,  1044,     3,   841,
            0,     0,     0,     0,     0,     0],
       [   22,   129,   

In [65]:
comments = []
for i in range(outputs.shape[0]):
    curr_sentence = []
    for j in range(outputs.shape[1]):
        if outputs[i][j] > 1:
            curr_sentence.append(dec_pp.id2token[outputs[i][j]])
    comments.append(curr_sentence)

In [69]:
for comment in comments:
    print(' '.join(comment))

return the identities from an item _end_
todo in the rich event with our selected types d _end_
get the events corresponding to a module _end_
return a sorting hat identity using pip auto data _end_ this
calculate user fields are inside the global item dict _end_
return the identities from an item _end_
todo null values in all django parameters _end_
calculate the fields property of an issue _end_ outputs
test whether json items are properly found into es _end_
test whether the raw index is correctly _end_
test instances with _end_ passe
test instances with objects _end_ vl
test refresh identities passe testing
test refresh project field for all sources _end_
test the extraction of params from an url _end_
get elasticsearch mapping _end_ testing easy l activate
file with nodes renaming mapping _end_ the
return the identities from an item _end_
calculate a jenkins job name converting logs dictionary _end_ upgrade
get elasticsearch mapping _end_ arch l l
