In [1]:
# !pip install dill
# !pip install ktext

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
import random
import math
import time

import unicodedata
import string
import re

from pathlib import Path
import dill as dpickle

from ktext.preprocess import processor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [3]:
def read_training_files(data_path:str):
    """
    Read data from directory
    """
    PATH = Path(data_path)

    with open(PATH/'train.function', 'r') as f:
        t_enc = f.readlines()

    with open(PATH/'valid.function', 'r') as f:
        v_enc = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_enc = t_enc + v_enc

    with open(PATH/'test.function', 'r') as f:
        h_enc = f.readlines()

    with open(PATH/'train.docstring', 'r') as f:
        t_dec = f.readlines()

    with open(PATH/'valid.docstring', 'r') as f:
        v_dec = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_dec = t_dec + v_dec

    with open(PATH/'test.docstring', 'r') as f:
        h_dec = f.readlines()

    return tv_enc, h_enc, tv_dec, h_dec

In [4]:
train_code, holdout_code, train_docstring, holdout_docstring = read_training_files('processed_data/')

In [5]:
train_code[0]

'def batch_generator batch_size data labels None n_batches int np ceil len data float batch_size idx np random permutation len data data_shuffled data idx if labels is not None labels_shuffled labels idx for i in range n_batches start i batch_size end start batch_size if labels is not None yield data_shuffled start end labels_shuffled start end else yield data_shuffled start end\n'

In [6]:
train_docstring[0]

'"generates batches of samples : param data : array - like , shape = ( n_samples , n_features ) : param labels : array - like , shape = ( n_samples , ) : return :"\n'

In [19]:
class Encoder(nn.Module):
    def __init__(self, code_vocab_size, emb_dim, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(code_vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, dropout=0.5).to(device)

    def forward(self, input):
        embedded = self.embedding(input)
        _, hidden_state = self.gru(embedded)
        return hidden_state

In [65]:
class Decoder(nn.Module):
    def __init__(self, docstring_vocab_size, emb_dim, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(docstring_vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, dropout=0.5).to(device)
        self.linear = nn.Linear(hidden_size, docstring_vocab_size)

    def forward(self, input, initial_state):
        embedded = self.embedding(input)
        output, _ = self.gru(embedded, initial_state)
        output = self.linear(output)
        return F.softmax(output)

In [45]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, code, docstring):
        dec_initial_state = self.encoder(code)
        return self.decoder(docstring, dec_initial_state)

In [14]:
def load_text_processor(fname='title_pp.dpkl'):
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)
    num_tokens = max(pp.id2token.keys()) + 1
    return num_tokens, pp

In [15]:
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    vectorized_title = np.load(decoder_np_vecs)
    decoder_input_data = vectorized_title[:, :-1]
    decoder_target_data = vectorized_title[:, 1:]
    return decoder_input_data, decoder_target_data

In [16]:
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    vectorized_body = np.load(encoder_np_vecs)
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    return encoder_input_data, doc_length

In [17]:
encoder_input_data, encoder_seq_len = load_encoder_inputs('seq2seq/py_train_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('seq2seq/py_train_docstring_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor('seq2seq/py_code_processor_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor('seq2seq/py_docstring_processor_v2.dpkl')

In [56]:
model = Seq2Seq(Encoder(num_encoder_tokens, emb_dim=400, hidden_size=256),
                Decoder(num_decoder_tokens, emb_dim=400, hidden_size=256)).to(device)

In [47]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(20002, 400)
    (gru): GRU(400, 256, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(14002, 400)
    (gru): GRU(400, 256, dropout=0.5)
    (linear): Linear(in_features=256, out_features=14002, bias=True)
  )
)

In [48]:
encoder_input_data = torch.LongTensor(encoder_input_data).to(device)

In [49]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 18,210,802 trainable parameters


In [50]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [51]:
BATCH_SIZE = 1000

train_code_loader = torch.utils.data.DataLoader(encoder_input_data, batch_size=1000, shuffle=True)
code_data_iter = iter(train_code_loader)
train_docstring_loader = torch.utils.data.DataLoader(decoder_input_data, batch_size=1000, shuffle=True)
docstring_data_iter = iter(train_docstring_loader)

In [52]:
for i, code in enumerate(train_code_loader, 0):
    if i > 1:
        break
    docstrings = decoder_input_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]
    code = code.type(torch.LongTensor).to(device)
    docstrings = torch.LongTensor(docstrings).to(device)
    print(docstrings.shape)

torch.Size([1000, 14])
torch.Size([1000, 14])


In [111]:
model.train()

for epoch in range(12):

    running_loss = 0.0
    for i, code in enumerate(train_code_loader, 0):
        docstrings = decoder_input_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        code = code.type(torch.LongTensor).to(device)
        docstrings = torch.LongTensor(docstrings).to(device)
        
        code = torch.transpose(code, 0, 1)
        docstrings = torch.transpose(docstrings, 0, 1)
        outputs = model(code, docstrings)
        outputs = outputs.view(-1, outputs.shape[2])
        docstrings = docstrings.reshape(-1)
        loss = criterion(outputs, docstrings)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10 == 9:
            print("=", end='')
        if i % 300 == 299:    # print every 300 mini-batches
            print('[Epoch %d, minibatch %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 300))
            running_loss = 0.0

print('Finished Training')

  if sys.path[0] == '':


torch.Size([14000, 14002])
torch.Size([14000])


KeyboardInterrupt: 