In [20]:
!pip install dill
!pip install ktext

Collecting ktext
  Using cached ktext-0.40-py3-none-any.whl (9.1 kB)
Collecting keras==2.2.4
  Using cached Keras-2.2.4-py2.py3-none-any.whl (312 kB)
Processing ./.cache/pip/wheels/8b/d9/18/1ffaf26dc8375974075b3f58c683054babdf0a0d59d90557b2/pathos-0.2.6-py3-none-any.whl
Collecting tensorflow
  Using cached tensorflow-2.3.1-cp36-cp36m-manylinux2010_x86_64.whl (320.4 MB)
Collecting textacy==0.6.2
  Using cached textacy-0.6.2-py2.py3-none-any.whl (142 kB)
Collecting pyarrow
  Using cached pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7 MB)
Collecting msgpack-numpy
  Using cached msgpack_numpy-0.4.7.1-py2.py3-none-any.whl (6.7 kB)
Collecting keras-preprocessing>=1.0.5
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting keras-applications>=1.0.6
  Using cached Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
Processing ./.cache/pip/wheels/fc/13/89/f55a8959ba9e54898fc2f6436e8f8326a5a8fb40f1c71c1d62/ppft-1.6.6.2-py3-none-any.whl
Processing ./.cache/pip/wh

Collecting requests-oauthlib>=0.7.0
  Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting pyasn1-modules>=0.2.1
  Using cached pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
Collecting rsa<5,>=3.1.4; python_version >= "3.5"
  Using cached rsa-4.6-py3-none-any.whl (47 kB)
Collecting oauthlib>=3.0.0
  Using cached oauthlib-3.1.0-py2.py3-none-any.whl (147 kB)
Collecting pyasn1<0.5.0,>=0.4.6
  Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)
Installing collected packages: keras-preprocessing, keras-applications, keras, ppft, multiprocess, pox, pathos, tensorflow-estimator, astunparse, grpcio, opt-einsum, absl-py, gast, google-pasta, termcolor, oauthlib, requests-oauthlib, pyasn1, pyasn1-modules, cachetools, rsa, google-auth, google-auth-oauthlib, markdown, tensorboard-plugin-wit, tensorboard, tensorflow, pyemd, ftfy, ijson, python-levenshtein, pyphen, unidecode, textacy, pyarrow, msgpack-numpy, ktext
Successfully installed absl-py-0.11.0 astunparse-1.6.3 

In [21]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
import random
import math
import time

import unicodedata
import string
import re

from pathlib import Path
import dill as dpickle

from ktext.preprocess import processor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

Using TensorFlow backend.


cuda


In [2]:
def read_training_files(data_path:str):
    """
    Read data from directory
    """
    PATH = Path(data_path)

    with open(PATH/'train.function', 'r') as f:
        t_enc = f.readlines()

    with open(PATH/'valid.function', 'r') as f:
        v_enc = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_enc = t_enc + v_enc

    with open(PATH/'test.function', 'r') as f:
        h_enc = f.readlines()

    with open(PATH/'train.docstring', 'r') as f:
        t_dec = f.readlines()

    with open(PATH/'valid.docstring', 'r') as f:
        v_dec = f.readlines()

    # combine train and validation and let keras split it randomly for you
    tv_dec = t_dec + v_dec

    with open(PATH/'test.docstring', 'r') as f:
        h_dec = f.readlines()

    return tv_enc, h_enc, tv_dec, h_dec

In [3]:
train_code, holdout_code, train_docstring, holdout_docstring = read_training_files('processed_data/')

In [5]:
train_code[0]

'def batch_generator batch_size data labels None n_batches int np ceil len data float batch_size idx np random permutation len data data_shuffled data idx if labels is not None labels_shuffled labels idx for i in range n_batches start i batch_size end start batch_size if labels is not None yield data_shuffled start end labels_shuffled start end else yield data_shuffled start end\n'

In [6]:
train_docstring[0]

'"generates batches of samples : param data : array - like , shape = ( n_samples , n_features ) : param labels : array - like , shape = ( n_samples , ) : return :"\n'

In [64]:
class Encoder(nn.Module):
    def __init__(self, code_vocab_size, emb_dim, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(code_vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, dropout=0.5).cuda()

    def forward(self, input):
        embedded = self.embedding(input)
        _, hidden_state = self.gru(embedded)
        return hidden_state

In [69]:
class Decoder(nn.Module):
    def __init__(self, docstring_vocab_size, emb_dim, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(docstring_vocab_size, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_size, dropout=0.5).cuda()
        self.linear = nn.Linear(hidden_size, docstring_vocab_size)

    def forward(self, input, initial_state):
        embedded = self.embedding(input)
        output, _ = self.gru(embedded, initial_state)
        return F.softmax(output)

In [66]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input):
        dec_initial_state = self.encoder(input)
        return self.decoder(input, dec_initial_state)

In [15]:
def load_text_processor(fname='title_pp.dpkl'):
    """
    Load preprocessors from disk.
    Parameters
    ----------
    fname: str
        file name of ktext.proccessor object
    Returns
    -------
    num_tokens : int
        size of vocabulary loaded into ktext.processor
    pp : ktext.processor
        the processor you are trying to load
    Typical Usage:
    -------------
    num_decoder_tokens, title_pp = load_text_processor(fname='title_pp.dpkl')
    num_encoder_tokens, body_pp = load_text_processor(fname='body_pp.dpkl')
    """
    # Load files from disk
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)

    num_tokens = max(pp.id2token.keys()) + 1
    print(f'Size of vocabulary for {fname}: {num_tokens:,}')
    return num_tokens, pp

In [16]:
def load_decoder_inputs(decoder_np_vecs='train_title_vecs.npy'):
    """
    Load decoder inputs.
    Parameters
    ----------
    decoder_np_vecs : str
        filename of serialized numpy.array of decoder input (issue title)
    Returns
    -------
    decoder_input_data : numpy.array
        The data fed to the decoder as input during training for teacher forcing.
        This is the same as `decoder_np_vecs` except the last position.
    decoder_target_data : numpy.array
        The data that the decoder data is trained to generate (issue title).
        Calculated by sliding `decoder_np_vecs` one position forward.
    """
    vectorized_title = np.load(decoder_np_vecs)
    # For Decoder Input, you don't need the last word as that is only for prediction
    # when we are training using Teacher Forcing.
    decoder_input_data = vectorized_title[:, :-1]

    # Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
    decoder_target_data = vectorized_title[:, 1:]

    print(f'Shape of decoder input: {decoder_input_data.shape}')
    print(f'Shape of decoder target: {decoder_target_data.shape}')
    return decoder_input_data, decoder_target_data

In [17]:
def load_encoder_inputs(encoder_np_vecs='train_body_vecs.npy'):
    """
    Load variables & data that are inputs to encoder.
    Parameters
    ----------
    encoder_np_vecs : str
        filename of serialized numpy.array of encoder input (issue title)
    Returns
    -------
    encoder_input_data : numpy.array
        The issue body
    doc_length : int
        The standard document length of the input for the encoder after padding
        the shape of this array will be (num_examples, doc_length)
    """
    vectorized_body = np.load(encoder_np_vecs)
    # Encoder input is simply the body of the issue text
    encoder_input_data = vectorized_body
    doc_length = encoder_input_data.shape[1]
    print(f'Shape of encoder input: {encoder_input_data.shape}')
    return encoder_input_data, doc_length

In [22]:
encoder_input_data, encoder_seq_len = load_encoder_inputs('seq2seq/py_train_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('seq2seq/py_train_docstring_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor('seq2seq/py_code_processor_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor('seq2seq/py_docstring_processor_v2.dpkl')

Shape of encoder input: (1227989, 55)
Shape of decoder input: (1227989, 14)
Shape of decoder target: (1227989, 14)
Size of vocabulary for seq2seq/py_code_processor_v2.dpkl: 20,002
Size of vocabulary for seq2seq/py_docstring_processor_v2.dpkl: 14,002


In [70]:
model = Seq2Seq(Encoder(num_encoder_tokens, emb_dim=400, hidden_size=256),
                Decoder(num_decoder_tokens, emb_dim=400, hidden_size=256)).to(device)

RuntimeError: CUDA error: device-side assert triggered

In [36]:
encoder_input_data = torch.LongTensor(encoder_input_data).to(device)

In [54]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 18,210,802 trainable parameters


In [55]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [58]:
BATCH_SIZE = 1000

train_code_loader = torch.utils.data.DataLoader(encoder_input_data, batch_size=1000, shuffle=True)
code_data_iter = iter(train_code_loader)
train_docstring_loader = torch.utils.data.DataLoader(decoder_input_data, batch_size=1000, shuffle=True)
docstring_data_iter = iter(train_docstring_loader)

In [68]:
model.train()

for epoch in range(12):

    running_loss = 0.0
    for i, code in enumerate(train_code_loader, 0):
        docstrings = decoder_input_data[i * BATCH_SIZE : (i+1) * BATCH_SIZE]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        code = code.type(torch.LongTensor).to(device)
        docstrings = torch.LongTensor(docstrings).to(device)
        
        code = torch.transpose(code, 0, 1)
        docstrings = torch.transpose(docstrings, 0, 1)
        
        outputs = model(code)
        outputs = outputs.view(-1, outputs.shape[2])
        docstrings = docstrings.reshape(-1)
        loss = criterion(outputs, docstrings)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10 == 9:
            print("=", end='')
        if i % 300 == 299:    # print every 300 mini-batches
            print('[Epoch %d, minibatch %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 300))
            running_loss = 0.0

print('Finished Training')

AttributeError: 'tuple' object has no attribute 'cuda'