In [1]:
import os
import json
import re
import pandas as pd
import numpy as np
from collections import namedtuple
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch
import torch.nn.functional as F
from torch import FloatTensor, LongTensor
import math
from tqdm import tqdm
#from google.colab import drive

#drive.mount('/content/gdrive')
#%cd gdrive/'My Drive'/ml/code2vec/

In [2]:
X = np.load('X(code2seq).npy')
y = np.load('y(code2seq).npy')
X_ = np.load('X(code2seq)2.npy')
y_ = np.load('y(code2seq)2.npy')
X = np.concatenate((X, X_), axis = 0)
y = np.concatenate((y, y_), axis = 0)

In [3]:
for n in range(X.shape[0]):
    for i in range(X.shape[1]):
        if np.sum(X[n][i]) == X.shape[2]:
            y[n][i] = 2

In [None]:
def baseline(labels):
    count = 0
    for label in labels:
        if label[0] == 1:
            count += 1
    return count / len(labels)

In [None]:
baseline(labels)

0.3333333333333333

In [4]:
train_x, train_y = X[:int(0.8*X.shape[0])], y[:int(0.8*y.shape[0])]
test_x, test_y = X[int(0.8*X.shape[0]):], y[int(0.8*y.shape[0]):]
#lstm_model.fit(train_x, train_y, batch_size=1, epochs=20)

In [5]:
def iterate_batches(data, batch_size):
    X, y = data
    n_samples = len(X)

    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        
        batch_indices = indices[start:end]
        X_batch = X[batch_indices]
        y_batch = y[batch_indices]
            
        yield X_batch, y_batch

In [6]:
X_batch, y_batch = next(iterate_batches((train_x, train_y), 4))
X_batch.shape, y_batch.shape

((4, 80, 320), (4, 80))

In [7]:
class LSTMTagger(nn.Module):
    def __init__(self, word_emb_dim=384, lstm_hidden_dim=32, lstm_layers_count=1):
        super().__init__()
        self.word_emb_dim = word_emb_dim

        self.word_emb = nn.Embedding(2, word_emb_dim)
        self.lstm = nn.LSTM(word_emb_dim, lstm_hidden_dim,
                            num_layers=lstm_layers_count, bidirectional=True)
        self.tagger = nn.Linear(2 *  lstm_hidden_dim, 2)

    def forward(self, inputs):
        emb = self.word_emb(torch.LongTensor([[0]]))
        inputs[np.where(~inputs.detach().numpy().any(axis=2))] = emb
        emb = self.word_emb(torch.LongTensor([[1]]))
        inputs[np.where(np.sum(inputs.detach().numpy(), axis=2)==self.word_emb_dim)] = emb

        res, _ = self.lstm(inputs)
        tag = self.tagger(res)
        return F.softmax(tag, 1)
    
    

In [8]:
model = LSTMTagger(
    word_emb_dim = 320,
    lstm_hidden_dim=40
)

X_batch, y_batch = torch.FloatTensor(train_x[:4]), torch.LongTensor(train_y[:4])
print(X_batch.shape)
logits = model(X_batch)
mask = (y_batch == 1)
float(torch.sum((torch.argmax(logits, 2) == y_batch) * mask)) / float(torch.sum(mask))

torch.Size([4, 80, 320])


0.0

In [9]:
def accuracy(y_true, y_pred):
    c = 0
    for n in range(y_pred.shape[0]):
        pred = []
        for i in range(y_pred.shape[1]):
            if y_pred[n][i][0] > y_pred[n][i][1]:
                pred.append(0)
            else:
                pred.append(y_pred[n][i][1])
        if (np.argmax(pred) == np.argmax(y_true[n])):
            c += 1
    return c/y_true.shape[0]
accuracy(y_batch, logits)

0.0

In [10]:
criterion = nn.CrossEntropyLoss(ignore_index=2)
criterion(logits.transpose(2, 1), y_batch)

tensor(0.6932, grad_fn=<NllLoss2DBackward>)

In [11]:
def do_epoch(model, criterion, data, batch_size, optimizer=None, name=None):
    epoch_loss = 0
    correct_count = 0
    sum_count = 0
    
    is_train = not optimizer is None
    name = name or ''
    model.train(is_train)
    
    batches_count = math.ceil(len(data[0]) / batch_size)
    
    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batches_count) as progress_bar:
            for i, (X_batch, y_batch) in enumerate(iterate_batches(data, batch_size)):
                X_batch, y_batch = FloatTensor(X_batch), LongTensor(y_batch)
                logits = model(X_batch)

                loss = criterion(logits.transpose(2, 1), y_batch)

                epoch_loss += loss.item()

                if optimizer:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                correct_count += accuracy(y_batch, logits) * batch_size
                sum_count += batch_size

                progress_bar.update()
                progress_bar.set_description('{:>5s} Loss = {:.5f}, Accuracy = {:.2%}'.format(
                    name, loss.item(), correct_count / sum_count)
                )
                
            progress_bar.set_description('{:>5s} Loss = {:.5f}, Accuracy = {:.2%}'.format(
                name, epoch_loss / batches_count, correct_count / sum_count)
            )

    return epoch_loss / batches_count, correct_count / sum_count


def fit(model, criterion, optimizer, train_data, epochs_count=1, batch_size=32,
        val_data=None, val_batch_size=None):
        
    if not val_data is None and val_batch_size is None:
        val_batch_size = batch_size
    all_train_acc = []
    all_val_acc = []
    for epoch in range(epochs_count):
        name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
        train_loss, train_acc = do_epoch(model, criterion, train_data, batch_size, optimizer, name_prefix + 'Train:')
        
        if not val_data is None:
            val_loss, val_acc = do_epoch(model, criterion, val_data, val_batch_size, None, name_prefix + '  Val:')
        all_train_acc.append(train_acc)
        all_val_acc.append(val_acc)
    return np.array(all_train_acc).max(), np.array(all_val_acc).min()

In [13]:
model = LSTMTagger(
    word_emb_dim = 320,
    lstm_hidden_dim=40
)
criterion = nn.CrossEntropyLoss(ignore_index=2).cuda()
lrs = [1e-1, 1e-2, 1e-3, 1e-4]
epochs = [10, 15]
RunResult = namedtuple("RunResult", ['model', 'train_history', 'val_history'])
Parameters = namedtuple("Parameters", ['lr', 'epoch', 'optim', 'anneal_coef', 'anneal_epoch'])
anneal_coeff = 0.1
anneal_epochs = [5, 10]
optims = optim.Adam
run_record = []
train_parameters = []
for anneal_epoch in anneal_epochs:
    for lr in lrs:
        for epoch in epochs:
            train_parameters.append(Parameters(lr, epoch, optims, anneal_coeff, anneal_epoch))
            loss = nn.CrossEntropyLoss().cuda()
            #model.to('cuda')
            optimizer = optims(model.parameters(), lr=lr)
            scheduler = StepLR(optimizer, step_size = anneal_epoch, gamma = anneal_coeff)
            train_acc, val_acc = fit(model, criterion, optimizer, train_data=(train_x, train_y), epochs_count=epoch,
                batch_size=4, val_data=(test_x, test_y), val_batch_size=4)
            run_record.append(RunResult(model, train_acc, val_acc))
        

[1 / 10] Train: Loss = 0.69031, Accuracy = 32.68%: 100%|██████████| 824/824 [00:28<00:00, 28.92it/s]
[1 / 10]   Val: Loss = 0.69057, Accuracy = 30.70%: 100%|██████████| 206/206 [00:03<00:00, 62.74it/s]
[2 / 10] Train: Loss = 0.69026, Accuracy = 34.01%: 100%|██████████| 824/824 [00:31<00:00, 25.84it/s]
[2 / 10]   Val: Loss = 0.69073, Accuracy = 29.13%: 100%|██████████| 206/206 [00:03<00:00, 62.70it/s]
[3 / 10] Train: Loss = 0.69047, Accuracy = 31.52%: 100%|██████████| 824/824 [00:33<00:00, 24.47it/s]
[3 / 10]   Val: Loss = 0.69069, Accuracy = 29.85%: 100%|██████████| 206/206 [00:04<00:00, 47.36it/s]
[4 / 10] Train: Loss = 0.69034, Accuracy = 33.89%: 100%|██████████| 824/824 [00:37<00:00, 22.16it/s]
[4 / 10]   Val: Loss = 0.69051, Accuracy = 32.04%: 100%|██████████| 206/206 [00:04<00:00, 49.54it/s]
[5 / 10] Train: Loss = 0.69025, Accuracy = 36.07%: 100%|██████████| 824/824 [01:06<00:00, 12.45it/s]
[5 / 10]   Val: Loss = 0.69089, Accuracy = 34.59%: 100%|██████████| 206/206 [00:03<00:00, 5

[6 / 15] Train: Loss = 0.68964, Accuracy = 39.99%: 100%|██████████| 824/824 [00:39<00:00, 20.98it/s]
[6 / 15]   Val: Loss = 0.68986, Accuracy = 37.74%: 100%|██████████| 206/206 [00:04<00:00, 49.21it/s]
[7 / 15] Train: Loss = 0.68593, Accuracy = 39.84%:  41%|████      | 337/824 [00:14<00:20, 23.87it/s]


KeyboardInterrupt: 

In [23]:
import json

results = []
for record in run_record:
    results.append({'train_acc':record.train_history, 'val_acc':record.val_history})

In [24]:
f = open('results.txt', 'w')
json.dump(results, f, indent=4)
f.close()
