In [1]:
import pandas as pd
import torch
import time
import numpy as np
import warnings
from gensim.models.word2vec import Word2Vec
from model import BatchProgramCC
from torch.autograd import Variable
from sklearn.metrics import precision_recall_fscore_support
warnings.filterwarnings('ignore')
import sys
import argparse
import matplotlib.pyplot as plt
from my_utilities import get_batch, eval_model_baseline



In [2]:
# Selects the direction of the train/test split performed further down:
# "trainBCBtestSCB" trains on rows whose functionality_id is set and tests on
# rows where it is missing; the other value swaps the two roles.
mode = "trainBCBtestSCB" # alternative: "trainSCBtestBCB"

In [4]:
# NOTE(review): margin is not read by any of the visible cells -- confirm
# whether it is still needed (e.g. by a later cell or a helper).
margin = 50

In [5]:
# Path components: dataset and embedding files are located by concatenating
# root + lang + a fixed suffix, so root must end with a path separator.
root = 'data/'
lang = 'java'

In [6]:
# Load the pickled block pairs and shuffle the rows. The shuffle is seeded so
# that row order (and therefore batch order during training) is the same on
# every Restart & Run All.
# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only
# load files from a trusted source.
SHUFFLE_SEED = 42
data_scb = pd.read_pickle(root+lang+'/scb/blocks.pickle').sample(frac=1, random_state=SHUFFLE_SEED)

In [7]:
# Invert the label column (every value v becomes 1 - v).
# NOTE(review): the column is overwritten in place, so running this cell a
# second time flips the labels back -- run it exactly once per kernel.
data_scb['label'] = data_scb['label'].rsub(1)

In [8]:
# Load the pretrained node vectors and copy them into a float32 matrix that
# carries one extra all-zero row at index MAX_TOKENS.
# (presumably that last row serves unknown/out-of-vocabulary tokens -- confirm
# against the model code)
word2vec = Word2Vec.load(root+lang+"/scb/embedding/node_w2v_128").wv
pretrained_vectors = word2vec.vectors
MAX_TOKENS, EMBEDDING_DIM = pretrained_vectors.shape
embeddings = np.zeros((MAX_TOKENS + 1, EMBEDDING_DIM), dtype="float32")
embeddings[:MAX_TOKENS] = pretrained_vectors

In [9]:
# Hyperparameters handed to BatchProgramCC and to the batching/evaluation helpers.
HIDDEN_DIM = 100   # presumably the recurrent hidden size -- confirm in model.BatchProgramCC
ENCODE_DIM = 128   # presumably the per-statement encoding size -- confirm in model.BatchProgramCC
LABELS = 1         # passed to the model constructor; training uses BCELoss on the model output
EPOCHS = 5         # intended number of training epochs
BATCH_SIZE = 32    # rows per batch for get_batch and eval_model_baseline
USE_GPU = torch.cuda.is_available()  # True when a CUDA device is usable

In [10]:
# Class-balance check: number of rows for each value of the label column.
data_scb.groupby('label').size()

label
0    19523
1    19533
dtype: int64

In [11]:
# Instantiate BatchProgramCC from the embedding matrix and the hyperparameters
# defined in the cells above; arguments are positional, in constructor order.
model = BatchProgramCC(
    EMBEDDING_DIM,
    HIDDEN_DIM,
    MAX_TOKENS+1,
    ENCODE_DIM,
    LABELS,
    BATCH_SIZE,
    USE_GPU,
    embeddings,
)
if USE_GPU:
    model.cuda()

# Binary cross-entropy on the model output, optimised with Adam.
# Alternative kept for reference: torch.optim.Adamax(parameters, lr=0.0001)
loss_function = torch.nn.BCELoss()
parameters = model.parameters()
optimizer = torch.optim.Adam(parameters, lr=1e-3)

In [13]:
# Split the rows into train/test by whether functionality_id is present.
#   "trainBCBtestSCB": train on rows with a functionality_id, test on rows without.
#   "trainSCBtestBCB": the reverse.
# Any other value is rejected instead of silently falling into the second
# case, so a typo in `mode` cannot swap the splits unnoticed.
missing_functionality_id = data_scb['functionality_id'].isna()
if mode == "trainBCBtestSCB":
    train_mask = ~missing_functionality_id
elif mode == "trainSCBtestBCB":
    train_mask = missing_functionality_id
else:
    raise ValueError(
        "Unknown mode %r; expected 'trainBCBtestSCB' or 'trainSCBtestBCB'" % (mode,)
    )

data_train = data_scb[train_mask]
data_test = data_scb[~train_mask]

In [14]:
# Sanity check: number of rows in the training split.
len(data_train)

37062

In [15]:
# Sanity check: number of rows in the test split.
len(data_test)

1994

In [16]:
# Score of the most recently completed epoch; the training loop stops early
# when a new epoch scores lower. Starting at 0 means a non-negative first
# score can never trigger the stop.
prev_epoch_f1 = 0

In [None]:
# Train for at most EPOCHS epochs. After each epoch the model is evaluated and
# training stops as soon as the score drops below the previous epoch's score.
# NOTE(review): the early-stopping decision is made on data_test, so the
# reported test score is also the model-selection criterion -- consider a
# separate validation split.
for epoch in range(EPOCHS):
    # Per-batch training losses of the current epoch (reset every epoch).
    loss_arr_train = []

    # The evaluation helper may leave the model in eval mode, so switch back
    # to training mode once at the start of every epoch.
    model.train()

    for i in range(0, len(data_train), BATCH_SIZE):
        train1_inputs, train2_inputs, train_labels = get_batch(data_train, i, BATCH_SIZE)
        if USE_GPU:
            # Only the labels are moved here; the two inputs are handed to the
            # model exactly as get_batch returned them.
            train_labels = train_labels.cuda()

        model.zero_grad()
        # The final batch can be shorter than BATCH_SIZE, so the model's batch
        # size and hidden state are re-initialised for every batch.
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()

        output = model(train1_inputs, train2_inputs)

        loss = loss_function(output, Variable(train_labels))
        loss.backward()
        optimizer.step()
        loss_arr_train.append(loss.item())

    # First return value is the score compared for early stopping (treated as F1).
    f, similarity_scores = eval_model_baseline(model, data_test, BATCH_SIZE, USE_GPU)

    if f < prev_epoch_f1:
        print("Lower F1 than previous epoch. Early stopping...")
        sys.stdout.flush()
        break
    else:
        prev_epoch_f1 = f