In [21]:
import javalang
import json
import os

import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [22]:
# import pytorch modules
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.optimizer import Optimizer

In [23]:
# Seed every global RNG this notebook has imported so results are reproducible.
RANDOM_SEED = 42
# torch.manual_seed does not touch NumPy's global generator, so seed it explicitly.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    # Also seed the generators of all CUDA devices.
    torch.cuda.manual_seed_all(RANDOM_SEED)

In [30]:
def split_data(data, ratio, seed=2023):
    """Shuffle ``data`` and cut it into train / dev / test partitions.

    Args:
        data: pandas DataFrame to split. It is not modified; a shuffled
            copy is sliced instead.
        ratio: Colon-separated integer weights such as ``'8:1:1'``
            (train:dev:test). Only the first two weights set cut points;
            the last partition receives every remaining row.
        seed: ``random_state`` handed to ``DataFrame.sample`` for the shuffle.

    Returns:
        Tuple ``(train_pairs, dev_pairs, test_pairs)`` of consecutive row
        slices of the shuffled frame.
    """
    data_num = len(data)
    ratios = [int(r) for r in ratio.split(':')]
    total = sum(ratios)

    # Integer arithmetic on purpose: the float form int(r / total * n) can end
    # up one row short (29 / 100 * 100 == 28.999999999999996 -> 28).
    train_split = ratios[0] * data_num // total
    val_split = train_split + ratios[1] * data_num // total

    shuffled = data.sample(frac=1, random_state=seed)
    train_pairs = shuffled.iloc[:train_split]
    dev_pairs = shuffled.iloc[train_split:val_split]
    test_pairs = shuffled.iloc[val_split:]

    return train_pairs, dev_pairs, test_pairs

def extract_pair(self):
    """Build the ``id1`` / ``id2`` / ``label`` pair table from the rating records.

    Reads:
        self.dataset: iterable of dicts. Each has ``'first'`` and ``'second'``
            entries carrying a ``'dbid'``, plus ``'goals'``, ``'operations'``
            and ``'effects'`` lists of ``{'rating', 'confidence'}`` objects.
        self.tree_ds: DataFrame with an ``'id'`` column; a pair is kept only
            when both of its ids appear there.

    The label is the confidence-weighted mean of every rating that is not
    ``-1``, rounded with the built-in ``round`` (ties go to the even integer).
    Pairs without a single valid rating are skipped, because no mean can be
    computed for them.

    Returns:
        The resulting DataFrame, which is also stored on ``self.pair_ds``.
    """
    # Weight applied to a rating for each confidence level.
    confidence_map = {0: 0.6, 1: 0.8, 2: 1}
    # A set gives O(1) membership tests instead of scanning a list per pair.
    known_ids = set(self.tree_ds['id'].tolist())

    data_list = []
    for json_dict in self.dataset:
        accumulate = 0
        total_weight = 0
        for field in ['goals', 'operations', 'effects']:
            for rating_object in json_dict[field]:
                if rating_object['rating'] != -1:
                    weight = confidence_map[rating_object['confidence']]
                    accumulate += rating_object['rating'] * weight
                    total_weight += weight

        if total_weight == 0:
            # Every rating was -1: dividing would raise ZeroDivisionError.
            continue

        first_id = int(json_dict['first']['dbid'])
        second_id = int(json_dict['second']['dbid'])
        if first_id in known_ids and second_id in known_ids:
            data_list.append([first_id, second_id, round(accumulate / total_weight)])

    self.pair_ds = pd.DataFrame(data_list, columns=['id1', 'id2', 'label'])
    return self.pair_ds

In [33]:
# Read the raw SeSaMe records from JSON.
DATA_PATH = '../data/SeSaMe_VersionHistory.json'

with open(DATA_PATH, 'r', encoding='utf-8') as input_file:
    # The with-block closes the file on exit, so no explicit close() is needed.
    data = json.load(input_file)
mydf = pd.DataFrame(data)

# Shuffle and split 8:1:1 into train, validation and test sets.
# NOTE(review): these frames hold the raw columns (pairid, first, second, goals,
# operations, effects) and have no `label` column yet.
data_train, data_val, data_test = split_data(mydf, '8:1:1')

In [35]:
# Preview the training split; at this point it only has the raw rating columns.
data_train.head()

Unnamed: 0,pairid,first,second,goals,operations,effects
612,650,"{'dbid': '8643', 'project': 'deeplearning4j', ...","{'dbid': '36559', 'project': 'openjdk11', 'fil...","[{'rating': -1, 'confidence': -1}, {'rating': ...","[{'rating': -1, 'confidence': -1}, {'rating': ...","[{'rating': -1, 'confidence': -1}, {'rating': ..."
41,43,"{'dbid': '14803', 'project': 'freemind', 'file...","{'dbid': '45364', 'project': 'openjdk11', 'fil...","[{'rating': 2, 'confidence': 2}, {'rating': -1...","[{'rating': 2, 'confidence': 2}, {'rating': -1...","[{'rating': 2, 'confidence': 2}, {'rating': -1..."
773,814,"{'dbid': '1965', 'project': 'commons-lang', 'f...","{'dbid': '25762', 'project': 'openjdk11', 'fil...","[{'rating': 2, 'confidence': 2}, {'rating': -1...","[{'rating': 1, 'confidence': 1}, {'rating': -1...","[{'rating': 1, 'confidence': 1}, {'rating': -1..."
638,676,"{'dbid': '7547', 'project': 'deeplearning4j', ...","{'dbid': '15044', 'project': 'freemind', 'file...","[{'rating': -1, 'confidence': -1}, {'rating': ...","[{'rating': -1, 'confidence': -1}, {'rating': ...","[{'rating': -1, 'confidence': -1}, {'rating': ..."
370,395,"{'dbid': '1383', 'project': 'commons-collectio...","{'dbid': '43746', 'project': 'openjdk11', 'fil...","[{'rating': -1, 'confidence': -1}, {'rating': ...","[{'rating': -1, 'confidence': -1}, {'rating': ...","[{'rating': -1, 'confidence': -1}, {'rating': ..."


In [34]:
# Collapse `label` to a binary target: every value above 0 becomes 1.
# NOTE(review): the frames must already carry a `label` column (the
# id1 / id2 / label pair table). Splits built directly from the raw JSON do
# not have one, which is what raises KeyError: 'label' in this cell.
def _binarize_label(frame):
    """Return a new frame whose ``label`` values above 0 are replaced by 1."""
    # Building a new frame avoids writing through .loc onto the iloc slices
    # returned by split_data (chained-assignment hazard).
    return frame.assign(label=frame['label'].mask(frame['label'] > 0, 1))

data_train = _binarize_label(data_train)
data_val = _binarize_label(data_val)
data_test = _binarize_label(data_test)

print(data_train.shape)
print(data_val.shape)
print(data_test.shape)

KeyError: 'label'

In [None]:
# Load the pickled train / dev / test frames and shuffle each with the notebook seed.
# NOTE(review): DATA_DIR is not defined in the cells visible here; confirm it is
# set before this cell runs.
# read_pickle can execute arbitrary code, so only load pickles you produced yourself.
data_train, data_val, data_test = (
    pd.read_pickle(DATA_DIR + pkl_name).sample(frac=1, random_state=RANDOM_SEED)
    for pkl_name in ('/train_blocks.pkl', '/dev_blocks.pkl', '/test_blocks.pkl')
)




In [19]:
# Check the proportion of positive examples in the train and test sets.
# Uses data_train / data_test, the names every other cell uses; the previous
# train_data / test_data names are not defined in the cells visible here.
print(data_train['label'].mean())
print(data_test['label'].mean())

0.31478770131771594
0.37209302325581395


In [17]:
# Write the three splits to CSV under DATA_DIR/split_data.
SPLIT_DIR = DATA_DIR + '/split_data'
# Create the target folder if needed; to_csv does not create missing directories.
os.makedirs(SPLIT_DIR, exist_ok=True)
data_train.to_csv(SPLIT_DIR + '/train.csv', index=False)
data_val.to_csv(SPLIT_DIR + '/val.csv', index=False)
data_test.to_csv(SPLIT_DIR + '/test.csv', index=False)

In [20]:
# Show the first three training rows (id1, id2, label and the code_x / code_y columns).
data_train.head(3)

Unnamed: 0,id1,id2,label,code_x,code_versions_x,code_y,code_versions_y
497,11855,36123,0,"[[25, [18, [28], [114]], [469]], [4, [8, [0, [...","[[[25, [18, [28], [114]], [469]], [4, [8, [0, ...","[[25, [18, [173]], [2972]], [45, [140, [46, [3...","[[[25, [18, [173]], [2972]], [45, [140, [46, [..."
295,2356,33190,1,"[[25, [18, [28], [114]], [14, [86]], [1973], [...","[[[25, [18, [28], [114]], [14, [86]], [1973], ...","[[25, [18, [28], [114]], [14, [86]], [1973], [...","[[[25, [18, [28], [114]], [14, [86]], [1973], ..."
589,9876,20631,0,"[[25, [18, [28]], [2972], [23, [3, [2964]], [2...","[[[25, [18, [28]], [2972], [23, [3, [2964]], [...","[[25, [18, [44], [28]], [2972]], [297], [6, [1...","[[[25, [18, [44], [28]], [2972]], [297], [6, [..."


In [None]:
# Initialize torchtext Field objects.
# NOTE(review): torchtext.data.Field / TabularDataset are the legacy API (moved to
# torchtext.legacy.data in torchtext 0.9 and removed in 0.12); confirm the
# installed version still provides them.
text = torchtext.data.Field(lower=True, batch_first=True, tokenize='spacy', include_lengths=True)
target = torchtext.data.Field(sequential=False, use_vocab=False, is_target=True)
qid = torchtext.data.Field()

# Read the exported train / val / test CSVs. The dict keys must be column names
# from the CSV header (id1, id2, label, code_x, ...); the files have no
# 'question_text', 'target' or 'qid' column.
train = torchtext.data.TabularDataset(path=DATA_DIR + '/split_data/train.csv', format='csv',
                                      fields={'code_x': ('text', text),
                                              'label': ('target', target)})
val = torchtext.data.TabularDataset(path=DATA_DIR + '/split_data/val.csv', format='csv',
                                    fields={'code_x': ('text', text),
                                            'label': ('target', target)})
# NOTE(review): test examples keep their `qid` attribute, now filled from `id1`;
# confirm which identifier downstream code actually needs.
test = torchtext.data.TabularDataset(path=DATA_DIR + '/split_data/test.csv', format='csv',
                                     fields={'id1': ('qid', qid),
                                             'code_x': ('text', text)})