In [1]:
import pandas as pd
import numpy as np
import os
import javalang

In [2]:
# Path prefixes shared by the cells below. They are combined by plain string
# concatenation (root + language + ...), so the trailing '/' on each matters.
root = 'data/'
language = 'gcj/'

In [3]:
def parse_source(output_file, code_dir='../dataset/gcj_merge/code/'):
    """Parse every Java source file in ``code_dir`` into a javalang AST.

    Args:
        output_file: File name of the pickle written under ``root``/``language``.
        code_dir: Directory holding one ``<numeric id>.txt`` file per program.
            Defaults to the location that used to be hard-coded.

    Returns:
        pandas.DataFrame with the columns ``id`` (int taken from the file
        name) and ``code`` (the parsed tree, or ``None`` when the program
        could not be parsed). The same frame is pickled to ``output_file``.
    """
    path = os.path.join(root, language, output_file)

    def parse_program(func):
        tokens = javalang.tokenizer.tokenize(func)
        tree = javalang.parser.parse(tokens)
        return tree

    source = {'id': [], 'code': []}
    # Sorted so the row order does not depend on the file system's listing order.
    for file_name in sorted(os.listdir(code_dir)):
        try:
            program_id = int(file_name.replace('.txt', ''))
        except ValueError:
            # Not a "<number>.txt" entry (hidden file, checkpoint dir, ...): skip
            # it instead of aborting the whole run.
            continue
        with open(os.path.join(code_dir, file_name), 'r') as f:
            s = f.read()
        try:
            tree = parse_program(s)
        except Exception:
            # Best effort: an unparsable program is kept with a None tree so the
            # caller can drop it. Catching Exception rather than using a bare
            # except lets KeyboardInterrupt / SystemExit propagate.
            tree = None
        source['id'].append(program_id)
        source['code'].append(tree)

    source = pd.DataFrame(source)
    source.to_pickle(path)
    return source

In [4]:
# Parse all programs and pickle the resulting id/AST frame as 'ast.pkl'.
source = parse_source(output_file='ast.pkl')

In [5]:
# parse_source stores None for programs that failed to parse; drop those rows.
source = source.dropna()
source.shape

(3330, 2)

In [10]:
# Display (rows, columns) of the current `source` frame.
source.shape

(3330, 2)

In [6]:
def dictionary_and_embedding(size, trees=None):
    """Train a Word2Vec model over AST node sequences and save it to disk.

    Args:
        size: Passed to Word2Vec as ``size``; it is also the suffix of the
            saved model file ``embedding/node_w2v_<size>`` under
            ``root + language``.
        trees: DataFrame whose ``code`` column holds the parsed ASTs. When
            omitted, the notebook-level ``source`` frame is used, which is
            what this function always did before the parameter existed.

    Returns:
        None. The trained model is written to disk.
    """
    from utils import get_sequence as func
    from gensim.models.word2vec import Word2Vec

    data_path = root+language
    if trees is None:
        trees = source   # personal modification: reuse the in-memory frame

    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory already exists.
    os.makedirs(data_path+'embedding', exist_ok=True)

    def trans_to_sequences(ast):
        # `func` fills `sequence` in place; presumably with the node tokens of
        # `ast` -- confirm against utils.get_sequence.
        sequence = []
        func(ast, sequence)
        return sequence
    corpus = trees['code'].apply(trans_to_sequences)

    w2v = Word2Vec(corpus, size=size, workers=16, sg=1, max_final_vocab=3000)
    w2v.save(data_path+'embedding/node_w2v_' + str(size))

In [7]:
# Train the node embedding with size=128 and save it under root+language+'embedding/'.
dictionary_and_embedding(128)

In [8]:
def generate_block_seqs(source, size):
    """Encode every AST in ``source`` as nested lists of vocabulary indices.

    Args:
        source: DataFrame with a ``code`` column of parsed ASTs (and usually
            an ``id`` column). It is copied, not modified.
        size: Suffix of the saved Word2Vec model to load
            (``embedding/node_w2v_<size>`` under ``root + language``).

    Returns:
        A copy of ``source`` whose ``code`` column holds, per program, a list
        with one nested index list per block produced by
        ``utils.get_blocks_v1``. A ``label`` column, if present, is removed.
    """
    from utils import get_blocks_v1 as func
    from gensim.models.word2vec import Word2Vec

    word2vec = Word2Vec.load(root+language+'embedding/node_w2v_' + str(size)).wv
    vocab = word2vec.vocab
    # The number of embedding rows doubles as the index for out-of-vocabulary
    # tokens. `vectors` is the replacement for the deprecated `syn0` attribute.
    max_token = word2vec.vectors.shape[0]

    def tree_to_index(node):
        # Result layout: [index of this node's token, <encoded child>, ...].
        token = node.token
        result = [vocab[token].index if token in vocab else max_token]
        for child in node.children:
            result.append(tree_to_index(child))
        return result

    def trans2seq(r):
        # `func` appends the blocks of AST `r` to `blocks` in place.
        blocks = []
        func(r, blocks)
        return [tree_to_index(b) for b in blocks]

    trees = pd.DataFrame(source, copy=True)
    # Iterate the column directly; building a Series per row via iterrows is
    # unnecessary when only `code` is read.
    trees['code'] = [trans2seq(ast) for ast in trees['code']]
    if 'label' in trees.columns:
        trees = trees.drop('label', axis=1)
    return trees

In [9]:
# Encode every AST as nested vocabulary-index lists using the size-128 embedding.
trees = generate_block_seqs(source, size=128)

  import sys


In [10]:
# Persist the index-encoded blocks as 'blocks.pkl' under root+language.
trees.to_pickle(root+language+'blocks.pkl')

In [11]:
# Display (rows, columns) of the encoded frame.
trees.shape

(3330, 2)

In [12]:
def merge(data_path, blocks):
    """Attach block sequences to both sides of a pickled pair table.

    Loads the pair frame stored at ``data_path``, casts ``id1`` and ``id2`` to
    int and left-joins ``blocks`` once per side on ``blocks['id']``. The
    helper ``id`` columns are removed afterwards, rows containing any missing
    value are discarded, the path and final shape are printed, and the result
    is pickled to ``data_path + '_blocks.pkl'``. Nothing is returned.
    """
    pair_table = pd.read_pickle(data_path).astype({'id1': int, 'id2': int})

    joined = pair_table
    for side_key in ('id1', 'id2'):
        # The second pass collides on 'id' and 'code', which is what produces
        # the _x / _y suffixed columns.
        joined = joined.merge(blocks, how='left', left_on=side_key, right_on='id')

    joined = joined.drop(['id_x', 'id_y'], axis=1).dropna()
    print(data_path, joined.shape)
    joined.to_pickle(data_path + '_blocks.pkl')


In [13]:
# gcj cross: attach block sequences to both cross-gcj pair files
for pairs_name in ('gcj_sample.pkl', 'gcj_old.pkl'):
    merge('./data/train_data/cross-gcj/' + pairs_name, trees)

./data/train_data/cross-gcj/gcj_sample.pkl (500000, 5)
./data/train_data/cross-gcj/gcj_old.pkl (500000, 5)


In [34]:
# gcj: attach block sequences to every cbcb split file.
# The directory gets its own name because `root` is the base path read by
# parse_source, dictionary_and_embedding and generate_block_seqs; overwriting
# it here would make those cells fail when they are re-run.
cbcb_root = './data/train_data/cbcb/'
for fold in ['0', '1', '2']:
    for label in ['fun', 'random']:
        for split in ['train', 'test', 'val']:
            merge(cbcb_root + split + '_' + label + fold, trees)


./data/train_data/cbcb/train_fun0 (731180, 5)
./data/train_data/cbcb/test_fun0 (118705, 5)
./data/train_data/cbcb/val_fun0 (116898, 5)
./data/train_data/cbcb/train_random0 (429135, 5)
./data/train_data/cbcb/test_random0 (48812, 5)
./data/train_data/cbcb/val_random0 (47776, 5)
./data/train_data/cbcb/train_fun1 (867008, 5)
./data/train_data/cbcb/test_fun1 (84743, 5)
./data/train_data/cbcb/val_fun1 (83048, 5)
./data/train_data/cbcb/train_random1 (429135, 5)
./data/train_data/cbcb/test_random1 (48812, 5)
./data/train_data/cbcb/val_random1 (47776, 5)
./data/train_data/cbcb/train_fun2 (731114, 5)
./data/train_data/cbcb/test_fun2 (118618, 5)
./data/train_data/cbcb/val_fun2 (117004, 5)
./data/train_data/cbcb/train_random2 (429135, 5)
./data/train_data/cbcb/test_random2 (48812, 5)
./data/train_data/cbcb/val_random2 (47776, 5)


In [35]:
# gcj: rebuild only the 'test_fun' files for folds 0 and 1.
# The directory gets its own name because `root` is the base path read by
# parse_source, dictionary_and_embedding and generate_block_seqs; overwriting
# it here would make those cells fail when they are re-run.
cbcb_root = './data/train_data/cbcb/'
for fold in ['0', '1']:
    for label in ['fun']:
        for split in ['test']:
            merge(cbcb_root + split + '_' + label + fold, trees)


Unnamed: 0,id1,id2,label,code_x,code_y
0,4599608,21979462,1,"[[26, [2947], [17, [60], [23]], [423], [21, [4...","[[26, [17, [199]], [4, [22]], [2947], [21, [4,..."
1,15639580,21979462,1,"[[300, [17, [23]], [2947], [2947], [21, [4, [1...","[[26, [17, [199]], [4, [22]], [2947], [21, [4,..."
2,323161,21979462,1,"[[26, [17, [60], [23]], [857], [21, [4, [731]]...","[[26, [17, [199]], [4, [22]], [2947], [21, [4,..."
3,813588,21979462,1,"[[26, [17, [60], [23]], [857], [21, [4, [731]]...","[[26, [17, [199]], [4, [22]], [2947], [21, [4,..."
4,4530695,21979462,1,"[[26, [17, [53]], [4, [22]], [2947], [21, [4, ...","[[26, [17, [199]], [4, [22]], [2947], [21, [4,..."
