# Preprocess code-clone dataset
## Generates and loads w2v embeddings for each pair of functions in the dataset

In [1]:
# Environment Constants

## Path to the pretrained token embeddings; loaded below with `binary=False`, i.e. as word2vec text format.
## Download token_vecs.txt from https://s3.amazonaws.com/code2vec/model/token_vecs.tar.gz
## Run `tar -xvzf token_vecs.tar.gz` to extract
pretrained_w2v_model_path = 'data/word2vec-trained/tokens/token_vecs.txt'
## Root directory of the code-clone dataset; it is walked recursively when the embeddings are generated.
code_clone_dataset_path = 'code-clone'
## Token that separates the left function from the right function inside one dataset file.
## It is compared against lower-cased tokens, so it has to be lower-case itself.
token_divider = '[cls]'

## Output roots for the averaged embeddings of the left and right function of every pair.
## There should be a 'pos' and 'neg' subdirectory under each of these:
left_embeddings_output_dir = 'code-clone-embeddings-left'
right_embeddings_output_dir = 'code-clone-embeddings-right'

In [2]:
# Imports

from os import makedirs, path, walk

import numpy as np
from gensim.models import KeyedVectors as word2vec
from gensim.test.utils import datapath
from sklearn.datasets import load_files

In [3]:
# Import the pretrained Java w2v token embeddings

# `word2vec` is the notebook's alias for gensim's KeyedVectors; binary=False reads
# the file at pretrained_w2v_model_path as text rather than as binary.
tokens = word2vec.load_word2vec_format(pretrained_w2v_model_path, binary=False)
# Sanity check of the loaded vectors. Kept as the last expression of the cell so
# that the notebook displays the list of (token, similarity) pairs.
tokens.most_similar(positive=['equals', 'tolower'])

[('equalsignorecase', 0.5233527421951294),
 ('areequal', 0.47283560037612915),
 ('identityequals', 0.4442278742790222),
 ('indexrangekey', 0.4407910704612732),
 ('browserelements', 0.43472132086753845),
 ('streamq', 0.4342491328716278),
 ('equal', 0.43297117948532104),
 ('isequal', 0.4325423240661621),
 ('negativearrowvisible', 0.43036386370658875),
 ('pwdlen', 0.42697927355766296)]

In [4]:
def get_document_embeddings(data, embeddings=None, divider=None):
    """Average the token embeddings of the left and right functions of one document.

    The lines of the document are joined, split on whitespace and lower-cased.
    Tokens seen before the first divider token belong to the left function,
    tokens seen after it belong to the right function. Tokens that are not in
    the embedding vocabulary are ignored.

    Args:
        data: Iterable of strings, e.g. the result of ``file.readlines()``.
        embeddings: Object supporting ``token in embeddings`` and
            ``embeddings[token]``. Defaults to the notebook-level ``tokens``.
        divider: Lower-case token separating the two functions. Defaults to
            the notebook-level ``token_divider``.

    Returns:
        A ``(left, right)`` tuple of 1-D numpy arrays holding the mean
        embedding of each side. A side without any known token yields a zero
        vector rather than the scalar ``nan`` (and RuntimeWarning) that
        ``np.mean([], axis=0)`` produces, so the result can always be
        iterated element by element.
    """
    if embeddings is None:
        embeddings = tokens
    if divider is None:
        divider = token_divider

    left = []
    right = []
    current = left

    for token in (t.lower() for t in ' '.join(data).split()):
        if token == divider:
            # Everything after the divider belongs to the right function.
            current = right
        elif token in embeddings:
            current.append(embeddings[token])

    # Dimension used for the zero-vector fallback: prefer the model's own
    # attribute, otherwise infer it from any vector collected above.
    dim = getattr(embeddings, 'vector_size', None)
    if dim is None:
        collected = left or right
        dim = len(collected[0]) if collected else 0

    def _mean_or_zeros(vectors):
        """Return the element-wise mean of vectors, or zeros when there are none."""
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(dim, dtype=np.float32)

    return _mean_or_zeros(left), _mean_or_zeros(right)

In [5]:
def np_array_to_str(array):
    """Serialise an iterable of values as a single `,`-separated string.

    Every element is converted with ``str`` and the pieces are joined by `,`
    with no surrounding whitespace.
    """
    pieces = [str(value) for value in array]
    return ','.join(pieces)

In [6]:
# Iterate over the code-clone dataset, generate the document embeddings and save them to the output directories

# Create the label subdirectories up front so that writing the first output
# file cannot fail on a fresh checkout.
for output_dir in (left_embeddings_output_dir, right_embeddings_output_dir):
    for label in ('pos', 'neg'):
        makedirs(path.join(output_dir, label), exist_ok=True)

for (dirpath, dirnames, filenames) in walk(code_clone_dataset_path):
    # Derive the label from the directory components below the dataset root.
    # A substring test on the whole dirpath would label every file 'pos' as
    # soon as any parent directory name merely contains "pos" (e.g. "repos").
    relative_dir = path.normpath(path.relpath(dirpath, code_clone_dataset_path))
    pos = 'pos' if 'pos' in relative_dir.split(path.sep) else 'neg'

    for filename in filenames:
        input_path = path.join(dirpath, filename)
        output_l = path.join(left_embeddings_output_dir, pos, filename)
        output_r = path.join(right_embeddings_output_dir, pos, filename)

        # Read the document first so the input file is closed before writing.
        with open(input_path, 'r') as f:
            lines = f.readlines()

        left, right = get_document_embeddings(lines)

        # A scalar result means one side had no usable token; it cannot be
        # written element by element, so skip the pair on both sides.
        if np.ndim(left) == 0 or np.ndim(right) == 0:
            print('Skipping {}: no embeddable tokens on one side'.format(input_path))
            continue

        with open(output_l, "w") as out:
            out.write(np_array_to_str(left))
        with open(output_r, "w") as out:
            out.write(np_array_to_str(right))

In [7]:
# Import the embeddings we just generated into memory

left_embeddings = load_files(left_embeddings_output_dir)
right_embeddings = load_files(right_embeddings_output_dir)

# The two bunches come from independent load_files calls. Check that entry i
# of the left bunch and entry i of the right bunch describe the same pair.
assert len(left_embeddings.data) == len(right_embeddings.data), \
    "left/right embedding counts differ"
assert np.array_equal(left_embeddings.target, right_embeddings.target), \
    "left/right labels are not aligned"
assert ([path.basename(p) for p in left_embeddings.filenames]
        == [path.basename(p) for p in right_embeddings.filenames]), \
    "left/right file names are not aligned"

# Each entry is the raw bytes of one output file, i.e. the comma-separated
# vector written by the generation cell.
print(left_embeddings.data[0])


b'-0.05122124,-0.12216224,0.101226486,-0.021237344,-0.042222623,0.05646271,0.044890355,0.121772684,-0.1112092,0.19515315,-0.031402837,0.07099562,0.0016093374,-0.0136487335,-0.10884587,-0.09148396,0.10080034,0.12520233,-0.03916842,0.055381935,0.08611373,-0.18527326,0.058904864,-0.02254408,0.16800413,0.072488315,0.09183001,0.027677659,0.0319309,0.16355889,-0.09993939,-0.06628542,-0.019778844,0.13314123,0.025044981,0.046441235,-0.22238311,-0.028121132,-0.0544777,-0.08333411,0.12823798,-0.03361119,0.023782631,-0.15554741,0.10700291,0.056919258,-0.06639689,-0.0538708,-0.12804495,0.08812185,-0.106228866,-0.14831069,0.17591883,0.05220724,0.032535713,-0.036819734,0.02613903,-0.15982021,0.04480314,-0.0609802,0.22196603,0.010189606,0.18794441,0.13692869,-0.00090712175,0.093317926,0.014242549,-0.023852697,-0.09821846,0.09576436,-0.034287006,-0.03790975,-0.0057899123,-0.06626237,0.05633267,-0.052551094,-0.10690658,-0.11371188,-0.017713418,0.096476145,0.017423538,0.13760008,-0.05342849,0.011066714,