# Create and load w2v embeddings for method names

In [12]:
# Paths used throughout the notebook

# Root directory of the code-clone dataset that is walked further down.
code_clone_dataset_path = "code-clone"

# Output roots for the generated method-name embeddings; each of them is
# expected to contain a 'pos' and a 'neg' subdirectory.
left_embeddings_output_dir = "code-clone-method-embeddings-left"
right_embeddings_output_dir = "code-clone-method-embeddings-right"

In [13]:
# Imports

from re import finditer
from os import walk, path
from sklearn.datasets import load_files

import re
import os
import gensim.downloader as api
import numpy as np

In [14]:
# Make sure every <output root>/<label> folder exists before anything is written
for embeddings_root in (left_embeddings_output_dir, right_embeddings_output_dir):
    for label in ('pos', 'neg'):
        # exist_ok=True lets this cell be re-run without raising
        os.makedirs(path.join(embeddings_root, label), exist_ok=True)

In [15]:
def camel_case_split(identifier):
    """Break a camelCase identifier into its lower-cased words.

    ``doSomething`` becomes ``['do', 'something']``; a run of capitals that is
    followed by a capitalised word stays together, so ``HTTPServer`` becomes
    ``['http', 'server']``.  An empty identifier yields an empty list.
    """
    # Word-boundary regex taken from https://stackoverflow.com/a/29920015
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    words = []
    for piece in finditer(boundary_pattern, identifier):
        words.append(piece.group(0).lower())
    return words

In [16]:
def get_method_names(document):
    """Extract the two method names from a code-clone document.

    The document must hold two code fragments separated by exactly one
    ``[CLS]`` marker.  In each fragment the first text that looks like a
    method signature (optional modifier, a type word, a name, an opening
    parenthesis) is located and its name is split on camel-case boundaries.

    Args:
        document: Text of one code-clone sample.

    Returns:
        A ``(left_tokens, right_tokens)`` tuple of lower-cased word lists.
        A list is empty when no signature-like text was found in that
        fragment.

    Raises:
        ValueError: If the document does not contain exactly one ``[CLS]``.
    """
    fragments = document.split('[CLS]')
    if len(fragments) != 2:
        raise ValueError(
            "expected exactly one '[CLS]' separator, found {}".format(len(fragments) - 1)
        )

    # NOTE(review): the whitespace between the type word and the name is
    # optional (\s*), so a lone identifier directly followed by '(' can be
    # split and only its last character captured -- confirm this is acceptable
    # for the dataset before tightening the pattern.
    pat = re.compile(r'(?:public|private|static|protected|abstract|native|synchronized)?\s*\w+\s*(\w+)\s*\(')

    def name_tokens(fragment):
        """Return the camel-case words of the first method name in *fragment*."""
        match = pat.search(fragment)
        if match is None:
            # search() yields None when nothing signature-like is present;
            # report "no tokens" instead of failing on match.group().
            return []
        return camel_case_split(match.group(1))

    return name_tokens(fragments[0]), name_tokens(fragments[1])

In [17]:
def get_document_embeddings(data):
    """Compute the mean word vector of each method name in one document.

    Relies on the module-level ``model`` for vocabulary lookups.

    Args:
        data: Lines of one code-clone document (e.g. from ``readlines()``);
            they are joined with single spaces before the method names are
            extracted.

    Returns:
        A ``(left, right)`` tuple holding the averaged embedding of the left
        and of the right method name.
    """
    def mean_embedding(tokens):
        """Average the vectors of the tokens that the model knows."""
        vectors = [model[token] for token in tokens if token in model]
        if not vectors:
            # None of the words is in the vocabulary: fall back to a small
            # random vector.  Its length comes from the model so it always
            # matches the real embeddings instead of a hard-coded 300.
            vectors.append(np.random.normal(0, 0.1, size=(model.vector_size,)))
        return np.mean(vectors, axis=0)

    left_f, right_f = get_method_names(' '.join(data))

    # Left is processed before right so any random fallbacks are drawn in the
    # same order as before.
    return mean_embedding(left_f), mean_embedding(right_f)

In [18]:
def np_array_to_str(array):
    """Serialise an array as one line of comma-separated values.

    Every element is converted with ``str`` and the pieces are joined by ``,``
    with no brackets or spaces around them.
    """
    pieces = map(str, array)
    return ','.join(pieces)

In [19]:
%%capture
# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API; `model` is the lookup table used by get_document_embeddings.

model = api.load("word2vec-google-news-300")

In [20]:
# Walk the code-clone dataset, embed the two method names of every document
# and store the vectors under the matching 'pos'/'neg' output subdirectory.

for (dirpath, dirnames, filenames) in walk(code_clone_dataset_path):
    # Derive the label from the directory names *below* the dataset root.  A
    # substring test on the whole path would also fire on unrelated folders
    # such as ".../repos/..." and label every file as 'pos'.
    relative_parts = path.relpath(dirpath, code_clone_dataset_path).split(os.sep)
    pos = 'pos' if 'pos' in relative_parts else 'neg'

    for filename in filenames:
        file = path.join(dirpath, filename)
        output_l = path.join(left_embeddings_output_dir, pos, filename)
        output_r = path.join(right_embeddings_output_dir, pos, filename)

        # Close the input before writing; nothing below needs it open.
        with open(file, 'r') as f:
            left, right = get_document_embeddings(f.readlines())

        # utf-8 matches the encoding these files are read back with by load_files.
        with open(output_l, "w", encoding="utf-8") as out:
            out.write(np_array_to_str(left))
        with open(output_r, "w", encoding="utf-8") as out:
            out.write(np_array_to_str(right))

In [21]:
# Read the embedding files written above back in, decoded as UTF-8.
# The left-hand directory is loaded first, then the right-hand one.

left_embeddings, right_embeddings = (
    load_files(embeddings_root, encoding="utf-8")
    for embeddings_root in (left_embeddings_output_dir, right_embeddings_output_dir)
)

In [22]:
# Sanity check: show the first entry of the loaded left-side data, which is
# the comma-separated text of one stored embedding.
print(left_embeddings.data[0])

-0.1640625,-0.068359375,0.16796875,-0.045898438,-0.045654297,-0.087402344,0.018920898,0.032958984,-0.021728516,0.17578125,0.18164062,0.06689453,0.24023438,0.20117188,-0.14257812,-0.1953125,0.060546875,-0.20898438,0.036865234,-0.12402344,0.024291992,0.1015625,-0.18261719,0.36132812,-0.09765625,-0.13476562,-0.049316406,0.14648438,0.17480469,-0.09716797,-0.0546875,-0.18652344,0.23925781,-0.04638672,-0.041748047,-0.078125,-0.30273438,0.17675781,-0.033691406,-0.16796875,0.049804688,-0.07421875,0.17773438,0.0625,-0.020019531,0.07470703,-0.017089844,-0.033203125,-0.036865234,-0.028320312,0.028930664,0.1328125,-0.064453125,-0.08105469,-0.084472656,-0.0015106201,-0.20800781,-0.048583984,0.21875,0.07128906,-0.06689453,0.30664062,0.09033203,0.0119018555,0.17871094,-0.12890625,-0.078125,-0.016357422,-0.07128906,0.07324219,-0.064453125,0.16601562,0.40429688,0.026000977,-0.35546875,0.032226562,0.09082031,0.09375,-0.0703125,0.02355957,0.08496094,-0.09277344,0.099609375,0.095214844,-0.14160156,-0.0961