In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.linear_model import LogisticRegression
from nltk import word_tokenize
import pandas as pd
import numpy as np
import re

dataset = load_files("code-clone/")
docs = dataset.data
y = dataset.target

In [18]:
# useful methods
def evaluate_k_fold(v, df, y, cv=10):
    train = v.fit_transform(df)
    model = LogisticRegression(random_state=3, solver="liblinear", penalty="l1")
    score = cross_val_score(model, train, y, cv=cv)
    return "Accuracy for Logistic Regression: %0.4f (+/- %0.4f)" % (score.mean(), score.std() * 2)

def separate_methods(docs):
    for doc in docs:
        document = doc.decode()
        left, right = document.split('[CLS]')
        yield left, right

In [19]:
# Given code from sklearn.datasets.load_files(), extract list of unique class names.
# Current strategy is look for 'new', and the word after it is probably a class name.
def extract_classes(code):
    retVal = set()
    for file in code:
        tokens = word_tokenize(file.decode())
        for index, token in enumerate(tokens):
            if token == 'new':
                retVal.add(tokens[index+1])
    return retVal


# Counts Number of Unique Classes in each Function

In [20]:
class SimilarityScorer(BaseEstimator):
    def fit(self, data, unused):
        self.unique_classes = extract_classes(data)
        self.unique_classes.update({"byte", "short", "int", "long", "float", "double", "boolean", "String"})
        return self
        
    def transform(self, doc):
        retVal = []
        
        
        for file in doc:
            tokens = word_tokenize(file.decode())
            score = 0
            class_count = {}
            
            # First function, we want to count the number of times we
            # see a class being used
            done = False
            index = 0
            while done == False and index < len(tokens):
                token = tokens[index]
                if token.lower() == "cls" and tokens[index-1] == '[' and tokens[index+1] == ']':
                    done = True
                if token in self.unique_classes:
                    if token in class_count:
                        class_count[token] += 1
                    else:
                        class_count[token] = 1
                index += 1
                        
            # Here, we are on the 2nd function (usually)
            # or this file doesn't have a 2nd function (unlikely).
            # Now basically we want to see if the classes encountered here
            # are also classes that haved used in the first function.
            # Add 1 to similarity score for each class both functions
            # are using
            while index < len(tokens):
                token = tokens[index]
                if token in self.unique_classes and token in class_count and class_count[token] > 0:
                    class_count[token] -= 1
                    score += 1
                index += 1
                
            retVal.append(score)
        return pd.DataFrame(retVal)
                

In [21]:
v = ColumnTransformer([
    ("d", CountVectorizer(), "docs"),
    ("objects", SimilarityScorer(), "classes")
])
df = pd.DataFrame({"docs": docs, "classes": docs})
evaluate_k_fold(v, df, y, cv=10)

'Accuracy for Logistic Regression: 0.8260 (+/- 0.0460)'

# Count the java keywords on each method. Return cosine similarity of count vectors

In [22]:
import numpy as np
from nltk import word_tokenize

java_kwords = {
    'abstract', 'continue', 'for', 'new',
    'switch', 'assert', 'default', 'goto',
    'package', 'synchronized' 'boolean', 'do',
    'if', 'private', 'this','break', 'double',
    'implements', 'protected', 'throw', 'byte', 
    'else', 'import', 'public', 'throws', 'case',
    'enum', 'instanceof', 'return', 'transient',
    'catch', 'extends', 'int', 'short', 'try', 
    'char', 'final', 'interface', 'static', 'void',
    'class', 'finally', 'long', 'strictfp', 'while',
    'volatile', 'const', 'float', 'native', 'super'
}

java_kword_vectorizer = CountVectorizer()
java_kword_vectorizer.fit(java_kwords)

def get_keyword_vectorizer(code):
    features = list()
    tokens = word_tokenize(code)
    for token in tokens:
        if token in java_kwords:
            features.append(token)
    return java_kword_vectorizer.transform([' '.join(features)])

In [23]:
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

class SimilarityScorerKeywords(BaseEstimator):
    def fit(self, data, unused):
        return self
        
    def transform(self, documents):
        retVal = []
        
        for left, right in separate_methods(documents):
            left_counter = get_keyword_vectorizer(left)            
            right_counter = get_keyword_vectorizer(right)
            score = cosine_similarity(left_counter.todense(), right_counter.todense())
            retVal.append(score[0][0])
            # this returns the count as an array
            #score = np.hstack([left_counter.todense()[0], right_counter.todense()[0]])
            #retVal.append(np.array(score.flatten())[0])
        return pd.DataFrame(retVal)

In [24]:
v = ColumnTransformer([
    ("d", CountVectorizer(ngram_range=(1,2)), "docs"),
    ("k", SimilarityScorerKeywords(), "kwords"),
    
])

df = pd.DataFrame({"docs": docs, "kwords": docs})
evaluate_k_fold(v, df, y, cv=10)































'Accuracy for Logistic Regression: 0.7720 (+/- 0.0700)'

# Combining The Class Count and Keyword Estimators

In [25]:
v = ColumnTransformer([
    ("d", CountVectorizer(), "docs"),
    ("keyword", SimilarityScorerKeywords(), "kwds"),
    ("objects", SimilarityScorer(), "cls")
    
])

df = pd.DataFrame({"docs": docs, "kwds": docs, "cls": docs})
evaluate_k_fold(v, df, y, cv=10)































'Accuracy for Logistic Regression: 0.8320 (+/- 0.0395)'

# Creates features out of the functions signature. Specifically, the number of arguments and the return type

In [26]:
regex = "^[ \t]*(?:(?:public|private|protected|static|final|native"
regex += "|synchronized|abstract|transient|@Override|@Test)+\s+)+"
regex += "[$_\w<>\[\]\s]*\s+[\$_\w]+\([^\)]*\)?\s*"
non_kword_regex = "^[ \t]*[$_\w<>\[\]\s]*\s+[\$_\w]+\([^\)]*\)?\s*"
def get_java_func_signature(code):
    match = re.match(regex, code)
    ret_val_idx = -2
    if not match:
        match = re.match(non_kword_regex, code)
        ret_val_idx = -1
        if not match:
            return "void", []
    # remove )
    string = match.group(0).lstrip()[:-2]
    signature, args = string.split("(")
    args = args.split(", ")
    signature = signature.split(" ")
    ret_value = signature[ret_val_idx]
    return ret_value, args

In [27]:
class SimilarityScoreArgCount(BaseEstimator):
    """
    Evaluates based on the number of arguments that each method has.
    The score is len(left_args) - len(right_args)
    """
    def fit(self, data, unused):
        return self
        
    def transform(self, documents):
        args_diff = []
        for left, right in separate_methods(documents):
            _, largs = get_java_func_signature(left)
            _, rargs = get_java_func_signature(right)
            args_diff.append(len(largs) - len(rargs))
        return pd.DataFrame(args_diff)


class SimilarityScoreRetVals(BaseEstimator):
    """
    Score based on the return type of each method.
    """
    def fit(self, data, unused):
        self.unique_classes = set()
        for left, right in separate_methods(data):
            left_ret_val, _ = get_java_func_signature(left)
            self.unique_classes.add(left_ret_val)
            right_ret_val, _ = get_java_func_signature(right)
            self.unique_classes.add(right_ret_val)
        return self
        
    def transform(self, documents):
        vals = []
        vectorizer = CountVectorizer()
        vectorizer.fit(self.unique_classes)
        vocab = vectorizer.vocabulary_
        for left, right in separate_methods(documents):
            ret_type_l, _ = get_java_func_signature(left)
            ret_type_r, _ = get_java_func_signature(right)
            lret_id = vocab.get(ret_type_l.lower(), -1)
            rret_id = vocab.get(ret_type_r.lower(), -1)
            vals.append([lret_id, rret_id])
        return pd.DataFrame(vals)

In [28]:
# Results for argument count
v = ColumnTransformer([
    ("d", CountVectorizer(), "docs"),
    ("a", SimilarityScoreArgCount(), "argcount"),
])

df = pd.DataFrame({"docs": docs, "argcount": docs})
evaluate_k_fold(v, df, y, cv=10)

'Accuracy for Logistic Regression: 0.7705 (+/- 0.0662)'

In [30]:
# Results for retvals
v = ColumnTransformer([
    ("d", CountVectorizer(), "docs"),
    ("r", SimilarityScoreRetVals(), "retvals"),
    ("k", SimilarityScorerKeywords(), "kwords"),
    
])

df = pd.DataFrame({"docs": docs, "retvals": docs, "kwords": docs})
evaluate_k_fold(v, df, y, cv=10)































'Accuracy for Logistic Regression: 0.7655 (+/- 0.0609)'

In [31]:
# combined results

v = ColumnTransformer([
    ("d", CountVectorizer(), "docs"),
    ("r", SimilarityScoreRetVals(), "retvals"),
    ("a", SimilarityScoreArgCount(), "argcount")
])

df = pd.DataFrame({"docs": docs, "retvals": docs, "argcount": docs})
evaluate_k_fold(v, df, y, cv=10)

'Accuracy for Logistic Regression: 0.7660 (+/- 0.0572)'

# Best Combination So far

In [32]:
v = ColumnTransformer([
    ("d", CountVectorizer(), "docs"),
    ("a", SimilarityScoreArgCount(), "argcount"),
    ("o", SimilarityScorer(), "classes"),
    ("k", SimilarityScorerKeywords(), "kwords"),
])

df = pd.DataFrame({"docs": docs, "kwords": docs,
                   "argcount": docs, "classes": docs})
evaluate_k_fold(v, df, y, cv=10)































'Accuracy for Logistic Regression: 0.8315 (+/- 0.0422)'

# Using word2vec for method names

In [34]:
left_embeddings = load_files('code-clone-method-embeddings-left', encoding="utf-8")
right_embeddings = load_files('code-clone-method-embeddings-right', encoding="utf-8")

In [35]:
def rename_cols(prefix):
    return dict(zip(range(300), [f'{prefix}{i}' for i in range(300)]))

index = dict(enumerate(dataset.filenames))

les = pd.Series(left_embeddings.data, dtype='str', name='left')
le = les.str.split(',', expand=True)
le = le.rename(columns=rename_cols('l')).astype('float')

_res = pd.Series(right_embeddings.data, dtype='str', name='right')
_re = _res.str.split(',', expand=True)
_re = _re.rename(columns=rename_cols('r')).astype('float')

body = pd.Series(dataset.data, name='body')

embeddings_df = pd.concat([le, _re], axis=1)

In [36]:
print(embeddings_df.shape)

(2000, 600)


In [43]:
v = ColumnTransformer([
    ("docs", CountVectorizer(), "docs"),
], remainder='passthrough')

df = pd.concat([pd.Series(dataset.data, name='docs'), le, _re],  axis=1)

evaluate_k_fold(v, df, y, cv=10)

'Accuracy for Logistic Regression: 0.7590 (+/- 0.0595)'

### Using w2v embeddings for all java tokens

In [3]:
left_token_embeddings = load_files('code-clone-embeddings-left', encoding="utf-8")
right_token_embeddings = load_files('code-clone-embeddings-right', encoding="utf-8")

In [44]:
def rename_cols(prefix):
    return dict(zip(range(128), [f'{prefix}{i}' for i in range(128)]))

index = dict(enumerate(dataset.filenames))

les = pd.Series(left_token_embeddings.data, dtype='str', name='left')
le = les.str.split(',', expand=True)
le = le.rename(columns=rename_cols('l')).astype('float')

_res = pd.Series(right_token_embeddings.data, dtype='str', name='right')
_re = _res.str.split(',', expand=True)
_re = _re.rename(columns=rename_cols('r')).astype('float')

body = pd.Series(dataset.data, name='body')

tokens_df = pd.concat([le, _re], axis=1)

In [45]:
print(tokens_df.shape)

(2000, 256)


In [46]:
v = ColumnTransformer([
    ("docs", CountVectorizer(), "docs"),
], remainder='passthrough')

df = pd.concat([pd.Series(dataset.data, name='docs'), le, _re],  axis=1)

evaluate_k_fold(v, df, y, cv=10)

'Accuracy for Logistic Regression: 0.7680 (+/- 0.0697)'

# Combining all features

In [47]:
previous_features_df = pd.DataFrame(
    {"docs": docs, "sign": docs, 
     "kwds": docs, "cls": docs, 
#      "dist": docs
    }
)

In [48]:
v = ColumnTransformer([
    ("d", CountVectorizer(), "docs"),
    ("signature", SimilarityScoreArgDifference(), "sign"),
    #("retvaldist", SimilarityScoreRetValDist(), "dist"), # this made prediction worse
    ("keyword", SimilarityScorerKeywords(), "kwds"),
    ("objects", SimilarityScorer(), "cls"),
#     ("body", CountVectorizer(ngram_range=(1, 2)), "body"),
], remainder="passthrough")

df = pd.concat([tokens_df, embeddings_df, previous_features_df], axis=1)


NameError: name 'SimilarityScoreArgDifference' is not defined

In [19]:
df

Unnamed: 0,l0,l1,l2,l3,l4,l5,l6,l7,l8,l9,...,r294,r295,r296,r297,r298,r299,docs,sign,kwds,cls
0,-0.164062,-0.068359,0.167969,-0.045898,-0.045654,-0.087402,0.018921,0.032959,-0.021729,0.175781,...,-0.078033,0.095947,0.119812,-0.202148,-0.011719,-0.048584,b'public static void main(String[] args) throw...,b'public static void main(String[] args) throw...,b'public static void main(String[] args) throw...,b'public static void main(String[] args) throw...
1,0.033407,0.071126,0.124512,0.116536,-0.044474,0.053385,-0.039530,-0.269857,0.033203,0.068156,...,0.097656,-0.098267,0.040039,-0.346680,-0.225830,0.107422,b'private void onDhReply(final SshDhReply msg)...,b'private void onDhReply(final SshDhReply msg)...,b'private void onDhReply(final SshDhReply msg)...,b'private void onDhReply(final SshDhReply msg)...
2,0.051025,-0.228516,0.103027,0.004944,-0.104980,0.051025,0.145508,0.076172,0.048096,0.055664,...,-0.196777,-0.152344,0.116577,0.039246,-0.036255,0.188965,b'public static final String MD5(String value)...,b'public static final String MD5(String value)...,b'public static final String MD5(String value)...,b'public static final String MD5(String value)...
3,0.085938,0.024658,-0.165039,0.041016,-0.106445,-0.156250,-0.022949,0.148438,0.112305,0.257812,...,0.093628,0.078528,0.011108,-0.127930,0.027832,0.035970,b'private static void zip(ZipOutputStream aOut...,b'private static void zip(ZipOutputStream aOut...,b'private static void zip(ZipOutputStream aOut...,b'private static void zip(ZipOutputStream aOut...
4,-0.020966,0.126465,-0.006226,0.083374,-0.072998,-0.062378,0.070892,-0.164185,0.009949,0.130280,...,0.078320,-0.087463,-0.079102,-0.052429,-0.060394,0.026154,"b'private String AddAction(ResultSet node, Str...","b'private String AddAction(ResultSet node, Str...","b'private String AddAction(ResultSet node, Str...","b'private String AddAction(ResultSet node, Str..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.337891,0.198242,-0.296875,0.148438,-0.217773,-0.036865,-0.005829,-0.121094,0.142578,-0.050537,...,0.060669,0.053223,-0.138184,0.032684,0.077026,-0.123779,b'public Object[] bubblesort(Object[] tosort) ...,b'public Object[] bubblesort(Object[] tosort) ...,b'public Object[] bubblesort(Object[] tosort) ...,b'public Object[] bubblesort(Object[] tosort) ...
1996,0.078125,-0.038513,-0.137756,0.172852,-0.122559,-0.013794,0.079956,0.034790,0.027161,0.019897,...,0.394531,-0.138672,-0.273438,0.067383,-0.191406,0.261719,b'@Override protected void doGet(HttpServletRe...,b'@Override protected void doGet(HttpServletRe...,b'@Override protected void doGet(HttpServletRe...,b'@Override protected void doGet(HttpServletRe...
1997,-0.040009,-0.022766,-0.020905,0.013359,-0.228638,0.032959,0.012756,0.012512,0.083313,0.023621,...,0.068671,-0.097803,-0.039502,-0.086902,-0.166650,0.135059,b'private static void insertModuleInEar(File f...,b'private static void insertModuleInEar(File f...,b'private static void insertModuleInEar(File f...,b'private static void insertModuleInEar(File f...
1998,-0.001971,-0.034294,0.060634,0.079738,-0.061131,0.084874,0.033482,-0.008929,0.100133,0.055263,...,-0.041504,0.260742,0.163818,0.078369,0.069580,-0.006836,b'public void testCodingBeyondContentLimitFrom...,b'public void testCodingBeyondContentLimitFrom...,b'public void testCodingBeyondContentLimitFrom...,b'public void testCodingBeyondContentLimitFrom...


In [22]:
lr_model = LogisticRegression(random_state=3, solver='liblinear', penalty='l2')

X_train = scaler.fit_transform(v.fit_transform(df))
y_train = dataset.target

lr_cross_val_score = cross_val_score(lr_model, X_train, y_train, cv=10)

In [23]:
print("Accuracy for Logistic Regression: %0.4f (+/- %0.4f)" % (lr_cross_val_score.mean(), lr_cross_val_score.std() * 2))

Accuracy for Logistic Regression: 0.8260 (+/- 0.0705)
