Check the quality of the embeddings here

In [1]:
import ast
import json
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

model = Doc2Vec.load("doc2vec.libraries.io.200.100.2.5.20")
libs = pd.read_csv("../export/GroupArtifact.csv").set_index("id")
rules = pd.read_csv("../test_data/ground_truth.csv", sep=";")
rules["fromIds"] = rules["fromIds"].apply(lambda x: ast.literal_eval(x))
rules["toIds"] = rules["toIds"].apply(lambda x: ast.literal_eval(x))

lib_ids = set([i for a in rules["fromIds"] for i in a] + [i for a in rules["toIds"] for i in a])
lib_set = set(rules["fromLibrary"]) | set(rules["toLibrary"])
ga_set = set(map(lambda x: libs["groupId"][x] + ":" + libs["artifactId"][x], lib_ids))
rules_set = set(rules["fromLibrary"] + " " + rules["toLibrary"])

In [2]:
test_libs = ["org.json:json", "com.alibaba:fastjson", "org.postgresql:postgresql", "commons-io:commons-io", "org.apache.opennlp:opennlp-tools",
            "log4j:log4j", "org.slf4j:slf4j-log4j12", "com.google.guava:guava", "commons-httpclient:commons-httpclient"]
for l in test_libs:
    print(l, model.wv.most_similar(l))

org.json:json [('junit:junit', 0.5462886691093445), ('org.wso2.orbit.commons-fileupload:commons-fileupload', 0.45649901032447815), ('p001.plugins:plugin022', 0.45633548498153687), ('com.google.code.gson:gson', 0.45601993799209595), ('nu.xom:xom', 0.45287811756134033), ('com.connexience:monitoring-common', 0.44636374711990356), ('at.ac.tuwien.big:testsuite-api', 0.4422326683998108), ('com.flaptor.hist4j:hist4j', 0.43952322006225586), ('org.krohm.gameengine:Entities', 0.4378364682197571), ('com.ctc.wstx:woodstox-osgi', 0.4362933039665222)]
com.alibaba:fastjson [('net.sf.json-lib:json-lib', 0.42310240864753723), ('com.aaron.common.api:common_api', 0.37858834862709045), ('com.google.code.gson:gson', 0.36803483963012695), ('com.qiniu:qiniu-java-sdk', 0.3567369282245636), ('org.mybatis.generator:mybatis-generator-core', 0.3492923378944397), ('com.edcs.tds:tds-storm-common', 0.33768564462661743), ('org.redisson:redisson-spring-data-20', 0.33767974376678467), ('zhangpai-rsc:zhangpai-rabbit-fra

In [3]:
[name for name in ga_set if name not in model.wv]

['io.github.benas:xstream',
 'com.netflix.Nicobar:nicobar-manager',
 'de.ebf:spring-granular-permissions',
 'org.keycloak:keycloak-appliance-dist-all',
 'org.javabits.jgrapht:jgrapht',
 'de.softwareforge:dumbster',
 'org.jasig.portal:uPortal-security-permissions',
 'com.liferay:com.liferay.application.list.user.personal.site.permissions',
 'org.kie.modules:org-apache-mina',
 'io.paradoxical:json-path',
 'org.dellroad:leveldb']

$$
Quality = \frac{1}{|R|}\sum_{(l_1, l_2) \in R}\cos(l_1, l_2) - \frac{1}{|\bar{R}|}\sum_{(l_1, l_2) \in \bar{R}}\cos(l_1, l_2)
$$

In [4]:
from numpy import dot
from numpy.linalg import norm
def idx2ga(idx):
    return libs["groupId"][idx] + ":" + libs["artifactId"][idx]
score = 0
ga_true_positive = set()
ga_true_negative = set()
list_of_libraries = rules["fromIds"] + rules["toIds"]
#print(list_of_libraries)
for l in list_of_libraries:
    for x in l:
        for y in l:
            if x != y and idx2ga(x) in model.wv and idx2ga(y) in model.wv:
                ga_true_positive.add((idx2ga(x), idx2ga(y)))
for x in ga_set:
    for y in ga_set:
        if (x, y) not in ga_true_positive:
            if x != y and x in model.wv and y in model.wv:
                ga_true_negative.add((x, y))
#print(len(ga_true_positive), len(ga_true_negative))
#print(list(ga_true_positive)[0:10], list(ga_true_negative)[0:10])
r1 = 1 / len(ga_true_positive) 
r2 = 1 / len(ga_true_negative)
for l1, l2 in ga_true_positive:
    score += model.wv.distance(l1, l2) * r1
for l1, l2 in ga_true_negative:
    score -= model.wv.distance(l1, l2) * r2
print("Score: ", score)

Score:  -0.07007745479810075


|  Data            |   Parameters     | Quality  |
|------------------|------------------|----------|
| Full POM, (0, 4) | 200.100.2.1.20   | -0.05033 |
| Full POM, (0, 4) | 200.100.5.1.20   | -0.04515 |
| Full POM, (0, 4) | 200.100.2.5.20   | -0.03154 |
| Full POM, (0, 4) | 100.100.10.1.20  | -0.05737 |
| Full POM, (0, 4) | 100.2.10.100.20  | -0.07002 |
| LibrariesIO      | 200.100.2.5.20   | -0.07007 |
| LibrariesIO      | 200.100.5.5.20   | -0.06718 |
| LibrariesIO      | 200.100.20.5.20  | -0.06169 |

In [5]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
X_name = [name for name in ga_set if name in model.wv]
X = np.array([model.wv.get_vector(name) for name in X_name])
X_embedded = TSNE(n_components=2, perplexity=5).fit_transform(X)
fig, ax = plt.subplots(figsize=(100, 100))
ax.scatter([x[0] for x in X_embedded], [x[1] for x in X_embedded])
for i, x in enumerate(X_embedded):
    ax.annotate(X_name[i], (x[0], x[1]))
fig.savefig("tsne.png")