In [2]:
import pandas as pd
import itertools

In [3]:
# Load the ground-truth migration rules. Missing cells are replaced with ""
# so that every value in the artifact columns can be split as a string.
ground_truth = pd.read_csv("ground-truth-2014.csv").fillna("")

# Each artifact column stores a ";"-separated list; turn every cell into a Python list.
# NOTE(review): an empty cell becomes [""] here, so a rule with an empty artifact
# name can end up in valid_rules -- confirm whether that is intended.
for column in ("fromGroupArtifacts", "toGroupArtifacts"):
    ground_truth[column] = ground_truth[column].apply(lambda cell: cell.split(";"))

# A ground-truth row declares every (fromLib, toLib) combination of its two lists
# to be a valid rule; collect the union of those combinations over all rows.
valid_rules = set(
    itertools.chain.from_iterable(
        itertools.product(from_artifacts, to_artifacts)
        for from_artifacts, to_artifacts in zip(
            ground_truth["fromGroupArtifacts"], ground_truth["toGroupArtifacts"]
        )
    )
)
print("{} valid rules before extension".format(len(valid_rules)))

1660 valid rules before extension


In [4]:
# Load the recommendation output: one row per (fromGroupArtifact, toGroupArtifact) candidate.
results = pd.read_csv("recommend-output-xyl.csv")

# Dropping recommendations whose two sides share a group ID is not needed here;
# it seems to be already done in LibraryRecommendJob. Kept for reference:
# results = results[results.apply(lambda x: x["fromGroupArtifact"].split(":")[0] != x["toGroupArtifact"].split(":")[0], axis=1)]

# A fromLib is only evaluated if at least one of its recommended (fromLib, toLib)
# pairs is a ground-truth rule; collect the fromLibs for which none is.
from_lib_to_remove = set()
for from_lib, group in results.groupby(by="fromGroupArtifact"):
    candidate_pairs = ((from_lib, to_lib) for to_lib in group["toGroupArtifact"])
    if valid_rules.isdisjoint(candidate_pairs):
        from_lib_to_remove.add(from_lib)

# Keep only the rows whose fromLib survived the check above.
keep_mask = ~results["fromGroupArtifact"].isin(from_lib_to_remove)
results = results[keep_mask]
print("{} libraries removed, {} remaining".format(len(from_lib_to_remove), len(set(results["fromGroupArtifact"]))))

34 libraries removed, 53 remaining


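We use the following evaluation metrics, computed over the top-k recommendations (ranked by descending confidence) of every remaining library:
1. Precision@k
2. Recall@k
3. F-measure@k (the harmonic mean of Precision@k and Recall@k)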

In [5]:
# Largest cut-off k for which Precision@k / Recall@k / F-measure@k are reported.
top_k = 10

# Ground-truth rules that actually occur among the recommendations. Recall is
# measured against this set, not against every rule in valid_rules.
valid_rules_in_result = set(zip(results["fromGroupArtifact"], results["toGroupArtifact"])) & valid_rules
print("{} out of {} rules exist in the data".format(len(valid_rules_in_result), len(valid_rules)))

# top_rules[k] first collects, for every fromLib, the (fromLib, toLib) pair ranked
# at 0-based position k when that library's recommendations are ordered by
# descending confidence. Only the first top_k recommendations per library matter.
top_rules = [list() for _ in range(0, top_k)]
for from_lib, result in results.groupby(by="fromGroupArtifact"):
    to_libs = result.sort_values(by="confidence", ascending=False)["toGroupArtifact"].head(top_k)
    for k, to_lib in enumerate(to_libs):
        top_rules[k].append((from_lib, to_lib))

# Make the lists cumulative: afterwards top_rules[k] holds every pair ranked at
# position 0..k, i.e. the full top-(k + 1) recommendation set.
for k in range(1, top_k):
    top_rules[k] += top_rules[k - 1]

for k in range(0, top_k):
    hits = sum(1 for rule in top_rules[k] if rule in valid_rules)
    # Each ratio is defined as 0.0 when its denominator is zero (no recommendations,
    # no ground-truth rule in the data, or no hit at this cut-off) instead of
    # raising ZeroDivisionError.
    precision = hits / len(top_rules[k]) if top_rules[k] else 0.0
    recall = hits / len(valid_rules_in_result) if valid_rules_in_result else 0.0
    if precision + recall > 0:
        f_measure = 2 * precision * recall / (precision + recall)
    else:
        f_measure = 0.0
    print("Top {:2}: Precision = {:0.3f}, Recall = {:0.3f}, F-Measure = {:0.3f}".format(k + 1, precision, recall, f_measure))

164 out of 1660 rules exist in the data
Top  1: Precision = 0.830, Recall = 0.268, F-Measure = 0.406
Top  2: Precision = 0.704, Recall = 0.421, F-Measure = 0.527
Top  3: Precision = 0.621, Recall = 0.530, F-Measure = 0.572
Top  4: Precision = 0.527, Recall = 0.585, F-Measure = 0.555
Top  5: Precision = 0.462, Recall = 0.628, F-Measure = 0.532
Top  6: Precision = 0.420, Recall = 0.677, F-Measure = 0.519
Top  7: Precision = 0.388, Recall = 0.720, F-Measure = 0.504
Top  8: Precision = 0.355, Recall = 0.744, F-Measure = 0.480
Top  9: Precision = 0.326, Recall = 0.762, F-Measure = 0.457
Top 10: Precision = 0.306, Recall = 0.787, F-Measure = 0.440
