In [1]:
import pandas as pd
from collections import Counter

In [2]:
# Load the three input tables: the candidate rule list and the possible-migrations
# table are CSV files; the confirmed migrations are read from an Excel workbook.
possible_ground_truth, possible_migrations = map(
    pd.read_csv,
    ("possible-ground-truth-2014.csv", "possible-migrations.csv"),
)
confirmed_migrations = pd.read_excel("manual/confirmed-migrations.xlsx")

In [3]:
# Tally how often each (fromLib, toLib) pair occurs in the possible-migrations table.
rule_counter = Counter(zip(possible_migrations["fromLib"], possible_migrations["toLib"]))


def _rule_pair(row):
    """Return the (fromGroupArtifact, toGroupArtifact) tuple that identifies a row's rule."""
    return (row["fromGroupArtifact"], row["toGroupArtifact"])


# inDepSeq: occurrence count of the rule among the possible migrations
# (a Counter lookup yields 0 for a pair that never occurs).
possible_ground_truth["inDepSeq"] = possible_ground_truth.apply(
    lambda row: rule_counter[_rule_pair(row)],
    axis=1,
)

# dataConfirmed: True when the rule is one of the pairs in the confirmed-migrations table.
confirmed_rules = set(zip(confirmed_migrations["fromLib"], confirmed_migrations["toLib"]))
possible_ground_truth["dataConfirmed"] = possible_ground_truth.apply(
    lambda row: _rule_pair(row) in confirmed_rules,
    axis=1,
)

# Boolean row masks shared by the summary lines below.
in_dep_seq_mask = possible_ground_truth["inDepSeq"] > 0
confirmed_mask = possible_ground_truth["dataConfirmed"]
rare_confirmed_mask = (possible_ground_truth["inDepSeq"] < 8) & confirmed_mask

# Rule-level counts: all rows, rows seen at least once, rows confirmed.
print("# inferred rules = {}, # rules in dep seq = {}, # rules confirmed = {}".format(
    len(possible_ground_truth),
    len(possible_ground_truth[in_dep_seq_mask]),
    len(possible_ground_truth[confirmed_mask]),
))

# Library-level counts: distinct fromGroupArtifact values under each mask.
from_libs_in_dep_seq = set(possible_ground_truth[in_dep_seq_mask]["fromGroupArtifact"])
from_libs_confirmed = set(possible_ground_truth[confirmed_mask]["fromGroupArtifact"])
print("# from libs in depseq = {}, # from libs confirmed = {}".format(
    len(from_libs_in_dep_seq),
    len(from_libs_confirmed),
))

# Confirmed rules whose occurrence count is below 8.
print("# confirmed rules with in dep seq < 8 = {}".format(
    len(possible_ground_truth[rare_confirmed_mask])
))

# inferred rules = 3878, # rules in dep seq = 1588, # rules confirmed = 289
# from libs in depseq = 233, # from libs confirmed = 94
# confirmed rules with in dep seq < 8 = 23


In [4]:
# Write the frame, including the inDepSeq and dataConfirmed columns added above, back
# to the same CSV path it was loaded from (this overwrites the input file).
# index=False keeps the row index out of the file.
possible_ground_truth.to_csv("possible-ground-truth-2014.csv", index=False)

In [5]:
xyl_output = pd.read_csv("recommend-output-xyl.csv")

# Drop every recommendation row of a fromLib when none of its (fromLib, toLib)
# pairs is a key of rule_counter.
# NOTE(review): rule_counter is built from possible_migrations, not from
# possible_ground_truth -- confirm this is the intended "ground truth" check.
from_lib_to_remove = set()
for from_lib, df in xyl_output.groupby(by="fromGroupArtifact"):
    # Iterate the toGroupArtifact column directly: iterrows() would construct a
    # full Series for every row just to read this single value.
    if not any((from_lib, to_lib) in rule_counter for to_lib in df["toGroupArtifact"]):
        from_lib_to_remove.add(from_lib)

# Keep only the rows whose fromGroupArtifact was not flagged for removal.
xyl_output = xyl_output[~xyl_output["fromGroupArtifact"].isin(from_lib_to_remove)]
print("{} libraries removed, {} remaining".format(len(from_lib_to_remove), len(set(xyl_output["fromGroupArtifact"]))))

43 libraries removed, 44 remaining
