In [1]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymongo

In [2]:
# Load the ground-truth rule table and derive the two subsets used later:
#   possible_rules  - rows whose "inDepSeq" value is positive
#   confirmed_rules - rows whose "dataConfirmed" flag is set
# (assumes "dataConfirmed" is parsed as a boolean column - TODO confirm)
rules = pd.read_csv("possible-ground-truth-2014.csv")
possible_rules = rules.loc[rules["inDepSeq"] > 0]
confirmed_rules = rules.loc[rules["dataConfirmed"]]
(len(possible_rules), len(confirmed_rules))

(1588, 289)

In [3]:
# Load a precomputed API support file from last version
# TODO: We should replace this with a better one
#
# Result:
#   id2lib      : library id -> "groupId:artifactId"
#   api_support : (from_lib, to_lib) -> counter (int), keyed by the
#                 "groupId:artifactId" names resolved through id2lib
api_support_df = pd.read_csv("../export/APISupport.csv")
# Missing groupId/artifactId values become "" so the string concatenation
# below never hits NaN.
libs = pd.read_csv("../export/GroupArtifact.csv").fillna("")

# Both mappings are built column-wise. DataFrame.iterrows() materialises a
# Series for every row, which is needlessly slow on a multi-million-row file;
# zipping the columns produces the same key/value pairs in the same order
# (so a duplicated key is still resolved as "last row wins").
id2lib = dict(zip(libs["id"], libs["groupId"] + ":" + libs["artifactId"]))
api_support = {
    (id2lib[from_id], id2lib[to_id]): counter
    for from_id, to_id, counter in zip(
        api_support_df["fromId"],
        api_support_df["toId"],
        api_support_df["counter"],
    )
}
len(api_support)

4934676

In [4]:
# Connect to the `migration_helper` MongoDB database.
#
# Credentials must not live in the notebook: they end up in version control
# and in every shared copy. The user name and password are read from the
# environment instead; when the password variable is unset the notebook falls
# back to an interactive prompt (set the variable for unattended runs).
#   MIGRATION_HELPER_MONGO_USER      (default: "migration_helper")
#   MIGRATION_HELPER_MONGO_PASSWORD
# NOTE(review): the password that used to be committed here should be rotated.
mongo_user = os.environ.get("MIGRATION_HELPER_MONGO_USER", "migration_helper")
mongo_password = os.environ.get("MIGRATION_HELPER_MONGO_PASSWORD") or getpass(
    "MongoDB password for {}: ".format(mongo_user)
)
# User name and password are percent-escaped so reserved URI characters
# (":", "@", "/", ...) cannot corrupt the connection string.
mongo_uri = (
    "mongodb://{user}:{password}@da1.eecs.utk.edu:27020/migration_helper"
    "?authSource=migration_helper"
).format(user=quote_plus(mongo_user), password=quote_plus(mongo_password))
db = pymongo.MongoClient(mongo_uri).migration_helper
# Do not keep the secret around in the kernel namespace.
del mongo_password, mongo_uri

In [5]:
# Pull every document of the `wocDepSeq3` collection into memory as a list.
seqs = list(db.wocDepSeq3.find())
len(seqs)

404339

In [9]:
# Simplify every dependency sequence in place, using these heuristics:
# 1. if a removed library does not exist in the current sequence, the change is dropped
# 2. if an added library is already added in the current sequence, the change is dropped
# 3. items left with no dependency change are removed
#
# Each item's "changes" list is rewritten as all additions ("+lib") followed
# by all removals ("-lib"), duplicates collapsed. Both groups are sorted:
# iterating the intermediate sets directly would make the order depend on
# string hash randomisation, so the order of the rules derived from these
# lists (and of the exported rows) would change between kernel restarts.
# Re-running this cell on already simplified data leaves it unchanged.
for seq in seqs:
    curr_libs = set()  # libraries considered present after the items seen so far
    for item in seq["seq"]:
        changes = item["changes"]
        added_libs = {
            chg[1:] for chg in changes
            if chg.startswith("+") and chg[1:] not in curr_libs
        }
        removed_libs = {
            chg[1:] for chg in changes
            if chg.startswith("-") and chg[1:] in curr_libs
        }
        curr_libs = (curr_libs | added_libs) - removed_libs
        item["changes"] = (
            ["+" + lib for lib in sorted(added_libs)]
            + ["-" + lib for lib in sorted(removed_libs)]
        )
    seq["seq"] = [item for item in seq["seq"] if item["changes"]]

In [10]:
# Mine candidate rules (from_lib -> to_lib) from the simplified dependency
# sequences and compute their metrics.
#
# all_rules maps (from_lib, to_lib) -> {
#     "ruleFreq":      number of removals of from_lib for which to_lib was
#                      added between the latest addition of from_lib and that
#                      removal (inclusive),
#     "concurrence":   number of sequences in which both libraries are added,
#     "addedTogether": number of sequence items that add both libraries,
#     "distance":      ";"-joined item distances (removal index - addition
#                      index), one per ruleFreq occurrence,
#     "apiCounter":    value from api_support for the pair, 0 when absent,
# }
possible_ground_truth = pd.read_csv("possible-ground-truth-2014.csv")
# from_lib -> list of to_libs named in the ground-truth file; only removals
# of these from_libs start a rule.
from2tolibs = dict()
for from_lib, rows in possible_ground_truth.groupby(by="fromGroupArtifact"):
    from2tolibs[from_lib] = list(rows["toGroupArtifact"])


def _count_cooccurrences(rule_table, from_libs, libs, metric):
    """Add 1 to ``metric`` of every rule whose two libraries are both in ``libs``.

    rule_table: dict (from_lib, to_lib) -> metrics dict, updated in place.
    from_libs:  set of libraries that occur as the first element of some key
                of ``rule_table``; used only to skip hopeless outer iterations.
    libs:       set of library names that co-occur.
    metric:     key of the counter to increment.

    Every ordered pair is visited exactly once, so each matching rule is
    incremented exactly once per call.
    """
    for lib_a in libs & from_libs:
        for lib_b in libs:
            if lib_a == lib_b:
                continue
            metrics = rule_table.get((lib_a, lib_b))
            if metrics is not None:
                metrics[metric] += 1


# Pass 1: rule frequency and distances.
all_rules = dict()
for seq in seqs:
    items = seq["seq"]
    for i, item in enumerate(items):
        for chg in item["changes"]:
            if not (chg.startswith("-") and chg[1:] in from2tolibs):
                continue
            from_lib = chg[1:]
            end_id = i
            # Window start: the latest item up to i that added from_lib
            # (0 when no such item exists). Scanning backwards lets us stop
            # at the first hit.
            start_id = 0
            add_marker = "+" + from_lib
            for j in range(i, -1, -1):
                if add_marker in items[j]["changes"]:
                    start_id = j
                    break
            # Each to_lib is counted once per removal, at its first addition
            # inside the window.
            occurred_to_libs = set()
            for k in range(start_id, end_id + 1):
                for chg2 in items[k]["changes"]:
                    if not chg2.startswith("+"):
                        continue
                    to_lib = chg2[1:]
                    if to_lib == from_lib or to_lib in occurred_to_libs:
                        continue
                    occurred_to_libs.add(to_lib)
                    rule = (from_lib, to_lib)
                    metrics = all_rules.get(rule)
                    if metrics is None:
                        metrics = all_rules[rule] = {
                            "ruleFreq": 0,
                            "concurrence": 0,
                            "addedTogether": 0,
                            "distance": [],
                            "apiCounter": 0,
                        }
                    metrics["ruleFreq"] += 1
                    metrics["distance"].append(end_id - k)

# Pass 2: co-occurrence counters.
# NOTE: the previous implementation looped over both (i, j) and (j, i) and, on
# each visit, incremented both (i, j) and (j, i), so every co-occurrence was
# counted twice. Each item / sequence now contributes exactly 1 to a rule;
# values are therefore half of what older exports contain.
rule_from_libs = {rule[0] for rule in all_rules}
for seq in seqs:
    lib_added = set()  # every library added anywhere in this sequence
    for item in seq["seq"]:
        added = {chg[1:] for chg in item["changes"] if chg.startswith("+")}
        _count_cooccurrences(all_rules, rule_from_libs, added, "addedTogether")
        lib_added |= added
    _count_cooccurrences(all_rules, rule_from_libs, lib_added, "concurrence")

# Pass 3: attach API support and flatten the distance list for CSV export.
for rule, metrics in all_rules.items():
    if rule in api_support:
        metrics["apiCounter"] = api_support[rule]
    metrics["distance"] = ";".join(str(x) for x in metrics["distance"])
len(all_rules)

653309

In [11]:
# Convert the rule dictionary into a DataFrame indexed by (fromLib, toLib) and
# mark the rules that also appear in the ground-truth subsets.
# Note: this rebinds `all_rules` from a dict to a DataFrame, so the cell that
# builds the dict has to be re-run before this one can be executed again.
all_rules = pd.DataFrame.from_dict(all_rules, orient="index")
all_rules.index.names = ["fromLib", "toLib"]
all_rules["isPossible"] = False
all_rules["isConfirmed"] = False
for flag_column, truth_rows in (
    ("isPossible", possible_rules),
    ("isConfirmed", confirmed_rules),
):
    truth_pairs = zip(truth_rows["fromGroupArtifact"], truth_rows["toGroupArtifact"])
    for pair in truth_pairs:
        # Ground-truth pairs that were never mined have no row to flag.
        if pair in all_rules.index:
            all_rules.loc[pair, flag_column] = True

In [12]:
# Write the rule table, including its index, to a CSV file next to the notebook.
output_csv = "all-rules-with-metrics.csv"
all_rules.to_csv(output_csv)