# RQ1: How many possible ground truth rules exist in data?

In [17]:
import os
import random
import re

import pandas as pd
import pymongo

In [18]:
# Handle to the `migration_helper` MongoDB database.
# The connection URI is taken from the MIGRATION_HELPER_MONGO_URI environment
# variable when it is set, so credentials can be supplied without editing the
# notebook. If the variable is absent, the original URI is used.
# SECURITY NOTE(review): the fallback URI embeds a username and password in
# plain text; rotate that credential and drop the fallback once every user of
# this notebook sets the environment variable.
MONGO_URI = os.environ.get(
    "MIGRATION_HELPER_MONGO_URI",
    "mongodb://migration_helper:HeHMgt2020@da1.eecs.utk.edu:27020/migration_helper"
    "?authSource=migration_helper",
)
db = pymongo.MongoClient(MONGO_URI).migration_helper

In [19]:
# Number of documents in the wocDepSeq3 collection (the empty filter matches every document).
db.wocDepSeq3.count_documents({})

404339

In [20]:
# Read every document of the wocDepSeq3 collection into memory and collect
# the distinct repository names they refer to.
seqs = list(db.wocDepSeq3.find())
repo_names = {doc["repoName"] for doc in seqs}
print("#seqs = {}, #repos = {}".format(len(seqs), len(repo_names)))

#seqs = 404339, #repos = 21358


In [21]:
# How many sequences contain at least two, respectively at least three, entries.
at_least_two = sum(1 for entry in seqs if len(entry["seq"]) >= 2)
at_least_three = sum(1 for entry in seqs if len(entry["seq"]) >= 3)
print("#seqs len >= 2: {}".format(at_least_two))
print("#seqs len >= 3: {}".format(at_least_three))

#seqs len >= 2: 147220
#seqs len >= 3: 102199


In [22]:
# Keep only the sequences with at least two entries and recompute the set of
# repositories that still have a sequence.
seqs = list(filter(lambda entry: len(entry["seq"]) >= 2, seqs))
repo_names = {entry["repoName"] for entry in seqs}
print("#seqs = {}, #repos = {}".format(len(seqs), len(repo_names)))

#seqs = 147220, #repos = 14239


In [23]:
# Read the candidate ground-truth rules and index them as
# source library -> list of target libraries.
possible_ground_truth = pd.read_csv("possible-ground-truth-2014.csv")
from2tolibs = {
    source_lib: list(group["toGroupArtifact"])
    for source_lib, group in possible_ground_truth.groupby(by="fromGroupArtifact")
}
len(possible_ground_truth)

3878

In [24]:
# Scan every dependency sequence for candidate migrations.
# A candidate is recorded whenever a change "-<lib>" removes a library that is
# a source in from2tolibs, and a change "+<target>" for one of its target
# libraries occurs at a position between the last addition of the removed
# library (position 0 if it was never added before the removal) and the
# position of the removal, both inclusive.
migration_records = []
for dep_seq in seqs:
    steps = dep_seq["seq"]
    for removal_idx, step in enumerate(steps):
        for change in step["changes"]:
            if not change.startswith("-"):
                continue
            removed_lib = change[1:]
            if removed_lib not in from2tolibs:
                continue
            added_marker = "+" + removed_lib
            # Last position up to the removal at which the removed library was added.
            window_start = max(
                (pos for pos in range(removal_idx + 1)
                 if added_marker in steps[pos]["changes"]),
                default=0,
            )
            for pos in range(window_start, removal_idx + 1):
                for target_lib in from2tolibs[removed_lib]:
                    if "+" + target_lib not in steps[pos]["changes"]:
                        continue
                    migration_records.append({
                        "fromLib": removed_lib,
                        "toLib": target_lib,
                        "repoName": dep_seq["repoName"],
                        "fileName": dep_seq["fileName"],
                        "startCommit": steps[pos]["commit"],
                        "endCommit": steps[removal_idx]["commit"],
                        "startCommitChanges": "\n".join(steps[pos]["changes"]),
                        "endCommitChanges": "\n".join(steps[removal_idx]["changes"]),
                    })
possible_migrations = pd.DataFrame(migration_records)
summary_template = "# total = {}, # rules = {}, # repos = {}, # different start commit = {}"
print(summary_template.format(
    len(possible_migrations),
    len(set(zip(possible_migrations["fromLib"], possible_migrations["toLib"]))),
    len(set(possible_migrations["repoName"])),
    len(set(possible_migrations["startCommit"]))))

# total = 86048, # rules = 1588, # repos = 3325, # different start commit = 14255


In [25]:
# Mark each candidate ground-truth rule with whether the (from, to) pair was
# found among the candidate migrations, then write the table back to the CSV
# it was loaded from.
rules_in_depseq = set(zip(possible_migrations["fromLib"], possible_migrations["toLib"]))
rules = list(zip(possible_ground_truth["fromGroupArtifact"],
                 possible_ground_truth["toGroupArtifact"]))
possible_ground_truth["inDepSeq"] = [rule in rules_in_depseq for rule in rules]
possible_ground_truth.to_csv("possible-ground-truth-2014.csv", index=False)

In [26]:
# Write the candidate migrations to possible-migrations.csv, omitting the DataFrame index.
possible_migrations.to_csv("possible-migrations.csv", index=False)

We then add the commit messages and timestamps to `possible-migrations.csv` using `scripts/extend_possible_migrations.py`, and afterwards do the tagging.

In [None]:
# Sample candidate migrations: for every (fromLib, toLib) rule pick a subset
# of its repositories, and for every picked repository one random row.
#
# The draw is reproducible: a dedicated seeded generator is used, repositories
# are sorted before sampling (iteration order of a set of strings is not
# stable across interpreter runs), and every pandas draw receives a seed
# derived from that generator.
SAMPLE_SEED = 42             # change to obtain a different, still reproducible, sample
MIN_REPOS_PER_RULE = 10      # lower bound on sampled repositories per rule
REPO_SAMPLE_FRACTION = 0.10  # share of a rule's repositories to sample

rng = random.Random(SAMPLE_SEED)
samples = []
for (from_lib, to_lib), rows in possible_migrations.groupby(by=["fromLib", "toLib"]):
    repos = sorted(set(rows["repoName"]))
    # At least MIN_REPOS_PER_RULE repositories, but never more than the rule has.
    sample_size = min(len(repos), max(MIN_REPOS_PER_RULE, int(len(repos) * REPO_SAMPLE_FRACTION)))
    sample_repos = set(rng.sample(repos, sample_size))
    for repo_name, rows_by_repo in rows.groupby(by="repoName"):
        if repo_name in sample_repos:
            # One uniformly chosen row per sampled repository.
            row_seed = rng.randrange(2 ** 32)
            samples.append(rows_by_repo.sample(1, random_state=row_seed).iloc[0])
print(len(samples))

In [None]:
# Assemble the sampled rows into a DataFrame and write it to possible-migrations-sampled.csv without the index.
pd.DataFrame(samples).to_csv("possible-migrations-sampled.csv", index=False)

In [51]:
def get_lib_parts(lib_name: str) -> list:
    """Split a library name into lower-cased tokens usable for message matching.

    Tokens are maximal runs of word characters and apostrophes. Tokens shorter
    than three characters and the tokens "com", "org" and "impl" are dropped.
    Order and duplicates are preserved.
    """
    ignored = ("com", "org", "impl")
    tokens = re.findall(r"[\w']+", lib_name.lower())
    return [token for token in tokens if not (len(token) < 3 or token in ignored)]
def seem_like_migration(from_lib: str, to_lib: str, start_msg: str, end_msg: str) -> bool:
    """Heuristically judge from commit messages whether a candidate is a migration.

    All checks are case-insensitive substring tests of keywords and of the
    library-name tokens returned by get_lib_parts.

    Args:
        from_lib: Name of the removed library.
        to_lib: Name of the added library.
        start_msg: Commit message of the start commit.
        end_msg: Commit message of the end commit.

    Returns:
        True if one of the keyword rules below matches, False otherwise.
    """
    start_msg = start_msg.lower()
    end_msg = end_msg.lower()
    from_lib_parts = get_lib_parts(from_lib)
    to_lib_parts = get_lib_parts(to_lib)
    add_keywords = ["use", "adopt", "introduc", "upgrad", "updat", "采用", "升级"]
    remove_keywords = ["remove", "abandon"]
    # A comma was missing after "swap": the adjacent literals were concatenated
    # into "swapunify", so neither "swap" nor "unify" could ever match.
    migration_keywords = ["migrat", "switch", "replac", "instead", "move", "swap",
                          "unify", "convert", "chang", "迁移", "替换", "修改"]

    def mentions(message: str, words: list) -> bool:
        # True if any of the words occurs as a substring of the message.
        return any(word in message for word in words)

    if start_msg == end_msg:
        # Identical messages: the target library is mentioned together with the
        # source library, a migration keyword or an add keyword ...
        if mentions(start_msg, to_lib_parts) and (
                mentions(start_msg, from_lib_parts)
                or mentions(start_msg, migration_keywords)
                or mentions(start_msg, add_keywords)):
            return True
        # ... or the source library is mentioned together with a migration keyword.
        return mentions(start_msg, from_lib_parts) and mentions(start_msg, migration_keywords)

    # Different messages.
    # NOTE(review): this pairs from_lib with add keywords in start_msg and
    # to_lib with remove keywords in end_msg; the roles look swapped (one would
    # expect "add to_lib" / "remove from_lib"). Kept unchanged; confirm intent.
    #
    # Two further rules were disabled in the original code and are recorded
    # here only as notes:
    #   1. start_msg mentions both libraries and end_msg contains one of the
    #      cleanup keywords "pom", "clean", "remove".
    #   2. end_msg mentions both libraries, and start_msg contains an add
    #      keyword and mentions from_lib.
    return (mentions(start_msg, from_lib_parts) and mentions(start_msg, add_keywords)
            and mentions(end_msg, to_lib_parts) and mentions(end_msg, remove_keywords))
# Run the heuristic on the confirmed examples and report the fraction it
# accepts (printed as "Recall").
confirmed_migrations = pd.read_excel("manual/confirmed-migrations-initial-examples.xlsx")
# NOTE(review): msgs is not read by any code visible in this notebook.
msgs = set(confirmed_migrations["startCommit"]).union(confirmed_migrations["endCommit"])
seem_true = [
    seem_like_migration(source_lib, target_lib, first_msg, last_msg)
    for source_lib, target_lib, first_msg, last_msg in zip(
        confirmed_migrations["fromLib"],
        confirmed_migrations["toLib"],
        confirmed_migrations["startCommitMessage"],
        confirmed_migrations["endCommitMessage"],
    )
]
accepted_count = sum(1 for flag in seem_true if flag is True)
print("Recall: {}".format(accepted_count / len(seem_true)))
confirmed_migrations["seemTrue"] = seem_true

Recall: 0.7475247524752475


In [52]:
# Reload the candidate migrations from possible-migrations.csv (missing values
# become empty strings), apply the heuristic to every row, store the verdict in
# the seemTrue column, save the accepted rows and show how many there are.
possible_migrations = pd.read_csv("possible-migrations.csv").fillna("")
seem_true = [
    seem_like_migration(source_lib, target_lib, first_msg, last_msg)
    for source_lib, target_lib, first_msg, last_msg in zip(
        possible_migrations["fromLib"],
        possible_migrations["toLib"],
        possible_migrations["startCommitMessage"],
        possible_migrations["endCommitMessage"],
    )
]
possible_migrations["seemTrue"] = seem_true
accepted = possible_migrations[possible_migrations["seemTrue"].eq(True)]
accepted.to_csv("possible-migrations-filtered.csv", index=False)
len(accepted)

5200

In [53]:
# Distinct rules, source libraries and target libraries among the rows the
# heuristic accepted.
migrations = possible_migrations.loc[possible_migrations["seemTrue"].eq(True)]
confirmed_rules = {(src, dst) for src, dst in zip(migrations["fromLib"], migrations["toLib"])}
confirmed_from_libs = {src for src, dst in confirmed_rules}
confirmed_to_libs = {dst for src, dst in confirmed_rules}
(len(confirmed_rules), len(confirmed_from_libs), len(confirmed_to_libs))

(293, 98, 101)

In [54]:
# Number of accepted source libraries that also occur as fromGroupArtifact in
# recommend-output-xyl.csv.
xyl_output = pd.read_csv("recommend-output-xyl.csv")
len(confirmed_from_libs.intersection(xyl_output["fromGroupArtifact"]))

57