In [1]:
import os

import pandas as pd
import pymongo

In [6]:
# Recommendation rows and the per-rule commit listing, read from CSV.
result = pd.read_csv("recommend-output-all-repo-top-500.csv")
commits = pd.read_csv("recommend-output-all-repo-top-500-commits.csv")

# Spreadsheet of already-annotated migrations; used further down to drop
# (fromLib, toLib) pairs that are already present in it.
existing = pd.read_excel("manual/extended-migrations-annotated.xlsx")
existing_to_libs = set(existing["toLib"])
existing_rules = {
    (from_lib, to_lib)
    for from_lib, to_lib in zip(existing["fromLib"], existing["toLib"])
}

In [7]:
output = pd.read_csv("recommend-output.csv")

# Displayed pair: (number of confirmed rows,
#                  number of confirmed rows whose confidence is non-zero).
confirmed_mask = output["isConfirmed"]
nonzero_confidence_mask = output["confidence"] != 0
len(output[confirmed_mask]), len(output[confirmed_mask & nonzero_confidence_mask])

(773, 734)

In [10]:
# One library name per line; surrounding whitespace is stripped and
# duplicates collapse because the names are collected into a set.
with open("from-lib-repo-10.txt", "r") as f:
    libs = {line.strip() for line in f}

ground_truth = pd.read_excel("manual/ground-truth.xlsx")

In [11]:
# Keep at most the first 20 rows of every fromLib group (in the order they
# appear in `result`), then stack the groups back into a single frame.
per_lib_top = []
for _, lib_rows in result.groupby(by="fromLib"):
    per_lib_top.append(lib_rows.iloc[0:20])
top_result = pd.concat(per_lib_top, axis=0)

# Displayed pair: (row count, total possibleCommitCount over those rows).
len(top_result), sum(top_result["possibleCommitCount"])

(9990, 15653)

In [12]:
# Step 1: drop rows with zero confidence and rows whose fromLib already
# appears in the ground-truth spreadsheet.
ground_truth_from_libs = set(ground_truth["fromLib"])
top_result_candidates = top_result[
    (top_result["confidence"] != 0)
    & (~top_result["fromLib"].isin(ground_truth_from_libs))
]

# Step 2: drop (fromLib, toLib) pairs that are already in `existing_rules`.
# The mask is built from the two columns directly instead of a row-wise
# `apply` with `x[0]` / `x[1]`: integer keys on a label-indexed Series are
# deprecated positional lookups in pandas. The explicit index and bool dtype
# keep the mask a valid boolean indexer even when no candidate rows remain.
is_new_rule = pd.Series(
    [
        pair not in existing_rules
        for pair in zip(top_result_candidates["fromLib"], top_result_candidates["toLib"])
    ],
    index=top_result_candidates.index,
    dtype=bool,
)
top_result_filtered = top_result_candidates[is_new_rule]

# Displayed pair: (remaining rows, number of distinct fromLib values).
len(top_result_filtered), len(set(top_result_filtered["fromLib"]))

(2090, 393)

In [13]:
# Total possibleCommitCount over the filtered candidate rules.
filtered_commit_counts = top_result_filtered["possibleCommitCount"]
sum(filtered_commit_counts)

3081

In [14]:
# Attach the commit listing to every filtered rule (joined on the rule pair),
# drop the repoCommits column, and write the result to CSV for the next step.
rules_with_commits = top_result_filtered.merge(commits, on=["fromLib", "toLib"])
rules_with_commits = rules_with_commits.drop("repoCommits", axis=1)
rules_with_commits.to_csv("possible-rules-extended.csv", index=False)

In [15]:
def parse_repo_commit(raw_str):
    """Parse a space-separated list of ``repo;start;end;file`` entries.

    Args:
        raw_str: String whose entries are separated by single spaces; each
            entry is expected to hold exactly four ``;``-separated fields.

    Returns:
        A list of ``(repo, start, end, file)`` tuples in input order. Empty
        tokens are ignored. Tokens that do not split into exactly four fields
        are printed and skipped.
    """
    parsed = []
    for item in raw_str.split(" "):
        if not item:
            continue
        fields = item.split(";")
        # Explicit field-count check instead of a bare `except:` around tuple
        # unpacking, so unrelated errors (e.g. KeyboardInterrupt) are no
        # longer swallowed.
        # NOTE(review): the skipped tokens seen in this notebook's output look
        # like pieces of a path containing a space -- presumably such paths are
        # broken up by the split above; confirm against the CSV producer.
        if len(fields) != 4:
            print(item)
            continue
        parsed.append(tuple(fields))
    return parsed
# Reload the exported rules; missing cells become empty strings so that
# parse_repo_commit always receives a string.
rules = pd.read_csv("possible-rules-extended.csv")
rules = rules.fillna("")
rules["possibleCommits"] = rules["possibleCommits"].map(parse_repo_commit)

GoatDroid-
GUI/pom.xml
GoatDroid-
GUI/pom.xml
GoatDroid-
GUI/pom.xml


In [16]:
# The MongoDB connection string is read from the environment so that the
# username and password are not stored in the notebook. Set
# MIGRATION_HELPER_MONGO_URI before running, in the form
#   mongodb://<user>:<password>@<host>:<port>/migration_helper?authSource=migration_helper
# A missing variable raises KeyError here rather than failing later on a query.
mongo_uri = os.environ["MIGRATION_HELPER_MONGO_URI"]
db = pymongo.MongoClient(mongo_uri).migration_helper

In [17]:
# Collect every start/end commit SHA referenced by the candidate rules, then
# fetch the matching documents from wocCommit in one query, keyed by _id.
commit_shas = {
    sha
    for possible_commits in rules["possibleCommits"]
    for _repo, start, end, _file in possible_commits
    for sha in (start, end)
}
commit_metadata = {
    item["_id"]: item
    for item in db.wocCommit.find({"_id": {"$in": list(commit_shas)}})
}

# Displayed pair: (SHAs requested, documents found).
len(commit_shas), len(commit_metadata)

(617, 617)

In [18]:
# Walk every document in wocDepSeq3 and record the dependency changes of each
# sequence entry whose commit is one of the SHAs collected above.
# Key: (repoName, fileName, commit SHA) -> value: that entry's "changes".
commit_depchg = {}
for dep_seq in db.wocDepSeq3.find():
    for entry in dep_seq["seq"]:
        sha = entry["commit"]
        if sha not in commit_shas:
            continue
        key = (dep_seq["repoName"], dep_seq["fileName"], sha)
        commit_depchg[key] = entry["changes"]
len(commit_depchg)

4235

In [19]:
# (fromLib, toLib) -> ruleCount, built column-wise from the rules frame.
rule2count = dict(
    zip(zip(rules["fromLib"], rules["toLib"]), rules["ruleCount"])
)

# Library name -> full lioProject document.
lib2info = {}
for lib in db.lioProject.find():
    lib2info[lib["name"]] = lib

In [20]:
# Build one row per (rule, candidate commit range), combining library
# descriptions, the dependency changes at both endpoint commits, and the
# commit messages/timestamps.
records = []
for from_lib, to_lib, conf, possible_commits in zip(
    rules["fromLib"], rules["toLib"], rules["confidence"], rules["possibleCommits"]
):
    for repo, start, end, file in possible_commits:
        start_key = (repo, file, start)
        end_key = (repo, file, end)
        # Both endpoints are looked up below, so both must be present. The
        # earlier version only checked the start key and would raise KeyError
        # on a range whose end commit has no recorded change.
        if start_key not in commit_depchg or end_key not in commit_depchg:
            continue
        records.append({
            "fromLib": from_lib,
            "toLib": to_lib,
            "fromLibDescription": lib2info[from_lib]["description"],
            "toLibDescription": lib2info[to_lib]["description"],
            "confidence": conf,
            "repoName": repo,
            "fileName": file,
            "startCommit": start,
            "endCommit": end,
            "startCommitChanges": "\n".join(commit_depchg[start_key]),
            "endCommitChanges": "\n".join(commit_depchg[end_key]),
            "startCommitMessage": commit_metadata[start]["message"],
            "endCommitMessage": commit_metadata[end]["message"],
            "startCommitTime": commit_metadata[start]["timestamp"],
            "endCommitTime": commit_metadata[end]["timestamp"],
        })
# `data` is only ever bound to the DataFrame (not first to a list), so
# re-running later cells never sees it in an intermediate form.
data = pd.DataFrame(records)
len(data)

3078

In [21]:
# Order rows by library, then by descending confidence, then by start commit,
# and write the sheet used for manual annotation.
sort_columns = ["fromLib", "confidence", "startCommit"]
sort_ascending = [True, False, True]
data_sorted = data.sort_values(by=sort_columns, ascending=sort_ascending)
data_sorted.to_excel("manual/extended-migrations.xlsx", index=False)