In [1]:
import os

import numpy as np
import pandas as pd
import pymongo

In [32]:
def parse_repo_commit(raw_str):
    """Parse a space-separated list of ``repo;start;end;file`` records.

    Args:
        raw_str: String whose space-delimited tokens each hold exactly four
            ``;``-separated fields. Empty tokens (produced by leading,
            trailing or repeated spaces) are ignored.

    Returns:
        A list of ``(repo, start, end, file)`` string tuples, in input order.

    Raises:
        ValueError: If a non-empty token does not split into exactly four
            fields.
    """
    def _fields(token):
        # Unpacking (rather than tuple(...)) keeps the strict four-field check.
        repo, start, end, file = token.split(";")
        return repo, start, end, file

    return [_fields(token) for token in raw_str.split(" ") if token]
# Recommendation output (one row per fromGroupArtifact/toGroupArtifact rule).
rules = pd.read_csv("recommend-output.csv")

# Candidate commits per rule. Missing cells become "" so that
# parse_repo_commit always receives a str and yields [] for empty cells.
commits = pd.read_csv("test-recommend-output-wocDepSeq3-all-commits.csv")
commits = commits.fillna("")
commits["possibleCommits"] = commits["possibleCommits"].apply(parse_repo_commit)

# Manually confirmed migrations, indexed two ways: by commit pair and by rule.
confirmed = pd.read_excel("manual/confirmed-migrations.xlsx")
confirmed_commit_pairs = {
    (start_sha, end_sha)
    for start_sha, end_sha in zip(confirmed["startCommit"], confirmed["endCommit"])
}
confirmed_rules = {
    (src_lib, dst_lib)
    for src_lib, dst_lib in zip(confirmed["fromLib"], confirmed["toLib"])
}

In [15]:
# Connect to the `migration_helper` MongoDB database.
# The full connection URI (user, password, host, port, authSource) is read from
# the MIGRATION_HELPER_MONGO_URI environment variable so that no credential is
# stored in the notebook. Expected form:
#   mongodb://<user>:<password>@<host>:<port>/migration_helper?authSource=migration_helper
# A missing variable raises KeyError here, before any connection is attempted.
# NOTE(review): the password that used to be embedded in this cell should be
# treated as leaked and rotated.
db = pymongo.MongoClient(os.environ["MIGRATION_HELPER_MONGO_URI"]).migration_helper

In [33]:
# Gather every start/end commit SHA referenced by the candidate commit pairs,
# then fetch the matching wocCommit documents and index them by their _id.
commit_shas = {
    sha
    for possible_commits in commits["possibleCommits"]
    for _repo, start, end, _file in possible_commits
    for sha in (start, end)
}
commit_metadata = {
    doc["_id"]: doc
    for doc in db.wocCommit.find({"_id": {"$in": list(commit_shas)}})
}
# Equal counts mean every referenced SHA was found in wocCommit.
len(commit_shas), len(commit_metadata)

(5287, 5287)

In [34]:
# Map (repoName, fileName, commit) -> dependency changes for every wocDepSeq3
# sequence entry whose commit is one of the SHAs collected in commit_shas.
# The whole collection is scanned; entries for other commits are skipped.
commit_depchg = {
    (dep_seq["repoName"], dep_seq["fileName"], entry["commit"]): entry["changes"]
    for dep_seq in db.wocDepSeq3.find()
    for entry in dep_seq["seq"]
    if entry["commit"] in commit_shas
}
len(commit_depchg)

24870

In [44]:
# (fromGroupArtifact, toGroupArtifact) -> ruleCount lookup; a later row with the
# same pair overwrites an earlier one.
# NOTE(review): `rules` is assigned in two cells of this notebook (from
# recommend-output.csv and from test-recommend-output-wocDepSeq3-all.csv);
# which one is live here depends on execution order - confirm.
rule2count = dict(
    zip(
        zip(rules["fromGroupArtifact"], rules["toGroupArtifact"]),
        rules["ruleCount"],
    )
)

# Library name -> full lioProject document.
lib2info = {}
for project in db.lioProject.find():
    lib2info[project["name"]] = project

In [49]:
# Build the table of recommended (fromLib, toLib) pairs that are not already in
# ground-truth.csv, taking at most the first 20 rows of each fromGroupArtifact
# group (presumably the CSV is ordered by recommendation rank - confirm).
results = pd.read_csv("recommend-output.csv")
ground_truth = pd.read_csv("ground-truth.csv")
existing_rules = set(zip(ground_truth["fromLib"], ground_truth["toLib"]))

rule_records = []
for from_lib, rows in results.groupby(by="fromGroupArtifact"):
    for to_lib in rows["toGroupArtifact"].iloc[:20]:
        pair = (from_lib, to_lib)
        if pair in existing_rules:
            # already part of the ground truth, nothing new to label
            continue
        from_info, to_info = lib2info[from_lib], lib2info[to_lib]
        rule_records.append({
            "fromLib": from_lib,
            "toLib": to_lib,
            # pairs without a known count default to 0
            "ruleCount": rule2count.get(pair, 0),
            "isPossible": False,
            "isConfirmed": False,
            "fromLibHomepageURL": from_info["homepageUrl"],
            "toLibHomepageURL": to_info["homepageUrl"],
            "fromLibDescription": from_info["description"],
            "toLibDescription": to_info["description"],
            "fromLibRepositoryURL": from_info["repositoryUrl"],
            "toLibRepositoryURL": to_info["repositoryUrl"],
            "fromLibRepositoryDescription": from_info["repositoryDescription"],
            "toLibRepositoryDescription": to_info["repositoryDescription"],
        })
new_rules = pd.DataFrame(rule_records)
len(new_rules)

4250

In [50]:
# Write the new-rule table to new-rules.csv without the row-index column.
# new_rules is converted to a DataFrame at the end of the cell that builds it,
# so the pd.DataFrame(...) wrapper here is redundant but harmless.
pd.DataFrame(new_rules).to_csv("new-rules.csv", index=False)

In [38]:
# One row per candidate commit pair whose rule is in new_rules and whose
# (startCommit, endCommit) pair has not been manually confirmed yet, enriched
# with the dependency changes and commit metadata fetched from MongoDB.
candidates = set(zip(new_rules["fromLib"], new_rules["toLib"]))
migration_records = []
for from_lib, to_lib, possible_commits in zip(commits["fromLib"], commits["toLib"], commits["possibleCommits"]):
    if (from_lib, to_lib) not in candidates:
        # the rule check does not depend on the commit, so test it once per row
        continue
    for repo, start, end, file in possible_commits:
        if (start, end) in confirmed_commit_pairs:
            continue
        start_changes = "\n".join(commit_depchg[(repo, file, start)])
        end_changes = "\n".join(commit_depchg[(repo, file, end)])
        start_meta = commit_metadata[start]
        end_meta = commit_metadata[end]
        migration_records.append({
            "fromLib": from_lib,
            "toLib": to_lib,
            "repoName": repo,
            "fileName": file,
            "startCommit": start,
            "endCommit": end,
            "startCommitChanges": start_changes,
            "endCommitChanges": end_changes,
            "startCommitMessage": start_meta["message"],
            "endCommitMessage": end_meta["message"],
            "startCommitTime": start_meta["timestamp"],
            "endCommitTime": end_meta["timestamp"],
        })
data = pd.DataFrame(migration_records).sort_values(by=["startCommit", "fromLib"])
len(data)

2933

In [39]:
# Write the sorted possible-migration rows held in `data` to an Excel workbook,
# omitting the row-index column.
data.to_excel("possible-migrations-from-lib-2014-second-round.xlsx", index=False)

In [40]:
# Inputs for rebuilding ground-truth.csv.
# NOTE(review): this cell rebinds `ground_truth`, `rules` and `confirmed_rules`,
# names that other cells of this notebook also assign from different files, so
# results elsewhere depend on the order in which cells are executed.
ground_truth = pd.read_csv("possible-ground-truth-2014.csv")
rules = pd.read_csv("test-recommend-output-wocDepSeq3-all.csv")
confirmed_migrations = pd.read_excel("manual/confirmed-migrations.xlsx")
confirmed_rules = {
    (src_lib, dst_lib)
    for src_lib, dst_lib in zip(confirmed_migrations["fromLib"], confirmed_migrations["toLib"])
}
len(confirmed_rules)

791

In [42]:
def _ground_truth_record(from_lib, to_lib, is_confirmed):
    """Build one ground-truth row for the rule ``from_lib -> to_lib``.

    Args:
        from_lib: Name of the source library; must be a key of ``lib2info``.
        to_lib: Name of the target library; must be a key of ``lib2info``.
        is_confirmed: Value stored in the ``isConfirmed`` column.

    Returns:
        A dict with the rule, its ``ruleCount`` from ``rule2count`` (0 when the
        pair is unknown), ``isPossible`` fixed to True, and the homepage,
        description and repository fields of both libraries.

    Raises:
        KeyError: If either library is missing from ``lib2info`` or its entry
            lacks one of the metadata fields read below.
    """
    from_info, to_info = lib2info[from_lib], lib2info[to_lib]
    return {
        "fromLib": from_lib,
        "toLib": to_lib,
        "ruleCount": rule2count.get((from_lib, to_lib), 0),
        "isPossible": True,
        "isConfirmed": is_confirmed,
        "fromLibHomepageURL": from_info["homepageUrl"],
        "toLibHomepageURL": to_info["homepageUrl"],
        "fromLibDescription": from_info["description"],
        "toLibDescription": to_info["description"],
        "fromLibRepositoryURL": from_info["repositoryUrl"],
        "toLibRepositoryURL": to_info["repositoryUrl"],
        "fromLibRepositoryDescription": from_info["repositoryDescription"],
        "toLibRepositoryDescription": to_info["repositoryDescription"],
    }


# Rows from the existing possible-ground-truth table keep their own
# dataConfirmed flag; libraries are looked up without a guard, so a library
# missing from lib2info raises KeyError here.
new_ground_truth = [
    _ground_truth_record(row["fromGroupArtifact"], row["toGroupArtifact"], row["dataConfirmed"])
    for _, row in ground_truth.iterrows()
]

# Manually confirmed rules are always marked confirmed; rules whose libraries
# are not in lib2info are skipped.
new_ground_truth.extend(
    _ground_truth_record(from_lib, to_lib, True)
    for from_lib, to_lib in confirmed_rules
    if from_lib in lib2info and to_lib in lib2info
)

# NOTE(review): drop_duplicates() compares whole rows, so a rule present in both
# sources with different isConfirmed values is written twice - confirm intent.
# The output file, ground-truth.csv, is also read by another cell of this notebook.
pd.DataFrame(new_ground_truth).drop_duplicates().sort_values(
    by=["ruleCount", "fromLib", "toLib"], ascending=[False, True, True]).to_csv("ground-truth.csv", index=False)