# RQ1: How many possible ground truth rules exist in the data?

In [1]:
import os
import random
from urllib.parse import quote_plus

import pandas as pd
import pymongo

In [2]:
# Connect to the migration_helper MongoDB database.
# The password is read from the environment rather than embedded in the
# notebook, so the secret is not committed or shared together with the file.
mongo_password = os.environ.get("MIGRATION_HELPER_DB_PASSWORD")
if mongo_password is None:
    raise RuntimeError(
        "Set the MIGRATION_HELPER_DB_PASSWORD environment variable before running this notebook"
    )
# quote_plus() percent-escapes characters that are reserved inside a MongoDB URI.
db = pymongo.MongoClient(
    "mongodb://migration_helper:{}@da1.eecs.utk.edu:27020/migration_helper"
    "?authSource=migration_helper".format(quote_plus(mongo_password))
).migration_helper

In [3]:
# Number of documents in the wocDepSeq collection (the empty filter matches every document).
db["wocDepSeq"].count_documents({})

404339

In [4]:
# Pull every wocDepSeq document into memory, then collect the distinct
# repository names those documents refer to.
seqs = list(db.wocDepSeq.find())
repo_names = {doc["repoName"] for doc in seqs}
print("#seqs = {}, #repos = {}".format(len(seqs), len(repo_names)))

#seqs = 404339, #repos = 21358


In [5]:
# Count the sequences whose "seq" list has at least two / at least three entries.
n_len_ge_2 = sum(1 for doc in seqs if len(doc["seq"]) >= 2)
n_len_ge_3 = sum(1 for doc in seqs if len(doc["seq"]) >= 3)
print("#seqs len >= 2: {}".format(n_len_ge_2))
print("#seqs len >= 3: {}".format(n_len_ge_3))

#seqs len >= 2: 131317
#seqs len >= 3: 88355


In [6]:
# Keep only the sequences with at least two entries and recompute the
# set of repositories that still have a sequence after filtering.
seqs = list(filter(lambda doc: len(doc["seq"]) >= 2, seqs))
repo_names = {doc["repoName"] for doc in seqs}
print("#seqs = {}, #repos = {}".format(len(seqs), len(repo_names)))

#seqs = 131317, #repos = 14087


In [7]:
# Load the possible ground truth rules and index them by source library:
# from2tolibs maps each fromGroupArtifact value to the list of
# toGroupArtifact values that appear with it in the CSV.
possible_ground_truth = pd.read_csv("possible-ground-truth-2014.csv")
from2tolibs = {
    source_lib: list(group["toGroupArtifact"])
    for source_lib, group in possible_ground_truth.groupby(by="fromGroupArtifact")
}
len(possible_ground_truth)

1308

In [8]:
# Column order of the resulting table. Passing it explicitly keeps the columns
# present even when no migration is found, so the summary below cannot fail
# with a KeyError on an empty DataFrame.
MIGRATION_COLUMNS = [
    "fromLib",
    "toLib",
    "repoName",
    "fileName",
    "startCommit",
    "endCommit",
    "startCommitChanges",
    "endCommitChanges",
]


def find_possible_migrations(seqs, from2tolibs):
    """Collect candidate library migrations from dependency change sequences.

    For every change "-<lib>" where <lib> is a key of ``from2tolibs``, the
    commit containing it is the end of a window. The window starts at the
    latest commit at or before the end whose changes contain "+<lib>", or at
    the first commit of the sequence when there is none. Every commit in that
    window that contains "+<to_lib>" for one of the target libraries of <lib>
    yields one record.

    Args:
        seqs: iterable of dicts with keys "repoName", "fileName" and "seq";
            "seq" is a list of dicts with keys "commit" and "changes", where
            "changes" is a list of strings prefixed with "+" or "-".
        from2tolibs: dict mapping a source library to a list of target libraries.

    Returns:
        A list of dicts whose keys are MIGRATION_COLUMNS, in discovery order.
    """
    migrations = []
    for seq in seqs:
        commits = seq["seq"]
        for end_id, end_commit in enumerate(commits):
            for chg in end_commit["changes"]:
                if not chg.startswith("-") or chg[1:] not in from2tolibs:
                    continue
                from_lib = chg[1:]
                # Scan backwards so the search stops at the most recent
                # addition instead of walking the whole prefix every time.
                start_id = 0
                for j in range(end_id, -1, -1):
                    if "+" + from_lib in commits[j]["changes"]:
                        start_id = j
                        break
                for k in range(start_id, end_id + 1):
                    for to_lib in from2tolibs[from_lib]:
                        if "+" + to_lib in commits[k]["changes"]:
                            migrations.append({
                                "fromLib": from_lib,
                                "toLib": to_lib,
                                "repoName": seq["repoName"],
                                "fileName": seq["fileName"],
                                "startCommit": commits[k]["commit"],
                                "endCommit": end_commit["commit"],
                                "startCommitChanges": "\n".join(commits[k]["changes"]),
                                "endCommitChanges": "\n".join(end_commit["changes"]),
                            })
    return migrations


possible_migrations = pd.DataFrame(
    find_possible_migrations(seqs, from2tolibs), columns=MIGRATION_COLUMNS
)
print("# total = {}, # rules = {}, # repos = {}, # different start commit = {}".format(
    len(possible_migrations),
    len(set(zip(possible_migrations["fromLib"], possible_migrations["toLib"]))),
    len(set(possible_migrations["repoName"])),
    len(set(possible_migrations["startCommit"]))))

# total = 209356, # rules = 768, # repos = 4239, # different start commit = 32816


In [9]:
# Save every candidate migration to CSV (without the index column).
possible_migrations_path = "possible-migrations.csv"
possible_migrations.to_csv(possible_migrations_path, index=False)

In [10]:
# Sampling parameters. The seed makes the sample reproducible across kernel restarts.
SAMPLE_SEED = 42
MIN_REPOS_PER_RULE = 10      # sample at least this many repositories per rule (or all, if fewer)
REPO_SAMPLE_FRACTION = 0.10  # otherwise sample this fraction of a rule's repositories

# For every (fromLib, toLib) rule, pick a random subset of the repositories in
# which it occurs, then draw one random candidate migration from each picked repository.
sample_rng = random.Random(SAMPLE_SEED)
samples = []
for (from_lib, to_lib), rows in possible_migrations.groupby(by=["fromLib", "toLib"]):
    # Sort before sampling: set iteration order of strings varies between
    # interpreter runs, which would defeat the seed.
    repos = sorted(set(rows["repoName"]))
    sample_size = min(len(repos), max(MIN_REPOS_PER_RULE, int(len(repos) * REPO_SAMPLE_FRACTION)))
    sample_repos = set(sample_rng.sample(repos, sample_size))
    for repo_name, rows_by_repo in rows.groupby(by="repoName"):
        if repo_name in sample_repos:
            # One uniformly random row per repository. The previous form,
            # .sample(min(len, 5)).iloc[0], also kept a single row.
            # NOTE(review): confirm that one row (not up to five) per repository is intended.
            row_seed = sample_rng.randrange(2 ** 32)
            samples.append(rows_by_repo.sample(n=1, random_state=row_seed).iloc[0])
print(len(samples))

5998


In [11]:
# Save the sampled migrations to CSV (without the index column).
sampled_migrations = pd.DataFrame(samples)
sampled_migrations.to_csv("possible-migrations-sampled.csv", index=False)

We add the commit message and timestamp to `possible-migrations-sampled.csv` using `scripts/extend_possible_migrations.py`, and then do the tagging.