In [1]:
import sys
import os

import pandas as pd

sys.path.append("..")
from deicide import jdeo, gri
from deicide.validation2 import *
from deicide.loading import load_dataset, load_full_candidates_df
from deicide.deicide import cluster_dataset
from deicide.dendrogram import dump_indicators

In [2]:
import warnings
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [3]:
def validate_subjects(
    subjects: pd.DataFrame,
    db_dir: str,
    jd_dir: str,
    results_dir: str,
    suffix: str,
    max_unmatched_jd_methods: int = 2,
):
    """Compare our clustering with the JD clustering for every subject.

    For each row of ``subjects`` the JD data is loaded, our clustering
    algorithm is run on the matching dataset, and both clusterings are scored
    against the commit and author clusterings with ``gri.grand_index``. One
    indicator file per processed subject is requested from ``dump_indicators``
    and a summary CSV named ``_summary{suffix}.csv`` is written to
    ``results_dir``.

    Args:
        subjects: One row per subject, indexed by subject id. The columns
            ``project``, ``filename`` and ``commit_id`` are read.
        db_dir: Directory holding one ``<project>.db`` file per project.
        jd_dir: Directory handed to ``jdeo.load_jdeo_subject``.
        results_dir: Output directory. It must not exist yet; it is created.
        suffix: Text inserted into the summary file name.
        max_unmatched_jd_methods: A subject is excluded from the comparison
            and from the summary when more than this many JD methods could not
            be matched to our entities.

    Raises:
        RuntimeError: If ``results_dir`` already exists.
    """
    if os.path.exists(results_dir):
        raise RuntimeError("the results dir '{}' already exists".format(results_dir))
    os.makedirs(results_dir)

    # Output columns, in the order they appear in the summary CSV. They start
    # out as empty strings so untouched cells are written as empty CSV fields.
    result_columns = (
        "my_blocks",
        "unmatched_jd_methods",
        "comparable_methods",
        "comparable_commits",
        "comparable_authors",
        "my_comparable_clusters",
        "jd_comparable_clusters",
        "my_commit_sim",
        "jd_commit_sim",
        "my_author_sim",
        "jd_author_sim",
    )
    res = subjects.copy()
    for col_name in result_columns:
        res[col_name] = ""

    def set_cell(subject_id, col_name, value):
        """Store one output value for the given subject."""
        res.at[subject_id, col_name] = value

    for i, (subject_id, row) in enumerate(subjects.iterrows()):
        project = row["project"]
        filename = row["filename"]
        print("Working on Subject #{}: ({}, {})".format(i + 1, project, filename))

        # Load JD clustering
        jd_rows = jdeo.load_jdeo_subject(jd_dir, project, filename)

        # Skip if not in JD data
        if len(jd_rows) == 0:
            print("Missing from JD. Skipping...")
            continue

        # Run our clustering algorithm
        db_path = os.path.join(db_dir, f"{project}.db")
        ds = load_dataset(db_path, filename, row["commit_id"])
        entities_df = cluster_dataset(ds)
        targets_df = entities_df[entities_df["kind"] != "file"]
        # entities_df.to_csv(os.path.join(results_dir, "{}.csv".format(subject_id)))
        set_cell(subject_id, "my_blocks", targets_df.groupby("block_name").ngroups)

        # Attempt to match JD entity data with ours
        jd_id_map = jdeo.match_entities(jd_rows, ds.targets_df)
        unmatched_jd_methods = sum(
            1
            for jd_row, entity_id in jd_id_map.items()
            if entity_id is None and jd_row.kind == "method"
        )
        set_cell(subject_id, "unmatched_jd_methods", unmatched_jd_methods)

        # Too many unmatched methods: the comparison would not be meaningful.
        if unmatched_jd_methods > max_unmatched_jd_methods:
            continue

        # Create clustering objects for each kind of clustering
        my_clustering = to_my_clustering(entities_df).without_root()
        jd_clustering = to_jdeo_clustering(jd_id_map).without_root()
        commit_clustering = to_commit_clustering(ds.touches_df)
        author_clustering = to_author_clustering(ds.touches_df)

        # Only methods known to all four clusterings can be compared
        method_ids = set(entities_df[entities_df["kind"] == "method"].index)
        common_method_ids = (
            method_ids
            & my_clustering.entities()
            & jd_clustering.entities()
            & commit_clustering.entities()
            & author_clustering.entities()
        )

        # For debug / diagram purposes (enable together with the matching
        # keyword arguments of dump_indicators below)
        # client_clustering = to_client_clustering(ds.clients_df, ds.client_deps_df).subset(common_method_ids)
        # my_clustering_with_files = to_my_clustering_with_files(entities_df).without_root()

        # Restrict every clustering to the comparable methods
        my_subclustering = my_clustering.subset(common_method_ids)
        jd_subclustering = jd_clustering.subset(common_method_ids)
        commit_subclustering = commit_clustering.subset(common_method_ids)
        author_subclustering = author_clustering.subset(common_method_ids)

        # Remove any cluster that 1) contains only a single entity AND
        # 2) is not that entity's only cluster
        my_subclustering = my_subclustering.normalize()
        jd_subclustering = jd_subclustering.normalize()

        # Dump
        dump_indicators(
            entities_df.copy(),
            os.path.join(results_dir, f"{subject_id}.xlsx"),
            my_clx=my_clustering,
            # my_clx_without_singletons=my_clustering.normalize(),
            # my_clx_final=my_subclustering,
            # my_clx_with_files = my_clustering_with_files,
            # jd_clx=jd_clustering,
            # jd_clx_without_singletons=jd_clustering.normalize(),
            # jd_clx_final=jd_subclustering,
            # commit_clx=commit_clustering,
            # author_clx=author_clustering,
            # client_clx=client_clustering
        )

        # Create matrices from each kind of clustering
        my_arr = my_subclustering.expand().ndarray()
        jd_arr = jd_subclustering.expand().ndarray()
        commit_arr = commit_subclustering.ndarray()
        author_arr = author_subclustering.ndarray()

        # Record the sizes of these matrices
        set_cell(subject_id, "comparable_methods", len(common_method_ids))
        set_cell(subject_id, "comparable_commits", commit_arr.shape[0])
        set_cell(subject_id, "comparable_authors", author_arr.shape[0])
        set_cell(subject_id, "my_comparable_clusters", my_arr.shape[0])
        set_cell(subject_id, "jd_comparable_clusters", jd_arr.shape[0])

        # Calculate the similarities
        set_cell(subject_id, "my_commit_sim", gri.grand_index(my_arr, commit_arr))
        set_cell(subject_id, "jd_commit_sim", gri.grand_index(jd_arr, commit_arr))
        set_cell(subject_id, "my_author_sim", gri.grand_index(my_arr, author_arr))
        set_cell(subject_id, "jd_author_sim", gri.grand_index(jd_arr, author_arr))

    print(sorted(set(res["project"])))

    # Drop subjects for which too many methods reported by JD could not be
    # matched to our data. One cause of this is that JD will list members of
    # nested classes as members of the top-level class, whereas we only
    # consider members directly inside the top-level class.
    #
    # Subjects skipped as "Missing from JD" still hold the "" placeholder;
    # coercing it to NaN makes the comparison well-defined (NaN compares False),
    # so those rows are dropped instead of raising a str-vs-int TypeError.
    unmatched = pd.to_numeric(res["unmatched_jd_methods"], errors="coerce")
    # .copy() so the column added below is written to an independent frame.
    res = res[unmatched <= max_unmatched_jd_methods].copy()
    res["short_filename"] = [f.split("/")[-1] for f in res["filename"]]
    res.to_csv(os.path.join(results_dir, f"_summary{suffix}.csv"))


In [4]:
# Parent directory under which each run creates its own numbered `results-<N>` folder.
RESULTS_DIR = "results/"
# Directory containing one `<project>.db` file per project.
DB_DIR = "data-db/"
# Directory with the JD data that our clustering is compared against.
JD_DIR = "data-jd/"

In [5]:
# Discover subjects: load every candidate, then keep only those that pass all
# of the selection thresholds and are present in the JD data.
cdf = load_full_candidates_df(DB_DIR, JD_DIR)
is_subject = (
    (cdf["loc_pct"] >= 90)
    & (cdf["loc"] >= 500)
    & (cdf["commits"] >= 10)
    & (cdf["authors"] >= 2)
    & cdf["in_jdeo_data"]
)
subjects_df = cdf[is_subject].copy()

Finding candidates in activemq.db...
Finding candidates in calcite.db...
Finding candidates in camel.db...
Finding candidates in dolphinscheduler.db...
Finding candidates in drill.db...
Finding candidates in dubbo.db...
Finding candidates in hudi.db...
Finding candidates in ignite.db...
Finding candidates in iotdb.db...
Finding candidates in kafka.db...
Finding candidates in linkis.db...
Finding candidates in logging-log4j2.db...
Finding candidates in nifi.db...
Finding candidates in pinot.db...
Finding candidates in pulsar.db...
Finding candidates in shardingsphere.db...
Finding candidates in shenyu.db...


In [6]:
# Drop some less important columns to produce an easier to read CSV.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are gone is a no-op rather
# than a KeyError.
labels = [c for c in subjects_df.columns if c.endswith("_pct")] + ["in_jdeo_data"]
subjects_df = subjects_df.drop(columns=labels, errors="ignore")

In [7]:
# Preview the selected subjects (the rich display elides the middle rows).
subjects_df

Unnamed: 0_level_0,project,commit_id,filename,loc,members,fan_in,commits,churn,authors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
pinot-947,pinot,2379,pinot-controller/src/main/java/org/apache/pino...,3633,212,110,122,11630,36
logging-log4j2-3381,logging-log4j2,960,log4j-api/src/main/java/org/apache/logging/log...,2848,474,112,15,2478,6
iotdb-2037,iotdb,4105,tsfile/src/main/java/org/apache/iotdb/tsfile/r...,2148,103,96,77,10973,23
shenyu-1385,shenyu,1853,shenyu-common/src/main/java/org/apache/shenyu/...,1745,64,25,27,7749,8
dubbo-198,dubbo,1499,dubbo-common/src/main/java/org/apache/dubbo/co...,1686,231,884,39,17144,13
...,...,...,...,...,...,...,...,...,...
pinot-11553,pinot,2379,pinot-controller/src/main/java/org/apache/pino...,523,38,4,10,2417,6
iotdb-16992,iotdb,4105,schema-engine-tag/src/main/java/org/apache/iot...,517,57,1,23,4003,6
pinot-3741,pinot,2379,pinot-spi/src/main/java/org/apache/pinot/spi/d...,513,58,618,24,1284,12
pinot-2448,pinot,2379,pinot-server/src/main/java/org/apache/pinot/se...,513,38,1,38,1875,15


In [8]:
# Get the result_dir: one past the highest existing `results-<N>` entry.
os.makedirs(RESULTS_DIR, exist_ok=True)
# Entries whose suffix is not a plain number (e.g. "results-old", "results-1.zip")
# are ignored instead of crashing int().
result_nums = [
    int(parts[1])
    for parts in (n.split("-") for n in os.listdir(RESULTS_DIR) if n.startswith("results-"))
    if parts[1].isdecimal()
]
result_num = max(result_nums, default=0) + 1
result_dir = os.path.join(RESULTS_DIR, f"results-{result_num}")

In [9]:
# Run the validation for every selected subject. `result_dir` must not exist yet
# (validate_subjects raises RuntimeError otherwise); the summary CSV is written
# there as `_summary-<result_num>.csv`.
validate_subjects(subjects_df, DB_DIR, JD_DIR, result_dir, f"-{result_num}")

Working on Subject #1: (pinot, pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java)
[R.W0]             (20:00:21) Starting... (299 nodes)	Bisected with a cut weight of 1006090.0 in 10.2063 secs.
[R.W0.A]           (20:00:31) Starting... (198 nodes)	Bisected with a cut weight of 916935.0 in 6.1858 secs.
[R.W0.A.A]         (20:00:37) Starting... (84 nodes)	Bisected with a cut weight of 341525.0 in 0.4343 secs.
[R.W0.A.A.A]       (20:00:38) Starting... (30 nodes)	Bisected with a cut weight of 123794.0 in 0.1931 secs.
[R.W0.A.A.A.A]     (20:00:38) Starting... (10 nodes)	Bisected with a cut weight of 41064.0 in 0.1756 secs.
[R.W0.A.A.A.A.A]   (20:00:38) Starting... (7 nodes)	Aborted. Weight under threshold.
[R.W0.A.A.A.A.B]   (20:00:38) Starting... (3 nodes)	Aborted. Weight under threshold.
[R.W0.A.A.A.B]     (20:00:38) Starting... (20 nodes)	Bisected with a cut weight of 72914.0 in 0.1716 secs.
[R.W0.A.A.A.B.A]   (20:00:38) Starting... (9 no

KeyError: 31758