In [1]:
import sys
import os
from statistics import median_high

import pandas as pd

from deicide import jdeo, gri
from deicide.algorithms import akash19
from deicide.validation2 import *
from deicide.loading import load_dataset, load_full_candidates_df
from deicide.deicide import cluster_dataset
from deicide.dendrogram import dump_indicators

In [2]:
import warnings
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [3]:
def validate_subjects(
    subjects: pd.DataFrame, db_dir: str, jd_dir: str, results_dir: str, suffix: str
):
    """Cluster every subject and compare the result with the JD and Akash19 clusterings.

    For each row of ``subjects`` this loads the JD rows for the file, runs our
    clustering (``cluster_dataset``) and Akash19, restricts all clusterings to
    the methods they have in common with the commit and author clusterings,
    and records ``gri.grand_index`` similarities. A per-subject ``.xlsx`` is
    written via ``dump_indicators`` and a summary CSV is written at the end.

    Args:
        subjects: One row per subject, indexed by subject id. The columns
            ``project``, ``filename`` and ``commit_id`` are read.
        db_dir: Directory holding one ``<project>.db`` file per project.
        jd_dir: Directory passed to ``jdeo.load_jdeo_subject``.
        results_dir: Output directory. It must not exist yet; it is created.
        suffix: Appended to the summary file name (``_summary{suffix}.csv``).

    Raises:
        RuntimeError: If ``results_dir`` already exists.

    Subjects that are missing from the JD data, or that have more than two
    JD methods that could not be matched to our entities, are skipped and
    left out of the summary CSV.
    """
    # Subjects with more unmatched JD methods than this are not compared.
    max_unmatched_jd_methods = 2

    if os.path.exists(results_dir):
        raise RuntimeError("the results dir '{}' already exists".format(results_dir))
    os.makedirs(results_dir)

    res = subjects.copy()
    result_columns = [
        "my_blocks",
        "unmatched_jd_methods",
        "comparable_methods",
        "comparable_commits",
        "comparable_authors",
        "my_comparable_clusters",
        "jd_comparable_clusters",
        "my_commit_sim",
        "jd_commit_sim",
        "ak_commit_sim",
        "my_author_sim",
        "jd_author_sim",
        "ak_author_sim",
    ]
    for result_column in result_columns:
        # Blank placeholder; explicit object dtype so that the ints and floats
        # written by set_cell below can be stored alongside the blanks.
        res[result_column] = pd.Series("", index=res.index, dtype=object)

    for i, (subject_id, row) in enumerate(subjects.iterrows(), start=1):
        project = row["project"]
        filename = row["filename"]
        print("Working on Subject #{}: ({}, {})".format(i, project, filename))

        # Define helper function for setting output values
        def set_cell(col_name, value):
            res.at[subject_id, col_name] = value

        # Load JD clustering
        jd_rows = jdeo.load_jdeo_subject(jd_dir, project, filename)

        # Skip if not in JD data
        if len(jd_rows) == 0:
            print("Missing from JD. Skipping...")
            continue

        # Run our clustering algorithm
        # os.path.join works whether or not db_dir ends with a path separator.
        ds = load_dataset(os.path.join(db_dir, f"{project}.db"), filename, row["commit_id"])
        entities_df = cluster_dataset(ds)
        targets_df = entities_df[~(entities_df["kind"] == "file")]
        # entities_df.to_csv(os.path.join(results_dir, "{}.csv".format(subject_id)))
        set_cell("my_blocks", targets_df.groupby("block_name").ngroups)

        # Attempt to match JD entity data with ours
        jd_id_map = jdeo.match_entities(jd_rows, ds.targets_df)
        unmatched_jd_methods = sum(
            1
            for jd_row, entity_id in jd_id_map.items()
            if entity_id is None and jd_row.kind == "method"
        )
        set_cell("unmatched_jd_methods", unmatched_jd_methods)

        if unmatched_jd_methods > max_unmatched_jd_methods:
            continue

        # Run the Akash19 clustering algorithm
        print("Running Akash19...")
        ak_clustering = akash19.akash19(akash19.to_godclass(ds), 2.0, 0)
        # NOTE: a disabled alternative ran akash19.many_akash19 over several
        # parameter settings and kept the run with the median (median_high)
        # commit similarity instead of this single run.

        # Create clustering objects for each kind of clustering
        my_clustering = to_my_clustering(entities_df).without_root()
        jd_clustering = to_jdeo_clustering(jd_id_map).without_root()
        commit_clustering = to_commit_clustering(ds.touches_df)
        author_clustering = to_author_clustering(ds.touches_df)

        # Get methods in common
        method_ids = set(entities_df[entities_df["kind"].isin(["method", "constructor"])].index)
        common_method_ids = (
            method_ids
            & my_clustering.entities()
            & jd_clustering.entities()
            & ak_clustering.entities()
            & commit_clustering.entities()
            & author_clustering.entities()
        )

        # For debug / diagram purposes
        # client_clustering = to_client_clustering(ds.clients_df, ds.client_deps_df).subset(common_method_ids)
        # NOTE(review): only referenced by the disabled `my_clx_with_files`
        # argument of dump_indicators below.
        my_clustering_with_files = to_my_clustering_with_files(entities_df).without_root()

        # Restrict every clustering to the methods they all have in common
        my_subclustering = my_clustering.subset(common_method_ids)
        jd_subclustering = jd_clustering.subset(common_method_ids)
        ak_subclustering = ak_clustering.subset(common_method_ids)
        commit_subclustering = commit_clustering.subset(common_method_ids)
        author_subclustering = author_clustering.subset(common_method_ids)

        # Remove any cluster that 1) contains only a single entity AND 2) is not that entity's only cluster
        my_subclustering = my_subclustering.normalize()
        jd_subclustering = jd_subclustering.normalize()
        ak_subclustering = ak_subclustering.normalize()

        # Create matrices from each kind of clustering
        my_arr = my_subclustering.expand().ndarray()
        jd_arr = jd_subclustering.expand().ndarray()
        ak_arr = ak_subclustering.expand().ndarray()
        commit_arr = commit_subclustering.ndarray()
        author_arr = author_subclustering.ndarray()

        # Record the sizes of these matrices
        set_cell("comparable_methods", len(common_method_ids))
        set_cell("comparable_commits", commit_arr.shape[0])
        set_cell("comparable_authors", author_arr.shape[0])
        set_cell("my_comparable_clusters", my_arr.shape[0])
        set_cell("jd_comparable_clusters", jd_arr.shape[0])

        # Calculate the similarities
        set_cell("my_commit_sim", gri.grand_index(my_arr, commit_arr))
        set_cell("jd_commit_sim", gri.grand_index(jd_arr, commit_arr))
        set_cell("ak_commit_sim", gri.grand_index(ak_arr, commit_arr))
        set_cell("my_author_sim", gri.grand_index(my_arr, author_arr))
        set_cell("jd_author_sim", gri.grand_index(jd_arr, author_arr))
        set_cell("ak_author_sim", gri.grand_index(ak_arr, author_arr))

        # Dump
        dump_indicators(
            entities_df.copy(),
            os.path.join(results_dir, f"{subject_id}.xlsx"),
            my_clx=my_clustering,
            my_clx_without_singletons=my_clustering.normalize(),
            my_clx_final=my_subclustering,
            # my_clx_with_files = my_clustering_with_files,
            jd_clx=jd_clustering,
            jd_clx_without_singletons=jd_clustering.normalize(),
            jd_clx_final=jd_subclustering,
            ak_clx=ak_clustering,
            ak_clx_without_singletons=ak_clustering.normalize(),
            ak_clx_final=ak_subclustering,
            # commit_clx=commit_clustering,
            # author_clx=author_clustering,
            # client_clx=client_clustering
        )

    print(sorted(set(res["project"])))

    # Unable to match more than two methods reported by JD to our data.
    # One cause of this is that JD will list members of nested classes as
    # members of the top level class. Whereas we only consider members
    # directly inside the top-level class.
    #
    # Subjects that were missing from JD still hold the blank placeholder here;
    # coercing to numeric turns it into NaN so the comparison drops those rows
    # instead of failing on a str/int comparison.
    unmatched_counts = pd.to_numeric(res["unmatched_jd_methods"], errors="coerce")
    # .copy() so that the new column is added to an independent frame.
    res = res[unmatched_counts <= max_unmatched_jd_methods].copy()
    res["short_filename"] = [f.split("/")[-1] for f in res["filename"]]
    res.to_csv(os.path.join(results_dir, f"_summary{suffix}.csv"))


In [4]:
# Inputs: the per-project `.db` files and the JD data.
DB_DIR = "data/"
JD_DIR = "data-jd/"

# Output root: every run gets its own `results-<n>` sub-directory in here.
RESULTS_DIR = "results/"

In [5]:
# Discover subjects: load every candidate, then keep only the rows that
# satisfy all of the selection criteria below.
cdf = load_full_candidates_df(DB_DIR, JD_DIR)
subjects_df = cdf.loc[
    (cdf["loc_pct"] >= 90)  # NOTE(review): presumably a percentile of `loc` -- confirm
    & (cdf["loc"] >= 500)
    & (cdf["commits"] >= 10)
    & (cdf["authors"] >= 2)
    & cdf["in_jdeo_data"]
].copy()

Finding candidates in activemq.db...
Finding candidates in calcite.db...
Finding candidates in camel.db...
Finding candidates in dolphinscheduler.db...
Finding candidates in drill.db...
Finding candidates in dubbo.db...
Finding candidates in hudi.db...
Finding candidates in ignite.db...
Finding candidates in iotdb.db...
Finding candidates in kafka.db...
Finding candidates in linkis.db...
Finding candidates in logging-log4j2.db...
Finding candidates in nifi.db...
Finding candidates in pinot.db...
Finding candidates in pulsar.db...
Finding candidates in shardingsphere.db...
Finding candidates in shenyu.db...


In [6]:
# Drop some less important columns to produce an easier to read CSV.
# Rebinding (instead of dropping in place) and ignoring labels that are already
# gone keeps this cell safe to run more than once.
labels = [c for c in subjects_df.columns if c.endswith("_pct")] + ["in_jdeo_data"]
subjects_df = subjects_df.drop(columns=labels, errors="ignore")

In [7]:
# Preview the selected subjects (the bare last expression is rendered as the cell's output).
subjects_df

Unnamed: 0_level_0,project,commit_id,filename,loc,members,fan_in,commits,churn,authors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
pinot-11,pinot,8576,pinot-controller/src/main/java/org/apache/pino...,3633,212,110,210,29925,49
activemq-930,activemq,9610,activemq-broker/src/main/java/org/apache/activ...,3170,409,1082,131,15429,26
logging-log4j2-2930,logging-log4j2,10412,log4j-api/src/main/java/org/apache/logging/log...,2848,474,112,148,39592,21
activemq-8854,activemq,9610,activemq-client/src/main/java/org/apache/activ...,2548,275,225,60,9620,14
activemq-571,activemq,9610,activemq-broker/src/main/java/org/apache/activ...,2377,149,43,187,16608,22
...,...,...,...,...,...,...,...,...,...
activemq-55178,activemq,9610,activemq-client/src/main/java/org/apache/activ...,510,90,172,17,1564,5
dubbo-11605,dubbo,6257,dubbo-common/src/main/java/org/apache/dubbo/co...,506,103,88,29,1922,12
activemq-36012,activemq,9610,activemq-broker/src/main/java/org/apache/activ...,503,85,108,23,2248,12
logging-log4j2-8430,logging-log4j2,10412,log4j-api/src/main/java/org/apache/logging/log...,503,51,39,39,2266,15


In [8]:
# Get the result_dir: one past the highest existing `results-<n>` entry.
os.makedirs(RESULTS_DIR, exist_ok=True)
result_nums = [
    int(num_part)
    for num_part in (
        name.split("-")[1]
        for name in os.listdir(RESULTS_DIR)
        if name.startswith("results-")
    )
    # Skip stray entries such as `results-old` or `results-1.zip`, whose
    # second field is not a number and would make int() raise.
    if num_part.isdecimal()
]
result_num = max(result_nums, default=0) + 1
result_dir = os.path.join(RESULTS_DIR, f"results-{result_num}")

In [9]:
# Run the full validation; the summary file is suffixed with the run number.
validate_subjects(
    subjects=subjects_df,
    db_dir=DB_DIR,
    jd_dir=JD_DIR,
    results_dir=result_dir,
    suffix=f"-{result_num}",
)

Working on Subject #1: (pinot, pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java)
[R.W0]             (00:27:52) Starting... (299 nodes)	Bisected with a cut weight of 994923.0 in 8.4846 secs.
[R.W0.A]           (00:28:01) Starting... (196 nodes)	Bisected with a cut weight of 971977.0 in 4.7409 secs.
[R.W0.A.A]         (00:28:06) Starting... (93 nodes)	Bisected with a cut weight of 439340.0 in 0.7237 secs.
[R.W0.A.A.A]       (00:28:06) Starting... (36 nodes)	Bisected with a cut weight of 123794.0 in 0.2313 secs.
[R.W0.A.A.A.A]     (00:28:07) Starting... (10 nodes)	Bisected with a cut weight of 41064.0 in 0.1722 secs.
[R.W0.A.A.A.A.A]   (00:28:07) Starting... (7 nodes)	Aborted. Weight under threshold.
[R.W0.A.A.A.A.B]   (00:28:07) Starting... (3 nodes)	Aborted. Weight under threshold.
[R.W0.A.A.A.B]     (00:28:07) Starting... (26 nodes)	Bisected with a cut weight of 79894.0 in 0.1806 secs.
[R.W0.A.A.A.B.A]   (00:28:07) Starting... (9 node