In [2]:
import os

import pandas as pd

from deicide import jdeo, gri
from deicide.algorithms import akash19, alzahrani20
from deicide.validation2 import *
from deicide.loading import load_dataset, load_full_candidates_df
from deicide.deicide import cluster_dataset
from deicide.dendrogram import dump_indicators
from deicide.dendro import to_dv8_clx, to_dv8_dsm

from _cacheutil import *

In [3]:
import warnings
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [4]:
def validate_subjects(
    subjects: pd.DataFrame, db_dir: str, jd_dir: str, results_dir: str, suffix: str
):
    """Cluster every subject with each algorithm and score the results.

    For each row of ``subjects`` this loads the JD clustering and the dataset,
    runs the deicide clustering (with and without threshold), Akash19 and four
    Alzahrani20 variants, restricts every clustering to the methods they have in
    common with the commit and author clusterings, and scores each one against
    the commit and author clusterings with ``gri.grand_index``. Per-subject
    artefacts (DV8 clustering/DSM JSON and an indicator spreadsheet) and a final
    ``_summary{suffix}.csv`` are written into ``results_dir``.

    Subjects are left out of the summary when they are missing from the JD data
    or when more than two JD methods could not be matched. Subjects with fewer
    than two comparable methods stay in the summary with blank score cells.

    Args:
        subjects: One row per subject, indexed by subject id, with at least the
            columns ``project``, ``filename`` and ``commit_id``.
        db_dir: Directory containing one ``<project>.db`` file per project.
        jd_dir: Directory passed to ``jdeo.load_jdeo_subject``.
        results_dir: Output directory; it is created here and must not exist yet.
        suffix: Text appended to the summary file name (``_summary{suffix}.csv``).

    Raises:
        RuntimeError: If ``results_dir`` already exists.
        ValueError: If the Akash19 or Alzahrani20 clustering lacks one of the
            comparable methods.
    """
    if os.path.exists(results_dir):
        raise RuntimeError("the results dir '{}' already exists".format(results_dir))
    os.makedirs(results_dir)

    # Unable to match more than two methods reported by JD to our data.
    # One cause of this is that JD will list members of nested classes as
    # members of the top level class. Whereas we only consider members
    # directly inside the top-level class.
    max_unmatched_jd_methods = 2

    # Every clustering that gets a commit/author similarity score.
    scored = ("dc1", "dc2", "jd", "ak", "al1", "al2", "al3", "al4")
    output_columns = [
        "unmatched_jd_methods",
        "dc_blocks",
        *(f"{name}_total_blocks" for name in ("dc", "jd", "ak", "al")),
        *(f"{name}_height" for name in ("dc", "jd", "ak", "al")),
        "comparable_methods",
        *(f"{name}_commit_sim" for name in scored),
        *(f"{name}_author_sim" for name in scored),
    ]

    res = subjects.copy()
    for col_name in output_columns:
        # Object dtype so a cell can hold either the blank placeholder or a number.
        res[col_name] = pd.Series("", index=res.index, dtype=object)

    for i, (subject_id, row) in enumerate(subjects.iterrows()): # type: ignore
        subject_id: str = str(subject_id)
        project: str = str(row["project"]) # type: ignore
        filename: str = str(row["filename"]) # type: ignore
        commit_id: int = int(row["commit_id"]) # type: ignore
        print("Working on Subject #{}: ({}, {})".format(i + 1, project, filename))

        # Define helper function for setting output values
        def set_cell(col_name: str, value: str | int | float):
            res.at[subject_id, col_name] = value # type: ignore

        # Load JD clustering
        jd_rows = jdeo.load_jdeo_subject(jd_dir, project, filename)

        # Skip if not in JD data
        if len(jd_rows) == 0:
            print("Missing from JD. Skipping...")
            continue

        # Load the dataset
        if is_dataset_cached(subject_id):
            print("Loading dataset from cache...")
            ds = load_dataset_from_cache(subject_id)
        else:
            print("Dataset not in cache. Loading from db...")
            ds = load_dataset(os.path.join(db_dir, f"{project}.db"), filename, commit_id)
            save_dataset_to_cache(subject_id, ds)

        # Attempt to match JD entity data with ours
        jd_id_map = jdeo.match_entities(jd_rows, ds.targets_df)
        unmatched_jd_methods = sum(
            1
            for jd_row, matched_id in jd_id_map.items()
            if matched_id is None and jd_row.kind == "method"
        )
        set_cell("unmatched_jd_methods", unmatched_jd_methods)

        if unmatched_jd_methods > max_unmatched_jd_methods:
            continue

        # Run our clustering algorithm
        entities_df = load_or_create(f"dc-{subject_id}", lambda: cluster_dataset(ds, use_threshold=True))
        entities_nt_df = load_or_create(f"dc2-{subject_id}", lambda: cluster_dataset(ds, use_threshold=False))
        my_blocks = entities_df[~(entities_df["kind"] == "file")].groupby("block_name").ngroups # type: ignore
        set_cell("dc_blocks", my_blocks)

        # Run the Akash19 clustering algorithm
        print("Running Akash19...")
        ak_clustering = load_or_create_clustering(
            f"{subject_id}-ak",
            lambda: akash19.akash19(akash19.to_godclass(ds), shuffle=False)
        )

        # Run the Alzahrani20 clustering algorithm
        print("Running Alzahrani20...")
        al1_clustering = load_or_create_clustering(
            f"{subject_id}-al1",
            lambda: alzahrani20.alzahrani20(alzahrani20.to_godclass(ds), use_merge=True, use_threshold=True, shuffle=False)
        )
        al2_clustering = load_or_create_clustering(
            f"{subject_id}-al2",
            lambda: alzahrani20.alzahrani20(alzahrani20.to_godclass(ds), use_merge=True, use_threshold=False, shuffle=False)
        )
        al3_clustering = load_or_create_clustering(
            f"{subject_id}-al3",
            lambda: alzahrani20.alzahrani20(alzahrani20.to_godclass(ds), use_merge=False, use_threshold=True, shuffle=False)
        )
        al4_clustering = load_or_create_clustering(
            f"{subject_id}-al4",
            lambda: alzahrani20.alzahrani20(alzahrani20.to_godclass(ds), use_merge=False, use_threshold=False, shuffle=False)
        )

        # Create clustering objects for each kind of clustering
        my1_clustering = to_my_clustering(entities_df).without_root()
        my2_clustering = to_my_clustering(entities_nt_df).without_root()
        jd_clustering = to_jdeo_clustering(jd_id_map).without_root()
        commit_clustering = to_commit_clustering(ds.touches_df)
        author_clustering = to_author_clustering(ds.touches_df)

        # Get methods in common
        method_ids: set[int] = set(int(i) for i in entities_df[entities_df["kind"].isin(["method", "constructor"])].index) # type: ignore
        my1_entity_ids = my1_clustering.entities()
        jd_entity_ids = jd_clustering.entities()
        commit_entity_ids = commit_clustering.entities()
        author_entity_ids = author_clustering.entities()
        common_method_ids = (
            method_ids
            & my1_entity_ids
            & jd_entity_ids
            & commit_entity_ids
            & author_entity_ids
        )

        # Nothing to compare with fewer than two shared methods
        if len(common_method_ids) < 2:
            continue

        if len(common_method_ids - ak_clustering.entities()) != 0:
            raise ValueError("AK is missing some methods")

        if len(common_method_ids - al2_clustering.entities()) != 0:
            raise ValueError("AL is missing some methods")

        # Calculate total number of blocks
        set_cell("dc_total_blocks", len(my2_clustering.expand().normalize().c2e.keys()))
        set_cell("jd_total_blocks", len(jd_clustering.expand().normalize().c2e.keys()))
        set_cell("ak_total_blocks", len(ak_clustering.expand().normalize().c2e.keys()))
        set_cell("al_total_blocks", len(al2_clustering.expand().normalize().c2e.keys()))

        set_cell("dc_height", max(len(c) for c in my2_clustering.normalize().clusters()))
        set_cell("jd_height", max(len(c) for c in jd_clustering.normalize().clusters()))
        set_cell("ak_height", max(len(c) for c in ak_clustering.normalize().clusters()))
        set_cell("al_height", max(len(c) for c in al2_clustering.normalize().clusters()))

        # Ensure all clusterings have the same methods
        my1_subclustering = my1_clustering.subset(common_method_ids).normalize()
        my2_subclustering = my2_clustering.subset(common_method_ids).normalize()
        jd_subclustering = jd_clustering.subset(common_method_ids).normalize()
        ak_subclustering = ak_clustering.subset(common_method_ids).normalize()
        al1_subclustering = al1_clustering.subset(common_method_ids).normalize()
        al2_subclustering = al2_clustering.subset(common_method_ids).normalize()
        al3_subclustering = al3_clustering.subset(common_method_ids).normalize()
        al4_subclustering = al4_clustering.subset(common_method_ids).normalize()
        commit_subclustering = commit_clustering.subset(common_method_ids)
        author_subclustering = author_clustering.subset(common_method_ids)

        # Normalize
        # NOTE(review): each subclustering above is already normalized; this second
        # pass is kept unchanged because it is not visible from here whether
        # normalize() is idempotent.
        my1_subclustering = my1_subclustering.subset(common_method_ids).normalize()
        my2_subclustering = my2_subclustering.normalize()
        jd_subclustering = jd_subclustering.normalize()
        ak_subclustering = ak_subclustering.normalize()
        al1_subclustering = al1_subclustering.normalize()
        al2_subclustering = al2_subclustering.normalize()
        al3_subclustering = al3_subclustering.normalize()
        al4_subclustering = al4_subclustering.normalize()

        # Create matrices from each kind of clustering
        arrays = {
            "dc1": my1_subclustering.expand().ndarray(),
            "dc2": my2_subclustering.expand().ndarray(),
            "jd": jd_subclustering.expand().ndarray(),
            "ak": ak_subclustering.expand().ndarray(),
            "al1": al1_subclustering.expand().ndarray(),
            "al2": al2_subclustering.expand().ndarray(),
            "al3": al3_subclustering.expand().ndarray(),
            "al4": al4_subclustering.expand().ndarray(),
        }
        commit_arr = commit_subclustering.ndarray()
        author_arr = author_subclustering.ndarray()

        # Record how many methods the comparison is based on
        set_cell("comparable_methods", len(common_method_ids))

        # Calculate the commit similarities
        for name, arr in arrays.items():
            set_cell(f"{name}_commit_sim", gri.grand_index(arr, commit_arr))

        # Calculate the author similarities
        for name, arr in arrays.items():
            set_cell(f"{name}_author_sim", gri.grand_index(arr, author_arr))

        # Export our clustering and the dependency matrix in DV8 format
        name_map = {int(entity_id): str(entity["name"]) for entity_id, entity in entities_df.iterrows()} # type: ignore
        with open(os.path.join(results_dir, f"{subject_id}.dv8-clsx.json"), "w") as f:
            f.write(to_dv8_clx(f"{subject_id}-clsx", to_my_clustering_with_files(entities_df), name_map))

        with open(os.path.join(results_dir, f"{subject_id}.dv8-dsm.json"), "w") as f:
            f.write(to_dv8_dsm(f"{subject_id}-dsm", entities_df, ds.deps_df()))

        # Dump
        dump_indicators(
            entities_df.copy(),
            os.path.join(results_dir, f"{subject_id}.xlsx"),
            dc1_clx=my1_subclustering,
            dc2_clx=my2_subclustering,
            jd_clx=jd_subclustering,
            ak_clx=ak_subclustering,
            al1_clx=al1_subclustering,
            al2_clx=al2_subclustering,
            al3_clx=al3_subclustering,
            al4_clx=al4_subclustering,
        )

    # Keep only subjects whose JD methods could (almost) all be matched. Subjects
    # that were skipped before matching still hold the blank placeholder, which
    # coerces to NaN and therefore fails the comparison instead of raising.
    unmatched = pd.to_numeric(res["unmatched_jd_methods"], errors="coerce")
    res = res[unmatched <= max_unmatched_jd_methods].copy()
    res["short_filename"] = [f.split("/")[-1] for f in res["filename"]] # type: ignore
    res.to_csv(os.path.join(results_dir, f"_summary{suffix}.csv"))


In [5]:
# Parent directory for run outputs; each run gets its own "results-<n>" subdirectory.
RESULTS_DIR = "results/"
# Directory holding the per-project "<project>.db" files that validate_subjects loads.
DB_DIR = "data/"
# Directory holding the JD data that is handed to the jdeo loader.
JD_DIR = "data-jd/"

In [6]:
# Discover subjects
cdf = load_or_create(
    "candidates",
    lambda: load_full_candidates_df(DB_DIR, JD_DIR),
)

# A candidate becomes a subject only when it passes every one of these filters.
subject_mask = (
    (cdf["loc_pct"] >= 90)
    & (cdf["loc"] >= 500)
    & (cdf["commits"] >= 10)
    & (cdf["authors"] >= 2)
    & cdf["in_jdeo_data"]
)
subjects_df = cdf[subject_mask].copy()

In [7]:
# Drop some less important columns to produce an easier to read CSV.
# The frame is rebound rather than mutated in place, and labels that are already
# gone are ignored, so re-running this cell does not raise a KeyError.
labels = [c for c in subjects_df.columns if c.endswith("_pct")] + ["in_jdeo_data"]
subjects_df = subjects_df.drop(columns=labels, errors="ignore")

In [8]:
# Preview the selected subjects.
subjects_df

Unnamed: 0_level_0,project,commit_id,filename,loc,members,fan_in,commits,churn,authors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
pinot-11,pinot,8576,pinot-controller/src/main/java/org/apache/pino...,3633,212,110,210,29925,49
activemq-930,activemq,9610,activemq-broker/src/main/java/org/apache/activ...,3170,409,1082,131,15429,26
logging-log4j2-2930,logging-log4j2,10412,log4j-api/src/main/java/org/apache/logging/log...,2848,474,112,148,39592,21
activemq-8854,activemq,9610,activemq-client/src/main/java/org/apache/activ...,2548,275,225,60,9620,14
activemq-571,activemq,9610,activemq-broker/src/main/java/org/apache/activ...,2377,149,43,187,16608,22
...,...,...,...,...,...,...,...,...,...
activemq-55178,activemq,9610,activemq-client/src/main/java/org/apache/activ...,510,90,172,17,1564,5
dubbo-11605,dubbo,6257,dubbo-common/src/main/java/org/apache/dubbo/co...,506,103,88,29,1922,12
activemq-36012,activemq,9610,activemq-broker/src/main/java/org/apache/activ...,503,85,108,23,2248,12
logging-log4j2-8430,logging-log4j2,10412,log4j-api/src/main/java/org/apache/logging/log...,503,51,39,39,2266,15


In [9]:
# Get the result_dir: pick the next unused run number under RESULTS_DIR.
os.makedirs(RESULTS_DIR, exist_ok=True)
# The text between the first and second "-" is the run number. Entries where it
# is not a plain decimal number (e.g. "results-old") are skipped rather than
# letting int() raise a ValueError.
run_labels = [n.split("-")[1] for n in os.listdir(RESULTS_DIR) if n.startswith("results-")]
result_nums = [int(label) for label in run_labels if label.isdecimal()]
result_num = max(result_nums, default=0) + 1
result_dir = os.path.join(RESULTS_DIR, f"results-{result_num}")

In [10]:
# Run the validation; the run number is reused as the summary-file suffix.
validate_subjects(
    subjects=subjects_df,
    db_dir=DB_DIR,
    jd_dir=JD_DIR,
    results_dir=result_dir,
    suffix=f"-{result_num}",
)

Working on Subject #1: (pinot, pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/PinotHelixResourceManager.java)
Loading dataset from cache...
Running Akash19...
Running Alzahrani20...
Working on Subject #2: (activemq, activemq-broker/src/main/java/org/apache/activemq/broker/BrokerService.java)
Loading dataset from cache...
Running Akash19...
Running Alzahrani20...
Working on Subject #3: (logging-log4j2, log4j-api/src/main/java/org/apache/logging/log4j/spi/AbstractLogger.java)
Loading dataset from cache...
Running Akash19...
Running Alzahrani20...
Working on Subject #4: (activemq, activemq-client/src/main/java/org/apache/activemq/ActiveMQConnection.java)
Loading dataset from cache...
Running Akash19...
Running Alzahrani20...
Working on Subject #5: (activemq, activemq-broker/src/main/java/org/apache/activemq/broker/region/Queue.java)
Loading dataset from cache...
Working on Subject #6: (activemq, activemq-client/src/main/java/org/apache/activemq/ActiveMQSession.java)