In [1]:
import sys
import time

from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import subjects, ilp
from filesplitter.graph import group_by_scc, group_by_wcc, group_edges_by

In [2]:
# Load the subject under study and pull out its entity and dependency tables.
DS = subjects.load_subject(subjects.ANDROID_BASE_TEXT_VIEW)
entities_df = DS.entities_df()
deps_df = DS.deps_df()
# NOTE(review): unlike the two accessors above, `target_deps_df` is not called here.
# Confirm that it is a property/attribute and not a method that is missing its `()`.
target_deps_df = DS.target_deps_df
# Ordered set of (src_id, tgt_id) pairs. `itertuples` over just the two id columns avoids
# building one Series per row (as `iterrows` does) and keeps each column's own dtype.
edges = oset(deps_df[["src_id", "tgt_id"]].itertuples(index=False, name=None))

In [3]:
# Give every entity a "name_id": entities that share the same name receive the same group number
entities_df["name_id"] = entities_df.groupby(by="name").ngroup()

In [4]:
# Give every entity a "strong_id" that groups entities according to the strongly connected
# component (SCC) of their name_id
name_ids = entities_df["name_id"]
name_edges = group_edges_by(edges, name_ids)
entities_df["strong_id"] = group_by_scc(name_ids, name_edges)

In [5]:
# Give every entity a "weak_id" that groups entities according to the weakly connected
# component (WCC) of their strong_id
strong_ids = entities_df["strong_id"]
strong_edges = group_edges_by(edges, strong_ids)
entities_df["weak_id"] = group_by_wcc(strong_ids, strong_edges)

In [6]:
def get_entity_weight(id: int) -> int:
    """Return the weight of a single entity.

    Args:
        id: Index label of the entity's row in `entities_df`.

    Returns:
        0 when the entity's "kind" is "file", otherwise 1.
    """
    if entities_df.loc[id, "kind"] == "file":
        return 0
    return 1

def get_strong_weight(strong_id: int) -> int:
    """Return the total weight of all entities that carry the given strong_id.

    Args:
        strong_id: Value to match against the "strong_id" column of `entities_df`.

    Returns:
        The sum of `get_entity_weight` over the matching rows (0 when nothing matches).
    """
    in_component = entities_df["strong_id"] == strong_id
    total = 0
    for entity_id in entities_df[in_component].index:
        total += get_entity_weight(entity_id)
    return total

def get_strong_loc(id: int) -> int:
    """Return the summed "loc" column over all entities whose strong_id equals `id`.

    Args:
        id: Value to match against the "strong_id" column of `entities_df`.

    Returns:
        The pandas sum of the matching "loc" values.
    """
    in_component = entities_df["strong_id"] == id
    return entities_df.loc[in_component, "loc"].sum()

In [7]:
# IDEA: Base the stopping criterion on density.
# If the `n_edges / n_nodes` ratio of `active` is too high, then we stop.
# Alternatively, if `cut_weight / n_edges` or `cut_weight / n_nodes` is too high, then we stop.

In [8]:
# Tuning knobs read by `cluster`.
# USE_ALL: when True the full edge set is handed to the ILP; when False only the edges
#          whose two endpoints are both active are used.
USE_ALL = True
# EPS: forwarded as the last argument of `ilp.partition`
#      (presumably the allowed imbalance between the two sides -- TODO confirm).
EPS = 0.5
# MAX_WEIGHT: a group whose total weight is at or below this value is not split further.
MAX_WEIGHT = 16

In [9]:
def cluster(edges: set[tuple[int, int]], active: set[int], name: str) -> dict[int, str]:
    """Recursively bisect `active` until every part weighs at most `MAX_WEIGHT`.

    Each call asks `ilp.partition` for a two-way split and then recurses on both halves,
    appending "A" / "B" to `name` for the respective side. One progress line is printed
    per call.

    Args:
        edges: (src, tgt) pairs of strong_ids.
        active: The strong_ids that still need to be assigned to a part.
        name: Name of the part currently being split; also used as the log prefix.

    Returns:
        A mapping from every id in `active` to the name of the part it ended up in.
        Recursion stops (and the ids keep the current `name`) when the total weight is at
        most `MAX_WEIGHT`, when the partitioner returns no labels, or when the bisection
        puts every active id on a single side.
    """
    timestamp = time.strftime("%H:%M:%S", time.localtime())
    prefix = f"[{name}]".ljust(18) + f" ({timestamp})   "
    print(prefix + f"Starting on {len(active)} active elements...", end="\t")

    default_res = {i: name for i in active}

    # Look up each active weight once; the threshold check and the ILP weight function
    # below both reuse this table instead of re-querying the dataframe.
    weights = {strong_id: get_strong_weight(strong_id) for strong_id in active}
    if sum(weights.values()) <= MAX_WEIGHT:
        print("Aborted. Weight under threshold.")
        return default_res

    def w(strong_id: int) -> int:
        # Non-active elements weigh nothing.
        return weights.get(strong_id, 0)

    # There are two ways to use `active`:
    # 1) Use ILP to bisect only the active elements
    #    - This might be faster.
    # 2) Use ILP to bisect all elements, but non-active elements are weighted to 0
    #    - This might produce better results.
    if USE_ALL:
        active_edges = edges
    else:
        active_edges = {(a, b) for a, b in edges if a in active and b in active}

    start = time.perf_counter()
    cut_weight, labels = ilp.partition(list(active_edges), w, lambda i, j: 1, 2, EPS)
    if labels is None:
        print("Aborted. Failed to partition.")
        return default_res
    elapsed = time.perf_counter() - start

    active_A = active & {i for i, label in labels.items() if label == 0}
    active_B = active & {i for i, label in labels.items() if label == 1}

    # If one side received every active element (or nothing was labelled at all), the
    # recursive call would face exactly the same problem again and never terminate.
    if active_A == active or active_B == active or not (active_A or active_B):
        print("Aborted. Bisection made no progress.")
        return default_res

    print(f"Bisected with a cut weight of {cut_weight} in {elapsed:0.4f} secs.")

    # Active elements that the partitioner did not label (e.g. they touch none of the
    # edges passed in) stay in the current part rather than vanishing from the result.
    unlabeled = active - active_A - active_B
    res = {i: name for i in unlabeled}
    res |= cluster(edges, active_A, name + "A")
    res |= cluster(edges, active_B, name + "B")
    return res


In [10]:
# Maps each strong_id to the name of the part that `cluster` placed it in
part_names = {}

for weak_id in range(entities_df["weak_id"].max() + 1):
    # The strong_ids inside the current weakly connected component (wcc)
    in_wcc = entities_df["weak_id"] == weak_id
    wcc_nodes = set(entities_df.loc[in_wcc, "strong_id"])
    # Only the edges whose two endpoints both lie inside this component
    wcc_edges = {(src, tgt) for src, tgt in strong_edges if {src, tgt} <= wcc_nodes}
    part_names.update(cluster(wcc_edges, wcc_nodes, name=f"W{weak_id}"))

# Attach the part name (None when a strong_id was never assigned) and a numeric id per name
entities_df["part_name"] = list(map(part_names.get, entities_df["strong_id"]))
entities_df["part_id"] = entities_df.groupby(by="part_name").ngroup()

[W0]               (13:13:35)   Starting on 1231 active elements...	Bisected with a cut weight of 214.0 in 4.0742 secs.
[W0A]              (13:13:40)   Starting on 1055 active elements...	Bisected with a cut weight of 306.0 in 4.6678 secs.
[W0AA]             (13:13:44)   Starting on 915 active elements...	Bisected with a cut weight of 331.0 in 3.4221 secs.
[W0AAA]            (13:13:48)   Starting on 806 active elements...	Bisected with a cut weight of 316.0 in 2.9201 secs.
[W0AAAA]           (13:13:51)   Starting on 95 active elements...	Bisected with a cut weight of 22.0 in 0.5183 secs.
[W0AAAAA]          (13:13:52)   Starting on 18 active elements...	Bisected with a cut weight of 4.0 in 0.4543 secs.
[W0AAAAAA]         (13:13:52)   Starting on 4 active elements...	Aborted. Weight under threshold.
[W0AAAAAB]         (13:13:52)   Starting on 14 active elements...	Aborted. Weight under threshold.
[W0AAAAB]          (13:13:52)   Starting on 77 active elements...	Bisected with a cut weight