In [1]:
import sys
import time

import numpy as np
from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import subjects, ilp
from filesplitter.graph import group_by_scc, group_by_wcc, group_edges_by

In [2]:
# Load the subject under study together with its entity and dependency tables.
DS = subjects.load_subject(subjects.ANDROID_BASE_TEXT_VIEW)
entities_df = DS.entities_df()
deps_df = DS.deps_df()
# NOTE(review): accessed without a call, unlike the two tables above - confirm this is a property.
target_deps_df = DS.target_deps_df
# Ordered set of (src_id, tgt_id) pairs, one per dependency row.
# Zipping the two columns avoids `iterrows`, which builds a Series per row
# (slow) and may change the dtype of the values it hands back.
edges = oset(zip(deps_df["src_id"], deps_df["tgt_id"]))

In [3]:
# Create a "name_id" for each entity: entities that share the same "name" get the same id
entities_df["name_id"] = entities_df.groupby("name").ngroup()

In [4]:
# Create a "strong_id" for each entity that groups entities according to the strongly connected component of their name_id
# (presumably `group_edges_by` rewrites the entity-level edges in terms of name_id - TODO confirm)
name_edges = group_edges_by(edges, entities_df["name_id"])
entities_df["strong_id"] = group_by_scc(entities_df["name_id"], name_edges)

In [5]:
# Create a "weak_id" for each entity that groups entities according to the weakly connected component of their strong_id
# (presumably `group_edges_by` rewrites the entity-level edges in terms of strong_id - TODO confirm)
strong_edges = group_edges_by(edges, entities_df["strong_id"])
entities_df["weak_id"] = group_by_wcc(entities_df["strong_id"], strong_edges)

In [6]:
def get_entity_weight(id: int) -> int:
    """Return the weight of one entity: 0 when its "kind" is "file", otherwise 1.

    `id` is looked up as a label in the index of `entities_df`.
    """
    is_file = entities_df.loc[id, "kind"] == "file"
    if is_file:
        return 0
    return 1

def get_strong_weight(strong_id: int) -> int:
    """Return the summed entity weight of every entity whose "strong_id" equals `strong_id`."""
    member_ids = entities_df.index[entities_df["strong_id"] == strong_id]
    total = 0
    for entity_id in member_ids:
        total += get_entity_weight(entity_id)
    return total

def get_strong_loc(id: int) -> int:
    """Return the sum of the "loc" column over every entity whose "strong_id" equals `id`."""
    same_component = entities_df["strong_id"] == id
    return entities_df.loc[same_component, "loc"].sum()

In [7]:
# IDEA: Base the stopping criterion on density.
# If `active` is too dense (i.e. `n_edges / n_nodes` is too high), then we stop.
# Alternatively, if `cut_weight / n_edges` or `cut_weight / n_nodes` is too high, then we stop.

In [8]:
# When True, `cluster` hands every edge it was given to the ILP (nodes outside
# `active` are weighted 0); when False, only edges between active nodes are used.
USE_ALL = True
# Passed as the last argument to `ilp.partition`.
# Presumably the allowed imbalance between the two halves - TODO confirm.
EPS = 1 / 2
# `cluster` stops splitting once the summed weight of the active nodes is at or below this value.
MAX_WEIGHT = 16

In [9]:
def cluster(edges: set[tuple[int, int]], active: set[int], name: str) -> dict[int, str] | None:
    active_edges = set((a, b) for a, b in edges if a in active and b in active)
    
    density = len(active_edges) / len(active)
    timestamp = time.strftime("%H:%M:%S", time.localtime())
    prefix = f"[{name}]".ljust(18) + f" ({timestamp})   "
    info = f"{len(active_edges)} edges and {len(active)} nodes = {density:0.4f} density"
    print(prefix + f"Starting... ({info})", end="\t")

    default_res = {i: name for i in active}

    if sum(get_strong_weight(strong_id) for strong_id in active) <= MAX_WEIGHT:
        print("Aborted. Weight under threshold.")
        return default_res

    def w(strong_id: int) -> int:
        if strong_id not in active:
            return 0
        return get_strong_weight(strong_id)

    # There are two ways to use `active`:
    # 1) Use ILP to bisect only the active elements
    #    - This might be faster.
    # 2) Use ILP to bisect all elements, but non-active elements are weighted to 0
    #    - This might produce better results.
    if USE_ALL:
        active_edges = edges

    start = time.perf_counter()
    cut_weight, labels = ilp.partition(list(active_edges), w, lambda i, j: 1, 2, EPS)
    if labels is None:
        print("Aborted. Failed to partition.")
        return default_res
    elapsed = time.perf_counter() - start
    print(f"Bisected with a cut weight of {cut_weight} in {elapsed:0.4f} secs.")

    active_A = active & {i for i, l in labels.items() if l == 0}
    active_B = active & {i for i, l in labels.items() if l == 1}
    res_A = cluster(edges, active_A, name + "A")
    res_B = cluster(edges, active_B, name + "B")
    return res_A | res_B


In [10]:
# Cluster each weakly connected component (wcc) on its own and collect,
# for every strong_id, the name of the block it ended up in.
block_names = {}

n_weak = entities_df["weak_id"].max() + 1
for weak_id in range(n_weak):
    # The strong_ids that belong to the current wcc
    in_wcc = entities_df["weak_id"] == weak_id
    wcc_nodes = set(entities_df.loc[in_wcc, "strong_id"])
    # Only the edges with both endpoints inside the current wcc
    wcc_edges = {(src, tgt) for src, tgt in strong_edges if src in wcc_nodes and tgt in wcc_nodes}
    block_names.update(cluster(wcc_edges, wcc_nodes, name=f"W{weak_id}"))

# Attach the block name to every entity, then number the distinct names.
entities_df["block_name"] = [block_names.get(strong_id) for strong_id in entities_df["strong_id"]]
entities_df["block_id"] = entities_df.groupby("block_name").ngroup()

[W0]               (18:16:26)   Starting... (3814 edges and 1481 nodes = 2.5753 density)	Bisected with a cut weight of 130.0 in 6.1052 secs.
[W0A]              (18:16:32)   Starting... (3622 edges and 1308 nodes = 2.7691 density)	Bisected with a cut weight of 193.0 in 8.0061 secs.
[W0AA]             (18:16:40)   Starting... (3357 edges and 1188 nodes = 2.8258 density)	Bisected with a cut weight of 193.0 in 6.2222 secs.
[W0AAA]            (18:16:46)   Starting... (3009 edges and 1082 nodes = 2.7810 density)	Bisected with a cut weight of 205.0 in 7.4903 secs.
[W0AAAA]           (18:16:54)   Starting... (2790 edges and 1009 nodes = 2.7651 density)	Bisected with a cut weight of 250.0 in 14.2301 secs.
[W0AAAAA]          (18:17:08)   Starting... (2534 edges and 953 nodes = 2.6590 density)	Bisected with a cut weight of 111.0 in 2.8147 secs.
[W0AAAAAA]         (18:17:11)   Starting... (18 edges and 41 nodes = 0.4390 density)	Bisected with a cut weight of 5.0 in 0.5902 secs.
[W0AAAAAAA]        

## Validation

In [11]:
from collections import defaultdict
from random import shuffle

In [18]:
def count_blocks_touched(partition: dict[int, int], user_touches: set[int]) -> int:
    """Return how many distinct blocks the entities in `user_touches` fall into.

    Raises KeyError if a touched entity is not a key of `partition`.
    """
    blocks_seen = set()
    for entity in user_touches:
        blocks_seen.add(partition[entity])
    return len(blocks_seen)

def avg_blocks_touched_by_user(partition: dict[int, int], touches: dict[str, set[int]]) -> float:
    """Return the mean, over the values of `touches`, of the number of distinct blocks touched."""
    per_user_counts = []
    for user_touches in touches.values():
        per_user_counts.append(count_blocks_touched(partition, user_touches))
    return np.average(per_user_counts)

def get_sizes(partition: dict[int, int]) -> list[int]:
    """Return the number of entities in each block, largest block first."""
    block_counts = defaultdict(int)
    for block in partition.values():
        block_counts[block] += 1
    sizes = list(block_counts.values())
    sizes.sort(reverse=True)
    return sizes

def rand_partition(sizes: list[int], entities: set[int]) -> dict[int, int]:
    """Assign the entities to blocks at random, giving block `k` up to `sizes[k]` entities.

    The entities are shuffled once and then handed out in consecutive slices,
    one slice per entry of `sizes`. Entities left over after the last slice
    are not part of the result.
    """
    shuffled = list(entities)
    shuffle(shuffled)
    assignment = {}
    start = 0
    for block, size in enumerate(sizes):
        stop = start + size
        assignment.update(dict.fromkeys(shuffled[start:stop], block))
        start = stop
    return assignment

In [19]:
# Keep every entity that is not a file, and map each one (by index label) to its block id.
targets_df = entities_df[entities_df["kind"] != "file"].copy()
partition = dict(targets_df["block_id"].items())

In [20]:
# Map each author e-mail to the set of entity ids that author touched.
# Zipping the two columns avoids `iterrows`, which builds a Series per row.
touches = defaultdict(set)
touches_df = DS.touches_df
for author_email, entity_id in zip(touches_df["author_email"], touches_df["entity_id"]):
    touches[author_email].add(entity_id)

In [21]:
# Average number of distinct blocks touched per author under the computed partition
avg_blocks_touched_by_user(partition, touches)

3.8533333333333335

In [22]:
# Inputs for the random baseline: the entity ids of the partition and its block sizes.
entities_set = set(partition)
sizes = get_sizes(partition)

In [23]:
# Baseline: the same metric over random partitions that reuse the block sizes of the real one.
# `rand_partition` shuffles with the module-level `random` generator, so seed it
# first; otherwise the printed average changes on every run.
import random

random.seed(0)
trails = [avg_blocks_touched_by_user(rand_partition(sizes, entities_set), touches) for _ in range(5_000)]
print(np.average(trails))

4.822253333333333
