In [1]:
import sys
import os
import time
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd
from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import db
from filesplitter.clustering import cluster_dataset
from filesplitter.loading import load_dataset
from filesplitter.validate import validate

In [2]:
# Upper bound on how many candidate files are kept from each project database.
MAX_SUBJECTS_PER_DB = 20

# Directory (relative to the notebook) that holds the per-project SQLite
# databases, one `<project>.db` file each.
DATA_DIR = Path("../data/")

In [3]:
# Collect up to MAX_SUBJECTS_PER_DB candidate files from every project database
# in DATA_DIR and stack them into a single `subjects` frame.
candidates_dfs = []

# Only regular `*.db` files are considered, so stray files in DATA_DIR are never
# handed to sqlite3.connect. Sorting by file name keeps the processing order
# deterministic.
db_paths = sorted(
    (p for p in Path(DATA_DIR).glob("*.db") if p.is_file()),
    key=lambda p: p.name,
)

for db_path in db_paths:
    print(f"Finding subjects in {db_path.name}...")
    con = sqlite3.connect(db_path)
    try:
        # `with con` only commits / rolls back the transaction; it does NOT
        # close the connection, hence the surrounding try/finally.
        with con:
            db.create_temp_tables(con)
            ref_name = db.fetch_lead_ref_name(con)
            # NOTE(review): the meaning of the literals 1000 and 6 is defined by
            # db.fetch_candidate_files -- confirm and consider naming them in
            # the configuration cell.
            candidates = db.fetch_candidate_files(con, ref_name, 1000, 6)
            # Path.stem strips only the trailing ".db", so `project + ".db"`
            # always maps back to this exact file, even if the name has dots.
            candidates.insert(0, "project", db_path.stem)
            candidates_dfs.append(candidates.head(MAX_SUBJECTS_PER_DB))
    finally:
        con.close()

subjects = pd.concat(candidates_dfs, ignore_index=True)

Finding subjects in android-base.db...
Finding subjects in android-settings.db...
Finding subjects in beam.db...
Finding subjects in deltaspike.db...
Finding subjects in dubbo.db...
Finding subjects in flume.db...
Finding subjects in gobblin.db...
Finding subjects in hbase.db...
Finding subjects in hudi.db...
Finding subjects in kafka.db...
Finding subjects in knox.db...
Finding subjects in nifi.db...
Finding subjects in oozie.db...


In [4]:
# Build one identifier per subject: "<project>__<parent dir>_<file name>",
# i.e. the project name plus the last two "/"-separated path components.
# This identifier is later used as the per-subject CSV file name.
subject_names = [
    "{}__{}".format(project, "_".join(filename.split("/")[-2:]))
    for project, filename in zip(subjects["project"], subjects["filename"])
]

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell crash on a second run. Drop any column left by a previous run so the
# cell can be re-executed safely.
if "subject_name" in subjects.columns:
    subjects = subjects.drop(columns="subject_name")
subjects.insert(0, "subject_name", subject_names)

In [5]:
# Report how many subjects were selected across all project databases.
subject_count = len(subjects)
print(f"# of subjects: {subject_count}")

# of subjects: 125


In [6]:
# Make sure the output directory exists before any result file is written;
# an already existing directory is not an error.
Path("../results").mkdir(parents=True, exist_ok=True)

In [7]:
# For every subject: load its dataset, cluster it, persist the clustering as a
# CSV, and record (number of blocks, real ABPA, null ABPA) as returned by
# `validate`. One triple is appended per row of `subjects`, in row order.
metrics = []

for position, subject in enumerate(subjects.itertuples(index=False)):
    print("Working on Subject {}: {}".format(position, subject.subject_name))

    db_path = Path(DATA_DIR, subject.project + ".db")
    ds = load_dataset(db_path, subject.filename)

    entities_df = cluster_dataset(ds)
    # Written before validation, so the clustering is kept on disk even if the
    # validation step fails afterwards.
    entities_df.to_csv("../results/{}.csv".format(subject.subject_name))

    block_count = entities_df.groupby("block_name").ngroups
    real_ABPA, null_ABPA = validate(entities_df, ds.touches_df)
    metrics.append((block_count, real_ABPA, null_ABPA))

# Attach the per-subject metrics as new columns (plain lists, positional).
subjects["n_blocks"] = [m[0] for m in metrics]
subjects["real_ABPA"] = [m[1] for m in metrics]
subjects["null_ABPA"] = [m[2] for m in metrics]

# Normalise both ABPA values by the number of blocks of the same subject.
for prefix in ("real", "null"):
    subjects[prefix + "_ABPA_ratio"] = subjects[prefix + "_ABPA"] / subjects["n_blocks"]

Working on Subject 0: android-base__view_View.java
[W0]               (21:52:46)   Starting... (14296 edges and 3060 nodes = 4.6719 density)	Bisected with a cut weight of 518.0 in 10.2026 secs.
[W0A]              (21:52:57)   Starting... (13645 edges and 2662 nodes = 5.1258 density)	Bisected with a cut weight of 1009.0 in 16.9051 secs.
[W0AA]             (21:53:14)   Starting... (12454 edges and 2367 nodes = 5.2615 density)	Bisected with a cut weight of 1571.0 in 32.1362 secs.
[W0AAA]            (21:53:47)   Starting... (10855 edges and 2125 nodes = 5.1082 density)	Bisected with a cut weight of 2033.0 in 26.6434 secs.
[W0AAAA]           (21:54:14)   Starting... (249 edges and 245 nodes = 1.0163 density)	Bisected with a cut weight of 58.0 in 2.7261 secs.
[W0AAAAA]          (21:54:16)   Starting... (6 edges and 40 nodes = 0.1500 density)	Bisected with a cut weight of 10.0 in 2.5290 secs.
[W0AAAAAA]         (21:54:19)   Starting... (0 edges and 10 nodes = 0.0000 density)	Aborted. Weight u

In [8]:
# Persist the per-subject summary table (including the frame's index column)
# next to the per-subject clustering CSVs.
summary_path = "../results/_summary.csv"
subjects.to_csv(summary_path)