In [1]:
import sys
import os
import time
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd
from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import db
from filesplitter.clustering import cluster_dataset
from filesplitter.loading import load_dataset
from filesplitter.validate import validate

In [2]:
# Directory that is listed to discover the per-project SQLite databases and
# from which "<project>.db" files are later opened.
DATA_DIR = Path("../data/")
# Upper bound on how many candidate rows are kept from each database.
MAX_SUBJECTS_PER_DB = 2

In [3]:
# Gather up to MAX_SUBJECTS_PER_DB candidate files from every project database
# found in DATA_DIR and stack them into one `subjects` frame.
candidates_dfs = []

# NOTE(review): the first directory entry (in sorted order) is deliberately
# skipped, as in the original code; why that entry is excluded is not visible
# here -- confirm it is still intended.
for db_name in sorted(os.listdir(DATA_DIR))[1:]:
    print(f"Finding subjects in {db_name}...")
    # sqlite3's connection context manager only commits/rolls back the
    # transaction; it does not close the connection. Close it explicitly so a
    # handle is not leaked for every database visited.
    con = sqlite3.connect(Path(DATA_DIR, db_name))
    try:
        with con:
            db.create_temp_tables(con)
            ref_name = db.fetch_lead_ref_name(con)
            # NOTE(review): 1000 and 6 are passed straight through to
            # db.fetch_candidate_files; their meaning is defined there.
            candidates = db.fetch_candidate_files(con, ref_name, 1000, 6)
    finally:
        con.close()
    # Use the file stem (name minus its final suffix) as the project label so
    # that `project + ".db"` reconstructs the file name even when the name
    # itself contains dots.
    candidates.insert(0, "project", Path(db_name).stem)
    candidates_dfs.append(candidates[:MAX_SUBJECTS_PER_DB])

subjects = pd.concat(candidates_dfs, ignore_index=True)

Finding subjects in android-settings.db...
Finding subjects in beam.db...
Finding subjects in deltaspike.db...
Finding subjects in dubbo.db...
Finding subjects in flume.db...
Finding subjects in gobblin.db...
Finding subjects in hbase.db...
Finding subjects in hudi.db...
Finding subjects in kafka.db...
Finding subjects in knox.db...
Finding subjects in nifi.db...
Finding subjects in oozie.db...


In [4]:
# Label each subject as "<project>__<last two path components joined by '_'>".
# The label is later used as the stem of the per-subject results CSV.
subject_names = [
    "{}__{}".format(project, "_".join(filename.split("/")[-2:]))
    for project, filename in zip(subjects["project"], subjects["filename"])
]

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Overwrite in place in that case; otherwise
# insert as the first column, exactly as before.
if "subject_name" in subjects.columns:
    subjects["subject_name"] = subject_names
else:
    subjects.insert(0, "subject_name", subject_names)

In [5]:
# Report how many subject rows were collected.
print("# of subjects:", len(subjects))

# of subjects: 18


In [6]:
# Ensure the output directory exists; a no-op when it is already there.
Path("../results").mkdir(parents=True, exist_ok=True)

In [7]:
# Per-subject accumulators, filled in the same order as the rows of `subjects`.
n_blocks, real_ABPAs, null_ABPAs = [], [], []

# Only these three columns are needed per subject, so walk them in lockstep.
subject_fields = zip(subjects["subject_name"], subjects["project"], subjects["filename"])

for i, (name, project, filename) in enumerate(subject_fields):
    print("Working on Subject {}: {}".format(i, name))

    # Load the subject's dataset from its project database and cluster it.
    ds = load_dataset(Path(DATA_DIR, project + ".db"), filename)
    entities_df = cluster_dataset(ds)

    # Persist the clustering result under the subject's name.
    entities_df.to_csv("../results/{}.csv".format(name))

    # Number of distinct "block_name" groups in the clustering result.
    n_blocks.append(entities_df.groupby("block_name").ngroups)

    # validate() yields a (real, null) pair for this subject.
    real_ABPA, null_ABPA = validate(entities_df, ds.touches_df)
    real_ABPAs.append(real_ABPA)
    null_ABPAs.append(null_ABPA)

# Attach the collected values to the summary frame, then derive per-block ratios.
subjects["n_blocks"] = n_blocks
subjects["real_ABPA"] = real_ABPAs
subjects["null_ABPA"] = null_ABPAs
subjects["real_ABPA_ratio"] = subjects["real_ABPA"].div(subjects["n_blocks"])
subjects["null_ABPA_ratio"] = subjects["null_ABPA"].div(subjects["n_blocks"])

Working on Subject 0: android-settings__settings_Utils.java
[W0]               (20:14:48)   Starting... (272 edges and 237 nodes = 1.1477 density)	Bisected with a cut weight of 4.0 in 0.1442 secs.
[W0A]              (20:14:48)   Starting... (98 edges and 78 nodes = 1.2564 density)	Aborted. Weight under threshold.
[W0B]              (20:14:48)   Starting... (170 edges and 159 nodes = 1.0692 density)	Bisected with a cut weight of 4.0 in 0.0991 secs.
[W0BA]             (20:14:48)   Starting... (141 edges and 131 nodes = 1.0763 density)	Bisected with a cut weight of 4.0 in 0.0907 secs.
[W0BAA]            (20:14:48)   Starting... (16 edges and 15 nodes = 1.0667 density)	Aborted. Weight under threshold.
[W0BAB]            (20:14:48)   Starting... (121 edges and 116 nodes = 1.0431 density)	Bisected with a cut weight of 3.0 in 0.0777 secs.
[W0BABA]           (20:14:48)   Starting... (110 edges and 105 nodes = 1.0476 density)	Bisected with a cut weight of 3.0 in 0.0728 secs.
[W0BABAA]          

In [8]:
# Write the per-subject summary table next to the per-subject CSVs.
summary_path = "../results/_summary.csv"
subjects.to_csv(summary_path)