In [1]:
import json
import os
import sqlite3
import sys
import time
from contextlib import closing
from pathlib import Path

import numpy as np
import pandas as pd
from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import db, dv8
from filesplitter.clustering import cluster_dataset
from filesplitter.loading import load_dataset
from filesplitter.validate import validate

In [2]:
# Directory whose entries are listed and opened as per-project SQLite databases.
DATA_DIR: Path = Path("../data/")
# Directory holding the per-subject "<subject_name>.csv" inputs and receiving
# the "<subject_name>.dsm.json" / "<subject_name>.drh.json" exports.
RESULTS_DIR: Path = Path("../results/")
# Upper bound on how many candidate files are kept from each database.
MAX_SUBJECTS_PER_DB: int = 20

In [3]:
# Create the output directory up front; parents=True keeps this working if
# RESULTS_DIR is later pointed at a nested path whose parent does not exist yet.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# Collect up to MAX_SUBJECTS_PER_DB candidate files from every database in
# DATA_DIR and combine them into a single `subjects` frame with a unique,
# filesystem-friendly `subject_name` per row.
candidates_dfs = []
datasets = []  # not used in this cell; kept because it is a notebook-level name

# Only *.db files are subjects; anything else in DATA_DIR (e.g. OS metadata
# files) would otherwise be opened as a database and fail further down.
db_names = sorted(name for name in os.listdir(DATA_DIR) if name.endswith(".db"))

for db_name in db_names:
    print(f"Finding subjects in {db_name}...")
    # sqlite3's own context manager only commits/rolls back the transaction;
    # closing() is what actually releases the connection after each database.
    with closing(sqlite3.connect(Path(DATA_DIR, db_name))) as con, con:
        db.create_temp_tables(con)
        ref_name = db.fetch_lead_ref_name(con)
        # NOTE(review): the literals 1000 and 6 are passed straight through to
        # db.fetch_candidate_files; their meaning is defined there (not visible
        # here) - consider naming them in the config cell.
        candidates = db.fetch_candidate_files(con, ref_name, 1000, 6)
    # Use the stem (name minus the final ".db") so that `project + ".db"`
    # always reconstructs the file name, even when the name contains dots.
    candidates.insert(0, "project", Path(db_name).stem)
    candidates_dfs.append(candidates[:MAX_SUBJECTS_PER_DB])

if not candidates_dfs:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(f"No .db files found in {DATA_DIR}")

subjects = pd.concat(candidates_dfs, ignore_index=True)
# subject_name = "<project>__<parent dir>_<file name>", built from the last
# two "/"-separated components of the file path.
subject_names = [
    "{}__{}".format(project, "_".join(filename.split("/")[-2:]))
    for project, filename in zip(subjects["project"], subjects["filename"])
]
subjects.insert(0, "subject_name", subject_names)
print(f"Loaded {len(subjects)} subjects.")

Finding subjects in android-base.db...
Finding subjects in android-settings.db...
Finding subjects in beam.db...
Finding subjects in deltaspike.db...
Finding subjects in dubbo.db...
Finding subjects in flume.db...
Finding subjects in gobblin.db...
Finding subjects in hbase.db...
Finding subjects in hudi.db...
Finding subjects in kafka.db...
Finding subjects in knox.db...
Finding subjects in nifi.db...
Finding subjects in oozie.db...
Loaded 125 subjects.


In [5]:
# For every subject, load its dependency data from the project database and
# its entity table from the matching CSV in RESULTS_DIR. The three lists are
# kept index-aligned and are rebuilt from scratch on every run of this cell.
subject_names, entities_dfs, deps_dfs = [], [], []

for position, (_, subject) in enumerate(subjects.iterrows()):
    name = subject["subject_name"]
    print(f"Loading Subject {position}: {name}")

    ds = load_dataset(DATA_DIR / (subject["project"] + ".db"), subject["filename"])
    entities_df = pd.read_csv(RESULTS_DIR / (name + ".csv"), index_col="id")

    # Append only after both loads succeeded so the lists never get out of step.
    entities_dfs.append(entities_df)
    deps_dfs.append(ds.target_deps_df)
    subject_names.append(name)

Loading Subject 25: android-settings__wifi_WifiSettings.java
Loading Subject 26: android-settings__manageapplications_ManageApplications.java
Loading Subject 27: android-settings__fingerprint_FingerprintEnrollEnrolling.java
Loading Subject 28: android-settings__details2_WifiDetailPreferenceController2.java
Loading Subject 29: android-settings__fingerprint_FingerprintSettings.java


In [7]:
# Export one "<subject_name>.dsm.json" and one "<subject_name>.drh.json" per
# loaded subject into RESULTS_DIR.
for subject_name, entities_df, deps_df in zip(subject_names, entities_dfs, deps_dfs):
    print("Writing {}...".format(subject_name))

    # Every entity except those of kind "file" is exported.
    targets_df = entities_df.loc[entities_df["kind"] != "file"]

    # Build and serialize both payloads before touching the filesystem, so a
    # failure in dv8 or in JSON encoding cannot leave an empty or truncated
    # output file behind.
    dsm_json = json.dumps(dv8.to_dsm(subject_name, targets_df, deps_df))
    drh_json = json.dumps(dv8.to_drh(subject_name + "-drh", targets_df))

    with open(Path(RESULTS_DIR, subject_name + ".dsm.json"), "w") as f:
        f.write(dsm_json)
    with open(Path(RESULTS_DIR, subject_name + ".drh.json"), "w") as f:
        f.write(drh_json)

Writing android-settings__wifi_WifiSettings.java...
Writing android-settings__manageapplications_ManageApplications.java...
Writing android-settings__fingerprint_FingerprintEnrollEnrolling.java...
Writing android-settings__details2_WifiDetailPreferenceController2.java...
Writing android-settings__fingerprint_FingerprintSettings.java...


In [None]:
# entities_df = entities_dfs[0]
# targets_df = entities_df.loc[~(entities_df["kind"] == "file")].copy()


In [None]:
# targets_df

In [None]:
# drh = dv8.to_drh("my-drh", targets_df)

In [None]:
# with open("../example-drh.json", "w") as f:
#     json.dump(drh, f, indent=4)