In [1]:
import sys
import os
import time
import sqlite3
import json
from pathlib import Path

import numpy as np
import pandas as pd
from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import db, dv8
from filesplitter.clustering import cluster_dataset
from filesplitter.loading import load_dataset
from filesplitter.validate import validate

In [2]:
# Notebook configuration. Paths are relative to the notebook's working directory.
# Directory that is listed for SQLite database files and passed to load_dataset.
DATA_DIR = Path("../data/")
# Directory that per-subject CSVs are read from and DV8 JSON files are written to.
RESULTS_DIR = Path("../results/")
# Upper bound on how many candidate files are kept from each database.
MAX_SUBJECTS_PER_DB = 20

In [3]:
# Make sure the results directory exists; exist_ok=True keeps re-running this
# cell from raising when the directory is already there. Missing parent
# directories are NOT created (parents defaults to False).
RESULTS_DIR.mkdir(exist_ok=True)

In [4]:
# Discover candidate files ("subjects") in every SQLite database under DATA_DIR
# and combine them into one `subjects` frame with a `subject_name` column.
candidates_dfs = []
datasets = []  # not populated in this cell; kept in case other cells reference it

# Only consider "*.db" entries: the loading cell rebuilds each path as
# "<project>.db", so any other file in DATA_DIR could never be loaded anyway.
db_names = sorted(name for name in os.listdir(DATA_DIR) if name.endswith(".db"))

for db_name in db_names:
    print(f"Finding subjects in {db_name}...")
    # `with con:` only commits/rolls back the transaction; it does not close the
    # connection, so close it explicitly to avoid leaking one handle per database.
    con = sqlite3.connect(Path(DATA_DIR, db_name))
    try:
        with con:
            db.create_temp_tables(con)
            ref_name = db.fetch_lead_ref_name(con)
            # NOTE(review): 1000 and 6 are passed positionally; their meaning is
            # defined by db.fetch_candidate_files and is not visible here -- TODO
            # promote them to named constants once confirmed.
            candidates = db.fetch_candidate_files(con, ref_name, 1000, 6)
    finally:
        con.close()
    # Path.stem strips only the final ".db", so a dotted name such as "a.b.db"
    # yields project "a.b" and still round-trips to the right file later.
    candidates.insert(0, "project", Path(db_name).stem)
    candidates_dfs.append(candidates[:MAX_SUBJECTS_PER_DB])

if not candidates_dfs:
    # pd.concat would raise a less helpful ValueError on an empty list.
    raise ValueError(f"No .db files found in {DATA_DIR}")

subjects = pd.concat(candidates_dfs, ignore_index=True)

# subject_name = "<project>__<parent dir>_<file name>", built from the last two
# "/"-separated components of the file path.
# NOTE(review): two files in one project that share those two components would
# get the same subject_name -- confirm this cannot happen for the data in use.
subject_names = [
    "{}__{}".format(p, "_".join(fn.split("/")[-2:]))
    for p, fn in zip(subjects["project"], subjects["filename"])
]
subjects.insert(0, "subject_name", subject_names)
print(f"Loaded {len(subjects)} subjects.")

Finding subjects in android-base.db...
Finding subjects in android-settings.db...
Finding subjects in beam.db...
Finding subjects in deltaspike.db...
Finding subjects in dubbo.db...
Finding subjects in flume.db...
Finding subjects in gobblin.db...
Finding subjects in hbase.db...
Finding subjects in hudi.db...
Finding subjects in kafka.db...
Finding subjects in knox.db...
Finding subjects in nifi.db...
Finding subjects in oozie.db...
Loaded 125 subjects.


In [5]:
# Per-subject accumulators. The three lists are appended to together, so they
# stay index-aligned: position k in each list belongs to the same subject.
subject_names, entities_dfs, deps_dfs = [], [], []

for i, (_, row) in enumerate(subjects.iterrows()):
    print("Loading Subject {}: {}".format(i, row["subject_name"]))

    # Dependency data comes from the project's database ...
    db_path = Path(DATA_DIR, row["project"] + ".db")
    ds = load_dataset(db_path, row["filename"])

    # ... while the entity table is read back from a CSV in RESULTS_DIR that
    # must already exist for this subject.
    csv_path = Path(RESULTS_DIR, row["subject_name"] + ".csv")
    entities_df = pd.read_csv(csv_path, index_col="id")

    # Append only after both loads succeeded so the lists never get out of step.
    subject_names.append(row["subject_name"])
    entities_dfs.append(entities_df)
    deps_dfs.append(ds.target_deps_df)

Loading Subject 0: android-base__view_View.java
Loading Subject 1: android-base__pm_PackageManager.java
Loading Subject 2: android-base__app_Activity.java
Loading Subject 3: android-base__view_ViewGroup.java
Loading Subject 4: android-base__widget_TextView.java
Loading Subject 5: android-base__telephony_TelephonyManager.java
Loading Subject 6: android-base__res_Resources.java
Loading Subject 7: android-base__provider_Settings.java
Loading Subject 8: android-base__os_BatteryStatsImpl.java
Loading Subject 9: android-base__admin_DevicePolicyManager.java
Loading Subject 10: android-base__devicepolicy_DevicePolicyManagerService.java
Loading Subject 11: android-base__pm_ApplicationInfo.java
Loading Subject 12: android-base__am_ActivityManagerService.java
Loading Subject 13: android-base__app_ActivityManager.java
Loading Subject 14: android-base__wm_ActivityRecord.java
Loading Subject 15: android-base__content_Intent.java
Loading Subject 16: android-base__view_KeyEvent.java
Loading Subject 17

In [6]:
# Export every loaded subject as two DV8 JSON files in RESULTS_DIR:
# "<subject>.dsm.json" (from dv8.to_dsm) and "<subject>.drh.json" (from dv8.to_drh).
for subject_name, entities_df, deps_df in zip(subject_names, entities_dfs, deps_dfs):
    print("Writing {}...".format(subject_name))

    # Both exports use every entity except the rows whose kind is "file".
    targets_df = entities_df.loc[entities_df["kind"] != "file"]

    # Build each payload BEFORE opening its file: open(..., "w") truncates at
    # once, so a failure inside dv8.to_dsm / dv8.to_drh would otherwise leave
    # an empty file behind (or wipe a good one from an earlier run).
    dsm = dv8.to_dsm(subject_name, targets_df, deps_df)
    with open(Path(RESULTS_DIR, subject_name + ".dsm.json"), "w") as f:
        json.dump(dsm, f)

    drh = dv8.to_drh(subject_name + "-drh", targets_df)
    with open(Path(RESULTS_DIR, subject_name + ".drh.json"), "w") as f:
        json.dump(drh, f)

Writing android-base__view_View.java...
Writing android-base__pm_PackageManager.java...
Writing android-base__app_Activity.java...
Writing android-base__view_ViewGroup.java...
Writing android-base__widget_TextView.java...
Writing android-base__telephony_TelephonyManager.java...
Writing android-base__res_Resources.java...
Writing android-base__provider_Settings.java...
Writing android-base__os_BatteryStatsImpl.java...
Writing android-base__admin_DevicePolicyManager.java...
Writing android-base__devicepolicy_DevicePolicyManagerService.java...
Writing android-base__pm_ApplicationInfo.java...
Writing android-base__am_ActivityManagerService.java...
Writing android-base__app_ActivityManager.java...
Writing android-base__wm_ActivityRecord.java...
Writing android-base__content_Intent.java...
Writing android-base__view_KeyEvent.java...
Writing android-base__media_AudioManager.java...
Writing android-base__view_ViewRootImpl.java...
Writing android-base__audio_AudioService.java...
Writing android

In [12]:
# Preview the entity table of the first loaded subject (entities_dfs[0]).
# NOTE(review): this cell's execution count (12) is out of sequence with its
# neighbours; it needs entities_dfs from the loading cell to have run first.
table = entities_dfs[0]
table

Unnamed: 0_level_0,parent_id,name,kind,start_row,end_row,name_id,strong_id,weak_id,block_name,block_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
236637,3012.0,DBG,field,841.0,842.0,41,0,0,W0ABAAAAABA,101
474442,3012.0,DEBUG_DRAW,field,845.0,845.0,45,1,0,W0BBAAB,168
474443,3012.0,VIEW_LOG_TAG,field,850.0,850.0,414,2,0,W0AABBABAAB,91
474444,3012.0,AUTOFILL_LOG_TAG,field,857.0,857.0,23,3,0,W0ABAAAAAAB,100
474445,3012.0,CONTENT_CAPTURE_LOG_TAG,field,862.0,862.0,36,4,0,W0ABABAAAAB,116
...,...,...,...,...,...,...,...,...,...,...
815810,,tests/appwidgets/AppWidgetHostTest/src/com/and...,file,,,3570,3510,0,W0AAABABBB,53
815827,,tests/backup/src/com/android/backuptest/Backup...,file,,,3571,3511,453,W453,567
815944,,tests/testables/src/android/testing/BaseFragme...,file,,,3573,3512,0,W0AAABAABABAABB,34
815998,,tests/testables/src/android/testing/LayoutInfl...,file,,,3574,3513,454,W454,568


In [7]:
# entities_df = entities_dfs[0]
# targets_df = entities_df.loc[~(entities_df["kind"] == "file")].copy()


In [8]:
# targets_df

In [9]:
# drh = dv8.to_drh("my-drh", targets_df)

In [10]:
# with open("../example-drh.json", "w") as f:
#     json.dump(drh, f, indent=4)