In [1]:
import sqlite3

import numpy as np
import pandas as pd
from numpy import linalg
from sklearn.cluster import SpectralClustering

import db

In [2]:
Df = pd.DataFrame

In [3]:
DB_FILE = "android-base.db"
LEAD_REF = "refs/tags/android-13.0.0_r24"
TARGET_FILE = "core/java/android/view/View.java"
TARGET_ID = 102495 # The id of the top-level class of the target file

In [4]:
CON = sqlite3.connect(DB_FILE)

In [5]:
db.create_temp_tables(CON)
targets_df = db.fetch_children(CON, LEAD_REF, TARGET_ID)
internal_deps_df = db.fetch_internal_deps(CON, TARGET_ID)
clients_df = db.fetch_clients(CON, TARGET_FILE)
client_deps_df = db.fetch_client_deps(CON, TARGET_ID, TARGET_FILE)

In [6]:
internal_deps_df = internal_deps_df[(internal_deps_df["kind"] == "Call") | (internal_deps_df["kind"] == "Use")]
client_deps_df = client_deps_df[(internal_deps_df["kind"] == "Call") | (client_deps_df["kind"] == "Use")]

In [7]:
entities_df = pd.concat([targets_df, clients_df])
deps_df = pd.concat([internal_deps_df, client_deps_df])

# entities_df = pd.concat([targets_df])
# deps_df = pd.concat([internal_deps_df])

In [8]:
entity_ids = list(entities_df.index)

In [9]:
def to_entity_id(ix: int) -> int:
    return entity_ids[ix]

def to_entity_ix(id: int) -> int:
    return entity_ids.index(id)

In [10]:
def create_sym_mat(n_entities, deps_df: Df) -> np.ndarray:
    arr = np.zeros((n_entities, n_entities))
    for _, row in deps_df.iterrows():
        src_ix = to_entity_ix(row["src_id"])
        tgt_ix = to_entity_ix(row["tgt_id"])
        arr[(src_ix, tgt_ix)] = 1.0
        arr[(tgt_ix, src_ix)] = 1.0
    return arr

In [11]:
M = create_sym_mat(len(entities_df), deps_df)
D = np.diag(np.sum(M, axis=0))
L = D - M
L_inv = linalg.pinv(L, hermitian=True)
S = L_inv[0:len(targets_df), 0:len(targets_df)]

In [12]:
# arb_id = 240
# entities_df.iloc[arb_id]
# df = entities_df.copy()
# df["sim"] = L_inv[arb_id]
# df.sort_values(by="sim", ascending=False).head(15)

In [34]:
len(targets_df) / 10

162.5

In [39]:
clustering = SpectralClustering(n_clusters=80, assign_labels="cluster_qr", affinity="precomputed", random_state=0).fit(S.clip(min=0))

In [40]:
targets_df["cluster"] = clustering.labels_

In [41]:
targets_df[targets_df["cluster"] == 20].head(40)

Unnamed: 0_level_0,parent_id,name,kind,start_row,end_row,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
102597,102495,DRAWING_CACHE_QUALITY_LOW,field,1708,1709,20
102598,102495,DRAWING_CACHE_QUALITY_HIGH,field,1727,1728,20
102599,102495,DRAWING_CACHE_QUALITY_AUTO,field,1746,1747,20
102696,102495,PFLAG_DRAWING_CACHE_VALID,field,2525,2525,20
102984,102495,mDrawingCacheBackgroundColor,field,4989,4989,20
103010,102495,mDrawingCache,field,5209,5210,20
103011,102495,mUnscaledDrawingCache,field,5211,5212,20
103667,102495,computeScroll,method,19839,19840,20
103774,102495,destroyDrawingCache,method,22189,22199,20
103775,102495,setDrawingCacheBackgroundColor,method,22225,22231,20


In [17]:
targets_df[targets_df["cluster"] == 10].head(40)

Unnamed: 0_level_0,parent_id,name,kind,start_row,end_row,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
102690,102495,PFLAG_REQUEST_TRANSPARENT_REGIONS,field,2512,2512,10
102740,102495,TEXT_DIRECTION_RESOLVED_DEFAULT,field,2858,2858,10
102757,102495,TEXT_ALIGNMENT_RESOLVED_DEFAULT,field,2989,2989,10
102875,102495,SYSTEM_UI_CLEARABLE_FLAGS,field,4057,4059,10
102882,102495,SCREEN_STATE_OFF,field,4120,4120,10
102883,102495,SCREEN_STATE_ON,field,4127,4127,10
102888,102495,mParent,field,4158,4159,10
103034,102495,mUnbufferedInputSource,field,5333,5334,10
103364,102495,focusSearch,method,13244,13250,10
103369,102495,setFocusedInCluster,method,13310,13327,10


In [18]:
targets_df

Unnamed: 0_level_0,parent_id,name,kind,start_row,end_row,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
102496,102495,DBG,field,841,842,132
102497,102495,DEBUG_DRAW,field,845,845,3
102498,102495,VIEW_LOG_TAG,field,850,850,67
102499,102495,AUTOFILL_LOG_TAG,field,857,857,98
102500,102495,CONTENT_CAPTURE_LOG_TAG,field,862,862,146
...,...,...,...,...,...,...
104116,102495,onVirtualViewTranslationResponses,method,31657,31660,0
104117,102495,dispatchCreateViewTranslationRequest,method,31689,31706,149
104118,102495,ViewTranslationRequestConsumer,class,31708,31732,149
104119,102495,generateDisplayHash,method,31748,31798,69


In [19]:
internal_deps_df

Unnamed: 0,src_id,tgt_id,kind
16,103038,102507,Use
17,103038,102508,Use
18,103038,102509,Use
19,103038,102510,Use
20,103038,102511,Use
...,...,...,...
4019,104119,102498,Use
4020,104119,103622,Call
4021,104119,103742,Call
4022,104119,103743,Call


In [24]:
with open("example.dot", "w") as f:
    f.write("digraph {\n")
    f.write("\trankdir=\"LR\";\n")
    for _, row in internal_deps_df.iterrows():
        src_ix = to_entity_ix(row["src_id"])
        tgt_ix = to_entity_ix(row["tgt_id"])
        src_name = targets_df.iloc[src_ix]["name"]
        tgt_name = targets_df.iloc[tgt_ix]["name"]
        f.write("\t{} -> {}\n".format(src_name, tgt_name))
    f.write("}\n")