In [1]:
import sys
import os
import time
import sqlite3
import math
from pathlib import Path
from itertools import chain
from collections import Counter
from typing import Callable

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.cluster import DBSCAN
from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import db, subjects, naming
from filesplitter.clustering import cluster_dataset, to_name_cluster_labels
from filesplitter.loading import load_dataset
from filesplitter.validate import validate
from filesplitter.graph import group_by_scc, group_by_wcc, group_edges_by
from filesplitter.naming import NameSimilarity

First, define some basic notation.

- Let $D$ be the set of documents (identifiers) and let $T$ be the set of terms.

- Let $S \subseteq T \times D$ be the set of term-document occurrences (or _sample points_). If `allow_dup_docs = True`, then $S$ is a multi-set.

Next, define some useful subsets.

- Let $S_i = \{s : s = (t,d) \in S \text{ and } t=t_i\}$ be the subset of occurrences that contain term $t_i$.

- Let $S_j = \{s : s = (t,d) \in S \text{ and } d=d_j\}$ be the subset of occurrences that contain document $d_j$.

- Let $S_{ij} = \{s : s = (t,d) \in S \text{ and } t=t_i \text{ and } d=d_j\}$ be the subset of occurrences that contain both term $t_i$ and document $d_j$.

And some useful shorthand for referring to the size of these subsets.

- Let $c = |S|$ be the number of term-document pairs.

- Let $c_i = |S_i|$ be the number of documents that contain term $t_i$.

- Let $c_j = |S_j|$ be the number of terms that are contained within document $d_j$.

- Let $c_{ij} = |S_{ij}|$ be the number of times term $t_i$ is used by document $d_j$.

Now define some random variables.

- Let $X_i(s)$ for some $s = (t, d) \in S$ be the indicator random variable $\mathbf{1}_{S_i}$.

- Let $Y_j(s)$ for some $s = (t, d) \in S$ be the indicator random variable $\mathbf{1}_{S_j}$.

If we assume each sample point $s \in S$ is equiprobable, we can find these joint probability distributions.

- $P(X_i = 1; Y_j = 1) = p_{ij}(1,1) = c_{ij}/c$

- $P(X_i = 1; Y_j = 0) = p_{ij}(1,0) = (c_i - c_{ij}) / c$

- $P(X_i = 0; Y_j = 1) = p_{ij}(0,1) = (c_j - c_{ij}) / c$

- $P(X_i = 0; Y_j = 0) = p_{ij}(0,0) = (c + c_{ij} - c_i - c_j) / c$

And then the marginals.

- $P(X_i = 1) = p_{i}(1) = c_i / c$

- $P(X_i = 0) = p_{i}(0) = 1 - c_i / c = (c - c_i) / c$

- $P(Y_j = 1) = p_{j}(1) = c_j / c$

- $P(Y_j = 0) = p_{j}(0) = 1 - c_j / c = (c - c_j) / c$

Now we can derive the mutual information between any $X_i$ and $Y_j$.

\begin{align*}
I(X_i;Y_j)
&=
\sum_{y \in \{0,1\}}\sum_{x \in \{0,1\}}
P(X_i = x; Y_j = y)
\log{\left(\frac{P(X_i = x; Y_j = y)}{P(X_i = x)P(Y_j = y)}\right)} \\

&=
\sum_{y \in \{0,1\}}\sum_{x \in \{0,1\}}
p_{ij}(x,y)
\log{\left(\frac{p_{ij}(x,y)}{p_i(x)p_j(y)}\right)} \\

&=
p_{ij}(1,1)
\log{\left(\frac{p_{ij}(1,1)}{p_i(1)p_j(1)}\right)}
+
p_{ij}(1,0)
\log{\left(\frac{p_{ij}(1,0)}{p_i(1)p_j(0)}\right)}
+
p_{ij}(0,1)
\log{\left(\frac{p_{ij}(0,1)}{p_i(0)p_j(1)}\right)}
+
p_{ij}(0,0)
\log{\left(\frac{p_{ij}(0,0)}{p_i(0)p_j(0)}\right)} \\

&=
(c_{ij}/c)
\log{\left(\frac{c_{ij}/c}{(c_i / c)(c_j / c)}\right)}
+
((c_i - c_{ij}) / c)
\log{\left(\frac{(c_i - c_{ij}) / c}{(c_i / c)((c - c_j) / c)}\right)}
+
((c_j - c_{ij}) / c)
\log{\left(\frac{(c_j - c_{ij}) / c}{((c - c_i) / c)(c_j / c)}\right)}
+
((c + c_{ij} - c_i - c_j) / c)
\log{\left(\frac{(c + c_{ij} - c_i - c_j) / c}{((c - c_i) / c)((c - c_j) / c)}\right)} \\

&=
c^{-1}\left(
c_{ij}\ln\left(\frac{cc_{ij}}{c_ic_j}\right)
+
(c_i-c_{ij})\ln\left(\frac{c(c_i - c_{ij})}{c_i(c - c_j)}\right)
+
(c_j-c_{ij})\ln\left(\frac{c(c_j - c_{ij})}{c_j(c - c_i)}\right)
+
(c + c_{ij} - c_i - c_j)\ln\left(\frac{c(c+c_{ij}-c_{i}-c_{j})}{(c-c_{i})(c-c_{j})}\right)
\right)
\end{align*}

In [2]:
# Load the subject dataset that the rest of the notebook analyses.
DS = subjects.load_subject(subjects.ANDROID_BASE_TEXT_VIEW)

In [3]:
# Pull the entity table and the (src_id, tgt_id) dependency edges out of the subject.
entities_df = DS.entities_df()
edges = oset((r["src_id"], r["tgt_id"]) for _, r in DS.deps_df().iterrows())

# Build the name-similarity model from the target names (duplicate names are allowed).
# NOTE(review): `naming.NameSimilarity` is the same class that is imported directly as `NameSimilarity`.
similarity = naming.NameSimilarity(list(DS.targets_df["name"]), allow_dup_names=True)

# Create a "name_id" for each entity that groups targets according to their name
entities_df["name_id"] = entities_df.groupby("name").ngroup()

# Create a "strong_id" for each entity that groups targets according to the strongly connected component of their name
name_edges = group_edges_by(edges, entities_df["name_id"])
entities_df["strong_id"] = group_by_scc(entities_df["name_id"], name_edges)

# Create a "weak_id" for each entity that groups targets according to the weakly connected component of their strong_id
strong_edges = group_edges_by(edges, entities_df["strong_id"])
entities_df["weak_id"] = group_by_wcc(entities_df["strong_id"], strong_edges)

In [4]:
# Spot-check the similarity score of a single pair of names.
similarity.sim("LOG_TAG", "logCursor")

0.5367276396126864

In [5]:
# Spot-check the nearest names to "getCompoundPaddingTop"
# (10 is presumably the number of results requested -- confirm against NameSimilarity.most_sim).
similarity.most_sim("getCompoundPaddingTop", 10)

[('top_pad_offset', 0.7100504102581341),
 ('total_pad_top', 0.6447012565212688),
 ('extend_pad_top', 0.6131523086266681),
 ('compound_drawabl_pad', 0.5781845830131089),
 ('fade_top', 0.5429261623597115),
 ('compound_pad_start', 0.48466408946957457),
 ('compound_pad_right', 0.47707748489327506),
 ('compound_pad_left', 0.47707748489327473),
 ('pad', 0.46324654398322196)]

In [6]:
# Inspect the rows of entities_df that were assigned strong_id 323.
entities_df[entities_df["strong_id"] == 323]

Unnamed: 0_level_0,parent_id,name,kind,start_row,end_row,name_id,strong_id,weak_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
492722,1323,TextView,constructor,1057.0,1059.0,74,323,0
492723,1323,TextView,constructor,1061.0,1063.0,74,323,0
492724,1323,TextView,constructor,1065.0,1067.0,74,323,0
21272,1323,TextView,constructor,1069.0,1865.0,74,323,0
492760,1323,setKeyListenerOnly,method,2611.0,2623.0,1102,323,0
492762,1323,setMovementMethod,method,2647.0,2661.0,1121,323,0
492765,1323,setTransformationMethod,method,2694.0,2726.0,1157,323,0
254075,1323,applyTextAppearance,method,4220.0,4286.0,93,323,0
492910,1323,setText,method,6404.0,6407.0,1139,323,0
287346,1323,setText,method,6453.0,6458.0,1139,323,0


In [7]:
def group_sim(sim: "NameSimilarity", a_names: list[str], b_names: list[str]) -> list[float]:
    """Score every scorable pair of names drawn from two groups.

    A name is skipped when ``sim.has_doc(name)`` is false, so only pairs in
    which both names are known to ``sim`` contribute a score.

    Args:
        sim: Object providing ``has_doc(name)`` and ``sim(a_name, b_name)``.
        a_names: Names of the first group.
        b_names: Names of the second group.

    Returns:
        The ``sim.sim(a_name, b_name)`` values, ordered by ``a_names`` first
        and ``b_names`` second. Empty when no pair could be scored.
    """
    # Whether a b-name is known does not depend on the a-name it is paired with,
    # so filter the second group once instead of once per a-name.
    known_b_names = [b_name for b_name in b_names if sim.has_doc(b_name)]
    return [
        sim.sim(a_name, b_name)
        for a_name in a_names
        if sim.has_doc(a_name)
        for b_name in known_b_names
    ]

def min_group_sim(sim: NameSimilarity, a_names: list[str], b_names: list[str]):
    """Return the smallest pairwise similarity between two name groups, or 0 if no pair was scored."""
    pair_scores = group_sim(sim, a_names, b_names)
    if not pair_scores:
        return 0
    return np.min(pair_scores)

def avg_group_sim(sim: NameSimilarity, a_names: list[str], b_names: list[str]):
    """Return the average pairwise similarity between two name groups, or 0 if no pair was scored."""
    pair_scores = group_sim(sim, a_names, b_names)
    if not pair_scores:
        return 0
    return np.average(pair_scores)

def max_group_sim(sim: NameSimilarity, a_names: list[str], b_names: list[str]):
    """Return the largest pairwise similarity between two name groups, or 0 if no pair was scored."""
    pair_scores = group_sim(sim, a_names, b_names)
    if not pair_scores:
        return 0
    return np.max(pair_scores)

In [8]:
def get_names(strong_id: int) -> list[str]:
    """List the ``name`` of every row in the notebook-level ``entities_df`` with the given ``strong_id``."""
    in_component = entities_df["strong_id"] == strong_id
    matching_rows = entities_df[in_component]
    return list(matching_rows["name"])

In [9]:
# Example: list the names of the entities assigned strong_id 323.
get_names(323)

['TextView',
 'TextView',
 'TextView',
 'TextView',
 'setKeyListenerOnly',
 'setMovementMethod',
 'setTransformationMethod',
 'applyTextAppearance',
 'setText',
 'setText',
 'setText',
 'setText',
 'setText',
 'setText',
 'setInputType',
 'setInputType',
 'setTextIsSelectable',
 'applySingleLine',
 'sendAfterTextChanged',
 'notifyListeningManagersAfterTextChanged']

In [10]:
# MIN_CORRELATION = 0.3
# GROUP_SIM_FN = avg_group_sim

# un_edges = {}

# strong_ids = list(entities_df["strong_id"])

# for a_ix in range(len(strong_ids)):
#     a_names = get_names(strong_ids[a_ix])
#     for b_ix in range(a_ix + 1, len(strong_ids)):
#         b_names = get_names(strong_ids[b_ix])
#         score = GROUP_SIM_FN(similarity, a_names, b_names)
#         if score >= MIN_CORRELATION:
#             un_edges[(a_ix, b_ix)] = un_edges[(b_ix, a_ix)] = score

In [18]:
# Minimum group-similarity score two strong components need for build_txt_edges to link them.
MIN_CORRELATION = 0.4
# Function build_txt_edges uses to score two groups of names (here: the best pairwise match).
GROUP_SIM_FN = max_group_sim

In [20]:
def build_txt_edges(
    entities_df: pd.DataFrame,
    sim: "NameSimilarity",
    *,
    min_correlation: "float | None" = None,
    group_sim_fn: "Callable[..., float] | None" = None,
) -> dict[tuple[int, int], float]:
    """Link strong components whose member names are textually similar.

    Rows whose ``kind`` is ``"file"`` are ignored. The remaining rows are
    grouped by ``strong_id``, and every unordered pair of groups is scored
    with ``group_sim_fn(sim, a_names, b_names)``.

    Args:
        entities_df: Frame with at least ``kind``, ``strong_id`` and ``name`` columns.
        sim: Similarity object, passed through unchanged to ``group_sim_fn``.
        min_correlation: Lowest score that still produces an edge. Defaults to
            the notebook-level ``MIN_CORRELATION``.
        group_sim_fn: Scoring function for two lists of names. Defaults to the
            notebook-level ``GROUP_SIM_FN``.

    Returns:
        A mapping ``(a_strong_id, b_strong_id) -> score`` with one entry per
        pair that scored at least ``min_correlation``. Each pair appears once,
        with ``a`` preceding ``b`` in groupby order.
    """
    if min_correlation is None:
        min_correlation = MIN_CORRELATION
    if group_sim_fn is None:
        group_sim_fn = GROUP_SIM_FN

    nonfiles = entities_df[entities_df["kind"] != "file"]
    strong_names = nonfiles.groupby("strong_id")["name"].apply(list).to_dict()
    strong_ids = list(strong_names)

    edges = {}
    for a_pos, a_id in enumerate(strong_ids):
        # Look groups up by their actual strong_id, not by list position:
        # the ids need not be exactly 0..n-1 once the file rows are dropped.
        a_names = strong_names[a_id]
        for b_id in strong_ids[a_pos + 1:]:
            score = group_sim_fn(sim, a_names, strong_names[b_id])
            if score >= min_correlation:
                edges[(a_id, b_id)] = score
    return edges

In [21]:
# Count the name-similarity edges produced with the settings above.
len(build_txt_edges(entities_df, similarity))

3808