In [1]:
import sys
import os
import time
import sqlite3
import math
from pathlib import Path
from itertools import chain
from collections import Counter
from typing import Callable

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.cluster import DBSCAN
from ordered_set import OrderedSet as oset

sys.path.append("..")
from filesplitter import db, subjects, naming
from filesplitter.clustering import cluster_dataset, to_name_cluster_labels
from filesplitter.loading import load_dataset
from filesplitter.validate import validate

First, define some basic notation.

- Let $D$ be the set of documents (identifiers) and let $T$ be the set of terms.

- Let $S \subseteq T \times D$ be the set of term-document occurrences (or _sample points_). If `allow_dup_docs = True`, then $S$ is a multi-set.

Next, define some useful subsets.

- Let $S_i = \{s : s = (t,d) \in S \text{ and } t=t_i\}$ be the subset of occurrences that contain term $t_i$.

- Let $S_j = \{s : s = (t,d) \in S \text{ and } d=d_j\}$ be the subset of occurrences that contain document $d_j$.

- Let $S_{ij} = \{s : s = (t,d) \in S \text{ and } t=t_i \text{ and } d=d_j\}$ be the subset of occurrences that contain both term $t_i$ and document $d_j$.

And some useful shorthand for referring to the sizes of these subsets.

- Let $c = |S|$ be the number of term-document pairs.

- Let $c_i = |S_i|$ be the number of occurrences of term $t_i$ (the number of documents that contain $t_i$ when $S$ is not a multi-set).

- Let $c_j = |S_j|$ be the number of term occurrences in document $d_j$ (the number of distinct terms in $d_j$ when $S$ is not a multi-set).

- Let $c_{ij} = |S_{ij}|$ be the number of times term $t_i$ is used by document $d_j$.

Now define some random variables.

- Let $X_i(s)$ for some $s = (t, d) \in S$ be the indicator random variable $\mathbf{1}_{S_i}$.

- Let $Y_j(s)$ for some $s = (t, d) \in S$ be the indicator random variable $\mathbf{1}_{S_j}$.

If we assume each sample point $s \in S$ is equiprobable, we can find these joint probability distributions.

- $P(X_i = 1; Y_j = 1) = p_{ij}(1,1) = c_{ij}/c$

- $P(X_i = 1; Y_j = 0) = p_{ij}(1,0) = (c_i - c_{ij}) / c$

- $P(X_i = 0; Y_j = 1) = p_{ij}(0,1) = (c_j - c_{ij}) / c$

- $P(X_i = 0; Y_j = 0) = p_{ij}(0,0) = (c + c_{ij} - c_i - c_j) / c$

And then the marginals.

- $P(X_i = 1) = p_{i}(1) = c_i / c$

- $P(X_i = 0) = p_{i}(0) = 1 - c_i / c = (c - c_i) / c$

- $P(Y_j = 1) = p_{j}(1) = c_j / c$

- $P(Y_j = 0) = p_{j}(0) = 1 - c_j / c = (c - c_j) / c$

Now we can derive the mutual information between any $X_i$ and $Y_j$.

\begin{align*}
I(X_i;Y_j)
&=
\sum_{y \in \{0,1\}}\sum_{x \in \{0,1\}}
P(X_i = x; Y_j = y)
\log{\left(\frac{P(X_i = x; Y_j = y)}{P(X_i = x)P(Y_j = y)}\right)} \\

&=
\sum_{y \in \{0,1\}}\sum_{x \in \{0,1\}}
p_{ij}(x,y)
\log{\left(\frac{p_{ij}(x,y)}{p_i(x)p_j(y)}\right)} \\

&=
p_{ij}(1,1)
\log{\left(\frac{p_{ij}(1,1)}{p_i(1)p_j(1)}\right)}
+
p_{ij}(1,0)
\log{\left(\frac{p_{ij}(1,0)}{p_i(1)p_j(0)}\right)}
+
p_{ij}(0,1)
\log{\left(\frac{p_{ij}(0,1)}{p_i(0)p_j(1)}\right)}
+
p_{ij}(0,0)
\log{\left(\frac{p_{ij}(0,0)}{p_i(0)p_j(0)}\right)} \\

&=
(c_{ij}/c)
\log{\left(\frac{c_{ij}/c}{(c_i / c)(c_j / c)}\right)}
+
((c_i - c_{ij}) / c)
\log{\left(\frac{(c_i - c_{ij}) / c}{(c_i / c)((c - c_j) / c)}\right)}
+
((c_j - c_{ij}) / c)
\log{\left(\frac{(c_j - c_{ij}) / c}{((c - c_i) / c)(c_j / c)}\right)}
+
((c + c_{ij} - c_i - c_j) / c)
\log{\left(\frac{(c + c_{ij} - c_i - c_j) / c}{((c - c_i) / c)((c - c_j) / c)}\right)} \\

&=
c^{-1}\left(
c_{ij}\ln\left(\frac{cc_{ij}}{c_ic_j}\right)
+
(c_i-c_{ij})\ln\left(\frac{c(c_i - c_{ij})}{c_i(c - c_j)}\right)
+
(c_j-c_{ij})\ln\left(\frac{c(c_j - c_{ij})}{c_j(c - c_i)}\right)
+
(c + c_{ij} - c_i - c_j)\ln\left(\frac{c(c+c_{ij}-c_{i}-c_{j})}{(c-c_{i})(c-c_{j})}\right)
\right)
\end{align*}

In [2]:
# Load the subject selected by subjects.ANDROID_BASE_TEXT_VIEW; later cells read
# its entity, dependency, and target tables through DS.
DS = subjects.load_subject(subjects.ANDROID_BASE_TEXT_VIEW)

In [18]:
# Tunable parameters for the name-based clustering cell.
TEXT_EPS = 0.35  # passed to DBSCAN as `eps` (applied to the precomputed name-distance matrix)
TEXT_MIN_PTS = 3  # passed to DBSCAN as `min_samples`
ALLOW_DUP_NAMES = True  # forwarded to naming.NameSimilarity as `allow_dup_names`

In [19]:
# Load the entity table and collect the dependency edges as an ordered set of
# (src_id, tgt_id) pairs.
entities_df = DS.entities_df()
deps_df = DS.deps_df()
# Zip the two id columns directly; iterrows() would build a full Series per row
# just to read two fields.
edges = oset(zip(deps_df["src_id"], deps_df["tgt_id"]))

# Cluster targets by name: DBSCAN over the precomputed name-distance matrix.
similarity = naming.NameSimilarity(list(DS.targets_df["name"]), allow_dup_names=ALLOW_DUP_NAMES)
clustering = DBSCAN(eps=TEXT_EPS, min_samples=TEXT_MIN_PTS, metric="precomputed").fit(similarity.dist_mat)
entities_df["name_cluster_id"] = to_name_cluster_labels(entities_df, similarity, clustering.labels_)

# Print cluster info. DBSCAN marks noise points with label -1, so only
# non-negative labels count as cluster members.
labels = np.asarray(clustering.labels_)
cluster_sizes = Counter(labels[labels >= 0].tolist())
# Highest label + 1; guarded so an empty or single-element label array does not
# raise (max(*labels) fails with a TypeError in both of those cases).
n_clusters = int(labels.max()) + 1 if labels.size else 0
# Size of the most populated cluster; 0 when every point is noise.
max_cluster_len = max(cluster_sizes.values(), default=0)
print("Found {} text clusters with a max size of {}.".format(n_clusters, max_cluster_len))

Found 36 text clusters with a max size of 14.


In [20]:
# Spot-check: pairwise score from the name-similarity model for two names that
# appear in DS.targets_df.
similarity.sim("LOG_TAG", "logCursor")

0.5367276396126864

In [23]:
# Spot-check: list the entries scored as most similar to one method name.
# NOTE(review): the second argument looks like a result limit — confirm against
# naming.NameSimilarity.most_sim.
similarity.most_sim("getCompoundPaddingTop", 10)

[('top_pad_offset', 0.7100504102581341),
 ('total_pad_top', 0.6447012565212688),
 ('extend_pad_top', 0.6131523086266681),
 ('compound_drawabl_pad', 0.5781845830131089),
 ('fade_top', 0.5429261623597115),
 ('compound_pad_start', 0.48466408946957457),
 ('compound_pad_right', 0.47707748489327506),
 ('compound_pad_left', 0.47707748489327473),
 ('pad', 0.46324654398322196)]

In [22]:
# Display the targets table whose "name" column feeds the name clustering
# (pandas elides the middle rows when rendering a long frame).
DS.targets_df

Unnamed: 0_level_0,parent_id,name,kind,start_row,end_row
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
492564,1323,LOG_TAG,field,368,368
492565,1323,DEBUG_EXTRACT,field,369,369
492566,1323,DEBUG_CURSOR,field,370,370
492567,1323,TEMP_POSITION,field,372,372
492568,1323,XMLTypefaceAttr,annotation,377,379
...,...,...,...,...,...
289503,1323,onInputConnectionOpenedInternal,method,14211,14218
289505,1323,onInputConnectionClosedInternal,method,14221,14226
246325,1323,onReceiveContent,method,14245,14252
493175,1323,logCursor,method,14254,14260
