# Libraries

In [1]:
import os
# Number of threads each numerical backend is allowed to use.
# This cell runs before numpy/scipy are imported further down, so the limits
# are already in the environment when those libraries load.
nnn = 1
# Shell equivalent for each entry: export <NAME>=1
os.environ.update(
    {
        backend_var: str(nnn)
        for backend_var in (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        )
    }
)

In [2]:
from TELF.factorization.HNMFk import HNMFk
from TELF.pre_processing import Beaver
from TELF.pre_processing.Vulture.tokens_analysis.top_words import get_top_words

In [3]:
import pandas as pd
import numpy as np
import scipy.sparse

# Load Data

In [4]:
df = pd.read_csv("../../data/sample.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


# Build Matrix

In [5]:
# Build the vocabulary from the cleaned abstracts.
beaver = Beaver()

beaver_vocab_settings = dict(
    dataset=df,
    target_column="clean_abstract",
    split_with=None,
    min_df=10,
    max_df=0.5,
    verbose=False,
    n_jobs=-1,
    parallel_backend="multiprocessing",
)

vocabulary = beaver.get_vocabulary(**beaver_vocab_settings)
len(vocabulary)

466

In [6]:
vocabulary[:5]

['128pb', '32x', 'ability', 'abstain', 'accelerate']

In [7]:
# Build the TF-IDF matrix over the whole corpus, restricted to the vocabulary
# extracted in the previous step.
beaver_matrix_settings = dict(
    dataset=df,
    target_column="clean_abstract",
    options=dict(min_df=5, max_df=0.5, vocabulary=vocabulary),
    matrix_type="tfidf",
    save_path=None,
)

# The second returned value is not needed here.
X, _ = beaver.documents_words(**beaver_matrix_settings)

In [8]:
# Transpose so that rows are features (tokens) and columns are samples (documents).
# NOTE: X is rebound to its own transpose, so running this cell twice in a row
# flips the matrix back. Re-run the matrix-building cell above before re-running this one.
X = X.transpose().tocsr()
X.shape

(466, 940)

In [9]:
X

<466x940 sparse matrix of type '<class 'numpy.float32'>'
	with 72279 stored elements in Compressed Sparse Row format>

In [10]:
assert X.shape[1] == len(df)

# Custom Callback Class

In [11]:
class CustomSemanticCallback:
    """Callable that rebuilds the data matrix for a subset of the documents.

    An instance is passed to HNMFk as ``generate_X_callback``. When called with
    the indices of the documents belonging to a node, it builds a fresh Beaver
    document-word matrix from only those documents, using the same fixed
    vocabulary every time, and returns it with tokens on the rows and documents
    on the columns.
    """

    def __init__(self,
                 df:pd.DataFrame,
                 vocabulary:list,
                 target_column="clean_abstract",
                 options={"min_df": 5, "max_df": 0.5},
                 matrix_type="tfidf") -> None:
        """Store the corpus and the Beaver settings used on every call.

        Parameters
        ----------
        df : pd.DataFrame
            Full corpus; rows are selected positionally on each call.
        vocabulary : list
            Vocabulary used for every matrix this callback builds.
        target_column : str
            Column of ``df`` holding the text to vectorize.
        options : dict
            Extra options forwarded to ``Beaver.documents_words``. The dict is
            copied; neither the caller's dict nor the default is modified.
        matrix_type : str
            Matrix type forwarded to ``Beaver.documents_words``.
        """
        #
        # Data
        #
        self.df = df
        self.vocabulary = vocabulary

        #
        # Beaver Settings
        #
        self.target_column = target_column
        # Use the same vocabulary on each NMFk decomposition.
        # Build a new dict instead of writing into ``options``: the default
        # value is a single dict shared by every call, so mutating it would
        # leak one instance's vocabulary into all other instances.
        self.options = {**options, "vocabulary": self.vocabulary}
        self.matrix_type = matrix_type

    def __call__(self, original_indices:np.ndarray) -> scipy.sparse.csr_matrix:
        """Build the matrix for the documents at ``original_indices``.

        Parameters
        ----------
        original_indices : np.ndarray
            Positional indices into ``self.df`` of the documents to include.

        Returns
        -------
        scipy.sparse.csr_matrix
            Transposed Beaver matrix (tokens on rows, documents on columns).
        """
        current_beaver = Beaver()
        # Work on a copy so the stored corpus is left untouched.
        current_df = self.df.iloc[original_indices].copy()
        current_beaver_matrix_settings = {
            "dataset":current_df,
            "target_column":self.target_column,
            "options":self.options,
            "matrix_type":self.matrix_type,
            "save_path":None
        }
        current_X, _ = current_beaver.documents_words(**current_beaver_matrix_settings)
        # put the samples to columns and features (tokens) to rows
        current_X = current_X.T.tocsr()
        return current_X

In [12]:
custom_callback = CustomSemanticCallback(df=df, vocabulary=vocabulary)

# Settings

In [13]:
# Candidate numbers of latent features handed to ``model.fit``: k = 1, 2, ..., 20.
Ks = np.arange(1, 21)

# NMFk settings, forwarded below as ``n_perturbs``, ``n_iters``, ``epsilon`` and ``init``.
perts = 10
iters = 500
eps = 0.015
init = "nnsvd"

# Output settings: ``save_path`` goes into the NMFk parameters,
# ``name`` becomes the HNMFk ``experiment_name``.
save_path = "Semantic_HNMFk_results_path"
name = "example_Semantic_HNMFk"

In [14]:
# NMFk parameters applied at each node of the hierarchy (see ``hnmfk_params``).
nmfk_params = dict(
    n_perturbs=perts,
    n_iters=iters,
    epsilon=eps,
    n_jobs=-1,
    init=init,
    use_gpu=False,
    save_path=save_path,
    predict_k_method="sill",
    verbose=False,
    nmf_verbose=False,
    transpose=False,
    sill_thresh=0.8,
    pruned=True,
    nmf_method="nmf_fro_mu",
    calculate_error=False,
    use_consensus_stopping=0,
    calculate_pac=False,
    consensus_mat=False,
    perturb_type="uniform",
    perturb_multiprocessing=False,
    perturb_verbose=False,
    simple_plot=True,
)

In [15]:
hnmfk_params = dict(
    # Optional hook that re-generates the data matrix X before each NMFk operation.
    # Without it, a slice of the original X is taken, which is equal to serial decomposition.
    # The ``generate_X_callback`` object should be a class defining ``def __call__(original_indices)``
    # so that ``new_X=generate_X_callback(original_indices)`` works.
    # ``original_indices`` holds the indices of the samples (columns of the original X when clustering on H).
    generate_X_callback=custom_callback,
    # NMFk parameters can be given per depth, or shared by all depths.
    # A single-element list, as here, uses the same parameters at every depth.
    # To set them per depth, append to the list,
    # for example [nmfk_params0, nmfk_params1, nmfk_params2] for a depth of 2.
    nmfk_params=[nmfk_params],
    # Which factor to cluster on, W or H:
    # for W the rows of X should be samples,
    # for H the columns of X should be samples.
    cluster_on="H",
    # How deep to go in each topic after the root node.
    # With -1 it continues until the samples cannot be separated further.
    depth=2,
    # Stopping criterion on the number of samples.
    sample_thresh=5,
    # With K2=True, decomposition is done only for k=2 instead of
    # finding and predicting the number of stable latent features.
    K2=False,
    # After the first NMFk, smallest k of the Ks search range.
    Ks_deep_min=1,
    # After the first NMFk, largest k of the Ks search range.
    # With None, the maximum k equals the k selected for the parent node.
    Ks_deep_max=None,
    # After the first NMFk, step size of the Ks search range.
    Ks_deep_step=1,
    # where to save
    experiment_name=name,
)

# Run HNMFk

In [16]:
model = HNMFk(**hnmfk_params)
model.fit(X, Ks)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Example Traverse Graph

 The functions below can be used to walk the graph:

 ```python
model.traverse_nodes()
model.go_to_root()
model.get_node()
model.go_to_parent()
model.go_to_children(idx:int)
 ```

 We can reset the iterator to go back to the root node as shown below:

In [17]:
node = model.go_to_root()
node["name"]

'node0'

 The HNMFk class includes an iterator that enables walking the graph nodes. The node the iterator is currently at can be obtained as shown below (it always starts at the root node):

In [18]:
node = model.get_node()
node.keys()

dict_keys(['node_num', 'name', 'node_id', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_name', 'child_node_names', 'original_indices', 'num_samples', 'leaf'])

 We can also see the name of the node:

In [19]:
node["name"]

'node0'

 And we can see the child nodes:

In [20]:
node["child_node_names"]

['node1', 'node4']

 We can go to a child node by its index in `child_node_names`. For example, index 0 is the first child; below we pass index 1 to go to the second child (`node4`). Going to a child returns that node and moves the iterator to it.

In [21]:
node = model.go_to_children(1)
node["name"]

'node4'

In [22]:
node.keys()

dict_keys(['node_num', 'name', 'node_id', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_name', 'child_node_names', 'original_indices', 'num_samples', 'leaf'])

 Take a look at the parent node, which should be the root:

In [23]:
node["parent_node_name"]

'node0'

get_node() always returns the node we are currently at:

In [24]:
node = model.get_node()
node["name"]

'node4'

Go back to parent:

In [25]:
node = model.go_to_parent()
node["name"]

'node0'

 From each node, we can get the samples that were clustered in the node:

In [26]:
node["original_indices"][:5]

array([0, 1, 2, 3, 4])

We can also check whether a given node in the graph is a leaf node:

In [27]:
node["leaf"]

False

 Finally, we can obtain all the nodes using the following method:

In [28]:
all_nodes = model.traverse_nodes()
len(all_nodes)

7

In [29]:
all_nodes[0]

{'node_num': 2,
 'name': 'node2',
 'node_id': 'a60bc910-f868-11ee-af0e-7cc25582da66',
 'depth': 2,
 'W': array([[0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00],
        [1.28648204e-13, 6.84742164e-03],
        [1.28647893e-13, 6.84641954e-03],
        [1.28724871e-13, 1.36939092e-02],
        [1.28648082e-13, 6.84892293e-03],
        [5.27898083e-03, 1.46872684e-04],
        [8.18576477e-03, 1.27560844e-13],
        [0.00000000e+00, 0.00000000e+00],
        [8.19000416e-03, 1.27560899e-13],
        [0.00000000e+00, 0.00000000e+00],
        [1.28725020e-13, 1.36969667e-02],
        [0.00000000e+00, 0.00000000e+00],
        [1.28648137e-13, 6.84870966e-03],
        [0.00000000e+00, 0.00000000e+00],
        [1.28724993e-13, 1.36947362e-02],
        [0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00],
        [1.287

# Organize Leaf Nodes

In [35]:
# Collect every leaf of the hierarchy together with its documents and
# the 50 most frequent 1-grams and 2-grams of its cleaned abstracts.
leaf_nodes = []

for node in all_nodes:
    if not node["leaf"]:
        continue  # only leaves are summarized

    current_documents = df.iloc[node["original_indices"]]
    # One call per n-gram size; each call receives its own freshly built dict.
    top_1grams, top_2grams = (
        get_top_words(
            current_documents["clean_abstract"].to_dict(),
            top_n=50,
            n_gram=gram_size,
            verbose=False,
            filename=None,
        )
        for gram_size in (1, 2)
    )

    # Fields copied unchanged from the node, followed by the derived ones.
    copied_fields = {
        field: node[field]
        for field in ("name", "depth", "parent_node_name", "child_node_names", "num_samples")
    }
    leaf_nodes.append({
        **copied_fields,
        "sample_indices": node["original_indices"],
        "documents": current_documents,
        "top_1grams": top_1grams,
        "top_2grams": top_2grams,
    })

len(leaf_nodes)

4

### Look at top 2-grams in each leaf node

In [38]:
leaf_nodes[0]["top_2grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,scada system,452,113,0.356467,1.477124
1,tensor decomposition,339,113,0.356467,1.107843
2,machine learn,311,212,0.66877,1.01634
3,build dataset,297,99,0.312303,0.970588
4,grid system,226,113,0.356467,0.738562


In [39]:
leaf_nodes[1]["top_2grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,latency associate,434,217,1.0,2.384615
1,collective communication,333,217,1.0,1.82967
2,sparse matrix,333,217,1.0,1.82967
3,memory complex,318,217,1.0,1.747253
4,batch copy,318,217,1.0,1.747253


In [40]:
leaf_nodes[2]["top_2grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,sub topic,500,100,1.0,3.311258
1,topic model,300,100,1.0,1.986755
2,hierarchical senmfk,300,100,1.0,1.986755
3,text document,200,100,1.0,1.324503
4,document matrix,200,100,1.0,1.324503


In [41]:
leaf_nodes[3]["top_2grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,malware family,1294,306,1.0,3.387435
1,novel malware,604,306,1.0,1.581152
2,semi supervise,384,188,0.614379,1.005236
3,malware dna,360,90,0.294118,0.942408
4,family classification,326,208,0.679739,0.853403
