# Libraries

In [1]:
import os

# Limit the numerical backends to `nnn` thread(s) each. This cell runs before
# the NumPy/TELF imports below; thread-pool libraries typically read these
# variables when they are first loaded.
# Shell equivalent for each entry: `export <NAME>=1`.
nnn = 1
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        str(nnn),
    )
)

In [2]:
from TELF.factorization.HNMFk import HNMFk
import numpy as np
import os

In [3]:
import sys

# Make the helper scripts directory importable so that generate_X can be found.
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate entries to sys.path.
SCRIPTS_DIR = os.path.join("..", "..", "scripts")
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

from generate_X import gen_data, gen_data_sparse

In [4]:
import TELF
TELF.__version__

'0.0.40'

# Generate Synthetic Data

In [5]:
# Sparse example matrix (requested shape 500 x 500, density 0.01).
# NOTE(review): Xsp is not used by any later cell in this notebook.
Xsp = gen_data_sparse(shape=[500, 500], density=0.01)["X"]
# Dense example matrix (requested shape 500 x 500); this is the matrix that is fit below.
# R=4 is presumably the number of planted latent features — TODO confirm in generate_X.
X = gen_data(R=4, shape=[500, 500])["X"]



# Settings

In [6]:
# Candidate k values passed to model.fit(): 1, 2, ..., 9.
Ks = np.arange(1, 10)

# Forwarded to the NMFk settings as n_perturbs, n_iters and epsilon.
perts, iters, eps = 2, 1000, 0.015

# Forwarded to the NMFk settings as "init" and "save_path".
init = "nnsvd"
save_path = "HNMFk_results_path"

# Experiment name; results are written under os.path.join("results", name).
name = "example_HNMFk3"

In [7]:
# NMFk settings used at each node of the hierarchy; this dict is handed to
# HNMFk through hnmfk_params["nmfk_params"].
nmfk_params = {
    "n_perturbs": perts,
    "n_iters": iters,
    "epsilon": eps,
    "n_jobs": 2,
    "init": init,
    "use_gpu": False,
    "save_path": save_path,
    "predict_k_method": "sill",
    "verbose": False,
    "nmf_verbose": False,
    "transpose": False,
    "sill_thresh": 0.8,
    "pruned": True,
    "nmf_method": "nmf_fro_mu",
    "calculate_error": False,
    "use_consensus_stopping": 0,
    "calculate_pac": False,
    "consensus_mat": False,
    "perturb_type": "uniform",
    "perturb_multiprocessing": False,
    "perturb_verbose": False,
    "simple_plot": True,
    "k_search_method": "bst_post",
    "H_sill_thresh": 0.1,
}

In [8]:
hnmfk_params = {
    # NMFk parameters, given as a list.
    # A single entry (as here) applies the same parameters at every depth.
    # To use different parameters per depth, supply one entry per depth,
    # e.g. [nmfk_params0, nmfk_params1, nmfk_params2] for a depth of 2.
    "nmfk_params": [nmfk_params],

    # Which factor to cluster on: "W" or "H".
    #   "W": the rows of X should be the samples
    #   "H": the columns of X should be the samples
    "cluster_on": "H",

    # How deep to go in each topic after the root node.
    # With -1, it keeps going until the samples cannot be separated further.
    "depth": 3,

    # Stopping criterion on the number of samples.
    "sample_thresh": 100,

    # If K2=True, decomposition is done only for k=2 instead of finding and
    # predicting the number of stable latent features.
    "K2": False,

    # Ks search range used after the first NMFk:
    #   minimum k to start from
    "Ks_deep_min": 1,
    #   maximum k to try; when None, the maximum k is the k selected for the
    #   parent node
    "Ks_deep_max": 20,
    #   step size between k values
    "Ks_deep_step": 1,

    # Where to save the results.
    "experiment_name": os.path.join("results", name),

    # When True, nodes are named randomly.
    # When False, the k index is used for ancestry naming.
    "random_identifiers": False,

    # Naming convention used for the root node.
    "root_node_name": "Root",
}

In [9]:
# Build the hierarchical model from the settings above and fit it on the dense
# matrix X, using Ks as the candidate k values.
# NOTE(review): from_checkpoint=True presumably resumes from results already
# saved under experiment_name, so a re-run may reuse earlier work — confirm
# against the TELF documentation.
model = HNMFk(**hnmfk_params)
model.fit(X, Ks, from_checkpoint=True, save_checkpoint=True)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Done


{'time': 5.2496418952941895}

# Example Traverse Graph

 The following functions can be used to walk the graph:

 ```python
model.traverse_nodes()
model.go_to_root()
model.get_node()
model.go_to_parent()
model.go_to_children(idx:int)
model.go_to_node(node_name:str)
model.traverse_tiny_leaf_topics(threshold:int)
model.process_tiny_leaf_topics(threshold:int)
model.get_tiny_leaf_topics()
 ```

 We can reset the iterator to go back to the root node as shown below:

In [10]:
node = model.go_to_root()
node["node_name"]

'Root'

 The HNMFk class includes an iterator that enables walking the graph nodes. The node the iterator is currently at can be obtained as shown below (it always starts at the root node):

In [11]:
node = model.get_node()
node.keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

 We can also see the name of the node:

In [12]:
node["node_name"]

'Root'

 And we can see the child nodes:

In [13]:
node["child_node_names"]

['Root_0', 'Root_1', 'Root_2', 'Root_3']

 We can go to a child node by its index. For example, to go to the first child we index at 0; below we use index 1 to go to the second child. Going to a child node returns that node and moves the iterator to it.

In [14]:
node = model.go_to_children(1)
node["node_name"]

'Root_1'

In [15]:
node["factors_path"]

In [16]:
node.keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

In [17]:
node["node_save_path"]

'results/example_HNMFk3/depth_1/Root_1/node_Root_1.p'

In [18]:
node["parent_node_save_path"]

'results/example_HNMFk3/depth_0/Root/node_Root.p'

Go to a specific node

In [19]:
node = model.go_to_node("Root_3")
node["child_node_names"]

['Root_3_0', 'Root_3_1', 'Root_3_2', 'Root_3_3']

 Take a look at the parent node, which should be the root:

In [20]:
node["parent_node_name"]

'Root'

get_node() always returns the node we are currently at:

In [21]:
node = model.get_node()
node["node_name"]

'Root_3'

Go back to parent:

In [22]:
node = model.go_to_parent()
node["node_name"]

'Root'

 From each node, we can get the samples that were clustered in the node:

In [23]:
node["original_indices"][:5]

array([0, 1, 2, 3, 4])

We can also look at which of the parent H clustering indices resulted in the samples in current cluster.

In [24]:
node = model.go_to_node("Root_3_0")
print("Original indices from X=", node["original_indices"])
print("H clustering indices from parent topic=", node["cluster_indices_in_parent"])

Original indices from X= [ 32  38  55  59 140 176 178 204 274 299 305 338 345 413 415 477 485 489
 496]
H clustering indices from parent topic= [ 27  32  47  50 121 152 153 174 237 260 265 296 303 361 363 415 422 424
 430]


For the root node this list is empty because it does not have a parent:

In [25]:
node = model.go_to_root()
print("H clustering indices from parent topic=", node["cluster_indices_in_parent"])

H clustering indices from parent topic= []


We can also check whether a given node in the graph is a leaf node:

In [26]:
node["leaf"]

False

 Finally, we can obtain all the nodes using the following method. Note that while the other node iterator options above are online, meaning each node is loaded into memory one at a time, the following traversal will load all nodes into memory:

In [27]:
all_nodes = model.traverse_nodes()
len(all_nodes)

13

In [28]:
# Gather the leaf nodes and the original sample indices assigned to them.
indices = []
leaf_nodes = []
for node in all_nodes:
    if not node["leaf"]:
        continue
    indices.extend(node["original_indices"])
    leaf_nodes.append(node)
indices.sort()

# Sanity check: the leaves should hold every sample exactly once. Samples are
# the columns of X here because clustering is done on H.
# np.array_equal returns False on a length mismatch, so a failing check is
# reported as a clean AssertionError rather than an unrelated error from an
# element-wise list/array comparison.
assert np.array_equal(indices, np.arange(X.shape[1])), (
    "leaf nodes do not cover every sample of X exactly once"
)

In [29]:
len(leaf_nodes)

10

In [30]:
leaf_nodes[0].keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

In [31]:
leaf_nodes[0]["centroids"].shape

(4, 16)

In [32]:
leaf_nodes[0]["signature"].shape

(500,)

In [33]:
leaf_nodes[0]["probabilities"].shape

(16,)

In [34]:
model.go_to_node("Root_2").keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

# Processing the graph for nodes with small number of documents

Look at outlier clusters at the leaves, where the number of documents is less than the given threshold:

In [35]:
threshold = 5

# Print a short summary of every leaf with fewer samples than `threshold`.
for node in model.traverse_tiny_leaf_topics(threshold):
    for field in ("node_name", "num_samples", "leaf", "parent_node_name"):
        print(f"{field}=", node[field])
    print("---------")

node_name= Root_3_1
num_samples= 2
leaf= True
parent_node_name= Root_3
---------
node_name= Root_3_3_0
num_samples= 3
leaf= True
parent_node_name= Root_3_3
---------


Above operation for traversing the graph did not make any changes to the graph. We can look at the parent of these tiny nodes to see that these nodes are still there:

In [36]:
# For each tiny leaf, load its parent and list the parent's children.
# Note: go_to_node() also moves the graph iterator to the node it returns.
for node in model.traverse_tiny_leaf_topics(threshold):
    print("Tiny node name= ", node["node_name"])
    print("Child nodes of the parent of tiny node= ", model.go_to_node(node["parent_node_name"])["child_node_names"])
    print("---------")

Tiny node name=  Root_3_1
Child nodes of the parent of tiny node=  ['Root_3_0', 'Root_3_1', 'Root_3_2', 'Root_3_3']
---------
Tiny node name=  Root_3_3_0
Child nodes of the parent of tiny node=  ['Root_3_3_0', 'Root_3_3_1', 'Root_3_3_2', 'Root_3_3_3']
---------


We can also process the graph to remove these nodes and save them separately. If we try to load these nodes now, it will give an error because we have not run the processing yet.

In [37]:
model.node_save_paths

{'Root': 'results/example_HNMFk3/depth_0/Root/node_Root.p',
 'Root_0': 'results/example_HNMFk3/depth_1/Root_0/node_Root_0.p',
 'Root_1': 'results/example_HNMFk3/depth_1/Root_1/node_Root_1.p',
 'Root_2': 'results/example_HNMFk3/depth_1/Root_2/node_Root_2.p',
 'Root_3': 'results/example_HNMFk3/depth_1/Root_3/node_Root_3.p',
 'Root_3_0': 'results/example_HNMFk3/depth_2/Root_3_0/node_Root_3_0.p',
 'Root_3_1': 'results/example_HNMFk3/depth_2/Root_3_1/node_Root_3_1.p',
 'Root_3_2': 'results/example_HNMFk3/depth_2/Root_3_2/node_Root_3_2.p',
 'Root_3_3': 'results/example_HNMFk3/depth_2/Root_3_3/node_Root_3_3.p',
 'Root_3_3_0': 'results/example_HNMFk3/depth_3/Root_3_3_0/node_Root_3_3_0.p',
 'Root_3_3_1': 'results/example_HNMFk3/depth_3/Root_3_3_1/node_Root_3_3_1.p',
 'Root_3_3_2': 'results/example_HNMFk3/depth_3/Root_3_3_2/node_Root_3_3_2.p',
 'Root_3_3_3': 'results/example_HNMFk3/depth_3/Root_3_3_3/node_Root_3_3_3.p'}

In [38]:
tiny_leafs = model.get_tiny_leaf_topics()
tiny_leafs

Could not load the tiny leafs. Did you call process_tiny_leaf_topics(threshold:int)? [Errno 2] No such file or directory: 'results/example_HNMFk3/tiny_leafs.p'


Let's first process the graph to separate these tiny nodes based on the given threshold:

In [39]:
tiny_leafs = model.process_tiny_leaf_topics(threshold=threshold)

In [40]:
# Print a short summary of every tiny leaf returned by the processing step.
for node in tiny_leafs:
    for field in ("node_name", "num_samples", "leaf", "parent_node_name"):
        print(f"{field}=", node[field])
    print("---------")

node_name= Root_3_1
num_samples= 2
leaf= True
parent_node_name= Root_3
---------
node_name= Root_3_3_0
num_samples= 3
leaf= True
parent_node_name= Root_3_3
---------


Now we can also directly load them again without pre-processing the graph:

In [41]:
tiny_leafs = model.get_tiny_leaf_topics()
len(tiny_leafs)

2

They are saved in a pickle file named ```tiny_leafs.p```

In [42]:
! ls $model.experiment_name

checkpoint.p [34mdepth_0[m[m      [34mdepth_1[m[m      [34mdepth_2[m[m      [34mdepth_3[m[m      tiny_leafs.p


If we look at the parent node for these nodes now, their child node list should not have the removed tiny nodes:

In [43]:
# tiny_leafs holds the nodes loaded by get_tiny_leaf_topics(); for each one,
# load its parent from the graph and list the children the parent still has.
for node in tiny_leafs:
    print("Tiny node name= ", node["node_name"])
    print("Child nodes of the parent of tiny node= ", model.go_to_node(node["parent_node_name"])["child_node_names"])
    print("---------")

Tiny node name=  Root_3_1
Child nodes of the parent of tiny node=  ['Root_3_0', 'Root_3_2', 'Root_3_3']
---------
Tiny node name=  Root_3_3_0
Child nodes of the parent of tiny node=  ['Root_3_3_1', 'Root_3_3_2', 'Root_3_3_3']
---------


If we now try to traverse the graph for these tiny nodes, we should not get any because they are removed:

In [44]:
tiny_leafs_now = model.traverse_tiny_leaf_topics(threshold=threshold)
tiny_leafs_now

[]

We cannot directly access these tiny nodes with the graph iterator anymore since they are not listed in any child node of a node:

In [45]:
# This lookup is expected to fail, so catch the error and print its message
# instead of letting the cell raise.
try:
    model.go_to_node(tiny_leafs[0]["node_name"])["node_name"]
except Exception as e:
    print(e)

Node not found!


We can also re-process the graph with a different threshold:

In [46]:
# Re-run the processing with a larger cut-off and summarise the result.
threshold = 15
tiny_leafs = model.process_tiny_leaf_topics(threshold=threshold)

for node in tiny_leafs:
    for field in ("node_name", "num_samples", "leaf", "parent_node_name"):
        print(f"{field}=", node[field])
    print("---------")

node_name= Root_3_1
num_samples= 2
leaf= True
parent_node_name= Root_3
---------
node_name= Root_3_3_0
num_samples= 3
leaf= True
parent_node_name= Root_3_3
---------
node_name= Root_3_3_2
num_samples= 11
leaf= True
parent_node_name= Root_3_3
---------


In [47]:
tiny_leafs = model.get_tiny_leaf_topics()
len(tiny_leafs)

3

In [48]:
# For each tiny leaf loaded from disk, load its parent from the graph and list
# the children the parent still has after re-processing.
for node in tiny_leafs:
    print("Tiny node name= ", node["node_name"])
    print("Child nodes of the parent of tiny node= ", model.go_to_node(node["parent_node_name"])["child_node_names"])
    print("---------")

Tiny node name=  Root_3_1
Child nodes of the parent of tiny node=  ['Root_3_0', 'Root_3_2', 'Root_3_3']
---------
Tiny node name=  Root_3_3_0
Child nodes of the parent of tiny node=  ['Root_3_3_1', 'Root_3_3_3']
---------
Tiny node name=  Root_3_3_2
Child nodes of the parent of tiny node=  ['Root_3_3_1', 'Root_3_3_3']
---------


Reset the graph to add these back. Simply set the threshold to be ```None```:

In [49]:
model.process_tiny_leaf_topics(threshold=None)

Since these nodes are added back, we can traverse the graph and they will be found, and their parents will have the name of those tiny nodes:

In [50]:
# `threshold` still holds the value set for the last processing call (15), so
# this traversal looks for the same tiny leaves that were removed there.
for node in model.traverse_tiny_leaf_topics(threshold):
    print("Tiny node name= ", node["node_name"])
    print("Child nodes of the parent of tiny node= ", model.go_to_node(node["parent_node_name"])["child_node_names"])
    print("---------")

Tiny node name=  Root_3_1
Child nodes of the parent of tiny node=  ['Root_3_0', 'Root_3_1', 'Root_3_2', 'Root_3_3']
---------
Tiny node name=  Root_3_3_0
Child nodes of the parent of tiny node=  ['Root_3_3_0', 'Root_3_3_1', 'Root_3_3_2', 'Root_3_3_3']
---------
Tiny node name=  Root_3_3_2
Child nodes of the parent of tiny node=  ['Root_3_3_0', 'Root_3_3_1', 'Root_3_3_2', 'Root_3_3_3']
---------


We'll also see the saved tiny nodes are no longer available:

In [51]:
tiny_leafs = model.get_tiny_leaf_topics()
tiny_leafs

Could not load the tiny leafs. Did you call process_tiny_leaf_topics(threshold:int)? [Errno 2] No such file or directory: 'results/example_HNMFk3/tiny_leafs.p'


In [52]:
! ls $model.experiment_name

checkpoint.p [34mdepth_0[m[m      [34mdepth_1[m[m      [34mdepth_2[m[m      [34mdepth_3[m[m


# Loading checkpoint to have access to the graph iterator and the node objects

In [53]:
# Discard the fitted model and create a new object that is given only the
# experiment path.
del model
model = HNMFk(experiment_name=os.path.join("results", name))
# Restore the saved object state from the checkpoint (see the message printed below).
model.load_model()

Loading saved object state from checkpoint...


In [54]:
model.node_save_paths

{'Root': 'results/example_HNMFk3/depth_0/Root/node_Root.p',
 'Root_0': 'results/example_HNMFk3/depth_1/Root_0/node_Root_0.p',
 'Root_1': 'results/example_HNMFk3/depth_1/Root_1/node_Root_1.p',
 'Root_2': 'results/example_HNMFk3/depth_1/Root_2/node_Root_2.p',
 'Root_3': 'results/example_HNMFk3/depth_1/Root_3/node_Root_3.p',
 'Root_3_0': 'results/example_HNMFk3/depth_2/Root_3_0/node_Root_3_0.p',
 'Root_3_1': 'results/example_HNMFk3/depth_2/Root_3_1/node_Root_3_1.p',
 'Root_3_2': 'results/example_HNMFk3/depth_2/Root_3_2/node_Root_3_2.p',
 'Root_3_3': 'results/example_HNMFk3/depth_2/Root_3_3/node_Root_3_3.p',
 'Root_3_3_0': 'results/example_HNMFk3/depth_3/Root_3_3_0/node_Root_3_3_0.p',
 'Root_3_3_1': 'results/example_HNMFk3/depth_3/Root_3_3_1/node_Root_3_3_1.p',
 'Root_3_3_2': 'results/example_HNMFk3/depth_3/Root_3_3_2/node_Root_3_3_2.p',
 'Root_3_3_3': 'results/example_HNMFk3/depth_3/Root_3_3_3/node_Root_3_3_3.p'}

In [55]:
model.go_to_node("Root_0").keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

In [56]:
node = model.go_to_node(list(model.node_save_paths.keys())[0])
node["node_name"], node.keys()

('Root',
 dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path']))

Look at the nodes that are not completed yet

In [57]:
model.target_jobs.keys()

dict_keys([])