# Libraries

In [1]:
import os
nnn = 1
os.environ["OMP_NUM_THREADS"] = str(nnn) # export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = str(nnn) # export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = str(nnn) # export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = str(nnn) # export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = str(nnn)  # export NUMEXPR_NUM_THREADS=1

In [2]:
from TELF.factorization.HNMFk import HNMFk
from TELF.pre_processing import Beaver
from TELF.pre_processing.Vulture.tokens_analysis.top_words import get_top_words

In [3]:
from TELF.pre_processing import Vulture
from TELF.pre_processing.Vulture.modules import SimpleCleaner
from TELF.pre_processing.Vulture.modules import LemmatizeCleaner
from TELF.pre_processing.Vulture.modules import RemoveNonEnglishCleaner
from TELF.pre_processing.Vulture.default_stop_words import STOP_WORDS
from TELF.pre_processing.Vulture.default_stop_phrases import STOP_PHRASES

In [4]:
import pandas as pd
import numpy as np
import scipy.sparse
import os

In [5]:
import TELF;TELF.__version__

'0.0.40'

# Load Data

In [6]:
df = pd.read_csv(os.path.join("..", "..", "data", "sample2.csv"))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   eid               235 non-null    object 
 1   s2id              230 non-null    object 
 2   doi               235 non-null    object 
 3   title             235 non-null    object 
 4   abstract          232 non-null    object 
 5   year              235 non-null    int64  
 6   authors           235 non-null    object 
 7   author_ids        235 non-null    object 
 8   affiliations      235 non-null    object 
 9   funding           109 non-null    object 
 10  PACs              95 non-null     object 
 11  publication_name  235 non-null    object 
 12  subject_areas     235 non-null    object 
 13  s2_authors        230 non-null    object 
 14  s2_author_ids     230 non-null    object 
 15  citations         201 non-null    object 
 16  references        191 non-null    object 
 1

# Clean

In [7]:
steps = [
    RemoveNonEnglishCleaner(ascii_ratio=0.9, stopwords_ratio=0.25),
    SimpleCleaner(stop_words = STOP_WORDS,
                  stop_phrases = STOP_PHRASES,
                  order = [
                      'standardize_hyphens',
                      'isolate_frozen',
                      'remove_copyright_statement',
                      'remove_stop_phrases',
                      'make_lower_case',
                      'remove_formulas',
                      'normalize',
                      'remove_next_line',
                      'remove_email',
                      'remove_()',
                      'remove_[]',
                      'remove_special_characters',
                      'remove_nonASCII_boundary',
                      'remove_nonASCII',
                      'remove_tags',
                      'remove_stop_words',
                      'remove_standalone_numbers',
                      'remove_extra_whitespace',
                      'min_characters',
                  ]
                 ),
    LemmatizeCleaner('spacy'),
]

In [8]:
vulture = Vulture(n_jobs=1, verbose=10)
df = vulture.clean_dataframe(df=df, 
                        columns=["abstract", "title"],
                        append_to_original_df=True,
                        concat_cleaned_cols=True
                        )

[Vulture]: Cleaning 235 documents
  0%|          | 0/1 [00:00<?, ?it/s][Vulture]: Running SimpleCleaner module

  0%|          | 0/235 [00:00<?, ?it/s]

100%|██████████| 235/235 [00:00<00:00, 339.95it/s]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235 entries, 0 to 234
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   eid                   235 non-null    object 
 1   s2id                  230 non-null    object 
 2   doi                   235 non-null    object 
 3   title                 235 non-null    object 
 4   abstract              235 non-null    object 
 5   year                  235 non-null    int64  
 6   authors               235 non-null    object 
 7   author_ids            235 non-null    object 
 8   affiliations          235 non-null    object 
 9   funding               109 non-null    object 
 10  PACs                  95 non-null     object 
 11  publication_name      235 non-null    object 
 12  subject_areas         235 non-null    object 
 13  s2_authors            230 non-null    object 
 14  s2_author_ids         230 non-null    object 
 15  citations             2

# Build Matrix

In [10]:
DATA_COLUMN = 'clean_abstract_title'
RESULTS = "result_example"
HIGHLIGHT_WORDS = ['tensor', 'cybersecurity', 'malware', 'analysis']
HIGHLIGHT_WEIGHTS = [2 for i in HIGHLIGHT_WORDS]
beaver = Beaver()
os.makedirs(RESULTS, exist_ok=True)
settings = {
    "dataset" : df,
    "target_column" : DATA_COLUMN,
    'highlighting': HIGHLIGHT_WORDS,
    'weights':HIGHLIGHT_WEIGHTS,
    "matrix_type" : "tfidf",
    "save_path" : RESULTS
}
X, vocabulary = beaver.documents_words(**settings)



In [11]:
# put the samples to columns and features (tokens) to rows
X = X.T.tocsr()
X.shape

(686, 235)

In [12]:
X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 24806 stored elements and shape (686, 235)>

In [13]:
vocabulary[:10]

array(['ability', 'abstaining', 'accelerated', 'accuracy', 'accurate',
       'achieved', 'acquisition', 'activation', 'activity', 'adding'],
      dtype=object)

In [14]:
assert X.shape[1] == len(df)

# Custom Callback Class

In [15]:
class CustomSemanticCallback:

    def __init__(self, 
                 df:pd.DataFrame, 
                 vocabulary:list,
                 target_column="clean_abstract_title",
                 options={"min_df": 5, "max_df": 0.5},
                 matrix_type="tfidf") -> None:
        #
        # Data
        #
        self.df = df
        self.vocabulary = vocabulary
        
        #
        # Beaver Settings
        #
        self.target_column = target_column
        # use the same vocabulary on each NMFk decomposition
        options["vocabulary"] = self.vocabulary
        self.options = options
        self.matrix_type = matrix_type
    
    def __call__(self, original_indices:np.ndarray) -> scipy.sparse.csr_matrix:
        
        current_beaver = Beaver()
        current_df = self.df.iloc[original_indices].copy()
        current_beaver_matrix_settings = {
            "dataset":current_df,
            "target_column":self.target_column,
            "options":self.options,
            "matrix_type":self.matrix_type,
            "save_path":None
        }
        current_X, vocab = current_beaver.documents_words(**current_beaver_matrix_settings)
        # put the samples to columns and features (tokens) to rows
        current_X = current_X.T.tocsr()
        # below dictionary can be used to save additional information in each node
        save_at_node = {"vocab":vocab}
        return current_X, save_at_node

In [16]:
custom_callback = CustomSemanticCallback(df=df, vocabulary=vocabulary)

# Settings

In [17]:
Ks = np.arange(1, 11, 1)
perts = 2
iters = 200
eps = 0.005
init = "nnsvd"
save_path = "Semantic_HNMFk_results_path"
name = "example_Semantic_HNMFk"

In [18]:
nmfk_params = {
    "n_perturbs":perts,
    "n_iters":iters,
    "epsilon":eps,
    "n_jobs":-1,
    "init":init, 
    "use_gpu":False,
    "save_path":save_path, 
    "predict_k_method":"sill",
    "verbose":False,
    "nmf_verbose":False,
    "transpose":False,
    "sill_thresh":0.8,
    "pruned":True,
    'nmf_method':'nmf_fro_mu',
    "calculate_error":False,
    "use_consensus_stopping":0,
    "calculate_pac":False,
    "consensus_mat":False,
    "perturb_type":"uniform",
    "perturb_multiprocessing":False,
    "perturb_verbose":False,
    "simple_plot":True
}

In [19]:
hnmfk_params = {
    # This can be used to re-generate the data matrix X before each NMFk operation. 
    # When not used, slice of original X is taken, which is equal to serial decomposition.
    # ``generate_X_callback`` object should be a class with ``def __call__(original_indices)`` 
    # defined so that ``new_X, save_at_node=generate_X_callback(original_indices)`` can be done.
    # ``original_indices`` hyper-parameter is the indices of samples (columns of original X when clustering on H).
    # Here ``save_at_node`` is a dictionary that can be used to save additional information in each 
    # node's ``user_node_data`` variable. 
    "generate_X_callback":custom_callback,
    # we can specify nmfk parameters for each depth, or use same for all depth
    # below will use the same nmfk parameters for all depths
    # when using for each depth, append to the list 
    # for example, [nmfk_params0, nmfk_params1, nmfk_params2] for depth of 2
    "nmfk_params": [nmfk_params], 
    # where to perform clustering, can be W or H
    # if W, row of X should be samples
    # if H, columns of X should be samples
    "cluster_on":"H",
    # how deep to go in each topic after root node
    # if -1, it goes until samples cannot be seperated further
    "depth":2,
    # stopping criteria for num of samples
    "sample_thresh":5,
    # if K2=True, decomposition is done only for k=2 instead of 
    # finding and predicting the number of stable latent features
    "K2":False,
    # after first nmfk, when selecting Ks search range, minimum k to start
    "Ks_deep_min":1,
    # After first nmfk, when selecting Ks search range, maximum k to try.
    # When None, maximum k will be same as k selected for parent node.
    "Ks_deep_max": None,
    # after first nmfk, when selecting Ks search range, k step size
    "Ks_deep_step":1,
    # where to save
    "experiment_name":name,
    # What naming convention to be used for root node.
    "root_node_name":"Root"
}

# Run HNMFk

In [20]:
model = HNMFk(**hnmfk_params)
model.fit(X, Ks, from_checkpoint=False, save_checkpoint=True)



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Done


{'time': 11.705806255340576}

# Example Traverse Graph

 Below functions can be utilized to walk the graph:

 ```python
model.traverse_nodes()
model.go_to_root()
model.get_node()
model.go_to_parent()
model.go_to_children(idx:int)
model.go_to_node(node_name:str)
model.traverse_tiny_leaf_topics(threshold:int)
model.process_tiny_leaf_topics(threshold:int)
model.get_tiny_leaf_topics()
 ```

 We can reset the iterator to go back to the root node as shown below:

In [21]:
node = model.go_to_root()
node["node_name"]

'Root'

 HNMFk class includes a iterator that enables walking the graph nodes. Current node the iterator is at can be obtained as shown below (always starts at root node):

In [22]:
node = model.get_node()
node.keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

 We can also see the name of the node:

In [23]:
node["node_name"]

'Root'

 And we can see the child nodes:

In [24]:
node["child_node_names"]

['Root_0',
 'Root_1',
 'Root_2',
 'Root_3',
 'Root_4',
 'Root_5',
 'Root_6',
 'Root_7',
 'Root_8',
 'Root_9']

 We can go to the child node specified with an index. For example, to go to the first child, we index at 0. When we go to the child node, it will return the child node and set the iterator to the child node.

In [25]:
node = model.go_to_children(1)
node["node_name"]

'Root_1'

In [26]:
node.keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

 Take a look at the parent node, which should be the root:

In [27]:
node["parent_node_name"]

'Root'

get_node() always returns the node we are currently at:

In [28]:
node = model.get_node()
node["node_name"]

'Root_1'

Go back to parent:

In [29]:
node = model.go_to_parent()
node["node_name"]

'Root'

 From each node, we can get the samples that was clustered in the node:

In [30]:
node["original_indices"][:5], len(node["original_indices"])

(array([0, 1, 2, 3, 4]), 235)

We can also check if a given node in the graph a leaf node

In [31]:
node["leaf"]

False

 Finally, we can obtain all the nodes using the following method. Note that while other other node iterator options above are online, meaning each node is loaded into memory one at a time, the following traversal will load all nodes into the memory:

In [32]:
all_nodes = model.traverse_nodes()
len(all_nodes)

89

In [33]:
all_nodes[0].keys()

dict_keys(['node_name', 'depth', 'W', 'H', 'k', 'parent_topic', 'parent_node_k', 'parent_node_name', 'child_node_names', 'original_child_node_names', 'original_indices', 'num_samples', 'leaf', 'user_node_data', 'cluster_indices_in_parent', 'node_save_path', 'parent_node_factors_path', 'parent_node_save_path', 'exception', 'signature', 'probabilities', 'centroids', 'factors_path'])

# Organize Leaf Nodes

In [34]:
leaf_nodes = []

for node in all_nodes:
    if node["leaf"]:
        current_documents = df.iloc[node["original_indices"]]
        top_1grams = get_top_words(current_documents["clean_abstract_title"].to_dict(), top_n=50, n_gram=1, verbose=False, filename=None)
        top_2grams = get_top_words(current_documents["clean_abstract_title"].to_dict(), top_n=50, n_gram=2, verbose=False, filename=None)
        top_3grams = get_top_words(current_documents["clean_abstract_title"].to_dict(), top_n=50, n_gram=3, verbose=False, filename=None)

        leaf_nodes.append({   
            "node_name":node["node_name"], 
            "depth":node["depth"],
            "W":node["W"],
            "H":node["H"],
            "user_node_data":node["user_node_data"],
            "parent_node_name":node["parent_node_name"], 
            "child_node_names":node["child_node_names"], 
            "num_samples":node["num_samples"], 
            "sample_indices":node["original_indices"],
            "documents":current_documents,
            "top_1grams":top_1grams,
            "top_2grams":top_2grams,
            "top_3grams":top_3grams,
        })
        
len(leaf_nodes)

78

### Look at top 2-grams in each leaf node

In [35]:
leaf_nodes[0]["top_3grams"].head(10)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,ware classification malware,2,1,1.0,0.028169
1,classification malware family,2,1,1.0,0.028169
2,malware family classification,2,1,1.0,0.028169
3,interpretability crucial trust,1,1,1.0,0.014085
4,crucial trust deployment,1,1,1.0,0.014085
5,trust deployment choice,1,1,1.0,0.014085
6,deployment choice activation,1,1,1.0,0.014085
7,choice activation impacts,1,1,1.0,0.014085
8,activation impacts learning,1,1,1.0,0.014085
9,impacts learning ability,1,1,1.0,0.014085


In [36]:
leaf_nodes[1]["top_3grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,ware classification malware,2,1,1.0,0.020619
1,classification malware family,2,1,1.0,0.020619
2,malware family classification,2,1,1.0,0.020619
3,plays crucial role,2,1,1.0,0.020619
4,crucial role improving,2,1,1.0,0.020619


In [37]:
leaf_nodes[2]["top_3grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,confusion matrix detailed,2,1,1.0,0.020619
1,matrix detailed breakdown,2,1,1.0,0.020619
2,detailed breakdown classification,2,1,1.0,0.020619
3,breakdown classification models,2,1,1.0,0.020619
4,ware classification malware,2,1,1.0,0.020619


In [38]:
leaf_nodes[3]["top_3grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,ware classification malware,2,1,1.0,0.017857
1,classification malware family,2,1,1.0,0.017857
2,malware family classification,2,1,1.0,0.017857
3,activation neural networks,1,1,1.0,0.008929
4,neural networks introduces,1,1,1.0,0.008929


In [39]:
leaf_nodes[4]["top_3grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,ware classification malware,2,1,1.0,0.019048
1,classification malware family,2,1,1.0,0.019048
2,malware family classification,2,1,1.0,0.019048
3,reinforcement learning enables,2,1,1.0,0.019048
4,activation neural networks,2,1,1.0,0.019048


In [40]:
leaf_nodes[5]["top_3grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,ware classification malware,10,4,1.0,0.05618
1,classification malware family,10,4,1.0,0.05618
2,malware family classification,10,4,1.0,0.05618
3,malware dangerous costly,5,4,1.0,0.02809
4,dangerous costly cyber,5,4,1.0,0.02809


In [41]:
leaf_nodes[6]["top_3grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,multi-modal learning speech,2,1,1.0,0.01227
1,learning speech vision,2,1,1.0,0.01227
2,principal analysis pca,1,1,1.0,0.006135
3,analysis pca reducing,1,1,1.0,0.006135
4,pca reducing dimensionality,1,1,1.0,0.006135


In [42]:
leaf_nodes[7]["top_3grams"].head(5)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,principal analysis pca,1,1,1.0,0.006369
1,analysis pca reducing,1,1,1.0,0.006369
2,pca reducing dimensionality,1,1,1.0,0.006369
3,reducing dimensionality preserving,1,1,1.0,0.006369
4,dimensionality preserving variance,1,1,1.0,0.006369


In [43]:
leaf_nodes[8]["top_3grams"].head(10)

Unnamed: 0,word,tf,df,df_fraction,tf_fraction
0,machine learning models,2,1,1.0,0.013072
1,cybersecurity frameworks nist,2,1,1.0,0.013072
2,frameworks nist guidelines,2,1,1.0,0.013072
3,nist guidelines risk,2,1,1.0,0.013072
4,guidelines risk assessment,2,1,1.0,0.013072
5,risk assessment mitigation,2,1,1.0,0.013072
6,hyperparameter tuning optimal,1,1,1.0,0.006536
7,tuning optimal machine,1,1,1.0,0.006536
8,optimal machine learning,1,1,1.0,0.006536
9,learning models adversarial,1,1,1.0,0.006536
