# Load Data

In [1]:
import pandas as pd
import os
# Locate the sample corpus relative to this notebook: ../../data/sample.csv.
sample_csv = os.path.join("..", "..", "data", "sample.csv")
df = pd.read_csv(sample_csv)

# Summarise the columns, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   eid             940 non-null    object
 1   title           940 non-null    object
 2   year            940 non-null    int64 
 3   abstract        940 non-null    object
 4   authors         940 non-null    object
 5   author_ids      940 non-null    object
 6   references      843 non-null    object
 7   clean_abstract  940 non-null    object
dtypes: int64(1), object(7)
memory usage: 58.9+ KB


# Build Matrix

In [2]:
from TELF.pre_processing import Beaver

# Beaver instance reused for both the vocabulary and the matrix steps.
beaver = Beaver()

# Keyword arguments for get_vocabulary, held in a dict so they stay inspectable.
beaver_vocab_settings = dict(
    dataset=df,
    target_column="clean_abstract",
    min_df=10,
    max_df=0.5,
)

vocabulary = beaver.get_vocabulary(**beaver_vocab_settings)

# Display how many tokens were kept.
len(vocabulary)

467

In [3]:
# Keyword arguments for Beaver.documents_words: matrix_type "tfidf" over the
# clean_abstract column, using the previously built vocabulary; save_path is None.
# NOTE(review): min_df here (5) differs from the min_df=10 used when the
# vocabulary was built — presumably inert once a fixed vocabulary is passed; confirm.
beaver_matrix_settings = dict(
    dataset=df,
    target_column="clean_abstract",
    options=dict(min_df=5, max_df=0.5, vocabulary=vocabulary),
    matrix_type="tfidf",
    save_path=None,
)

# Only the first returned item (the matrix) is kept; the second is discarded.
X, _ = beaver.documents_words(**beaver_matrix_settings)

In [4]:
# Flip the orientation so features (tokens) are rows and samples are columns.
# NOTE: X is rebound to its own transpose, so running this cell a second time
# flips it back — re-run the matrix-building cell first before repeating it.
X = X.transpose().tocsr()
X.shape

(467, 940)

# Factorize

In [5]:
from TELF.factorization import NMFk

# Constructor settings for NMFk. Keys and values are passed through unchanged
# as keyword arguments; see the TELF documentation for their exact semantics.
params = {
    # Perturbation / solver settings.
    "n_perturbs": 12,
    "n_iters": 100,
    "epsilon": 0.015,  # or (0.015, 0.015) which it does automatically
    "n_jobs": -1,  # the captured output reports n_jobs: 12 for the recorded run
    "init": "nnsvd",

    # Hardware, logging and plotting switches.
    "use_gpu": False,
    "verbose": True,
    "nmf_verbose": False,
    "perturb_verbose": False,
    "perturb_multiprocessing": False,
    "simple_plot": True,
    "pruned": True,

    # Results are written under ../../results and also collected in memory.
    "save_path": os.path.join("..", "..", "results"),
    "save_output": True,
    "collect_output": True,

    # k-prediction / search settings.
    "transpose": False,
    "calculate_error": True,
    "predict_k": True,
    "predict_k_method": "W_sill",
    "sill_thresh": 0.8,
    "H_sill_thresh": 0.1,
    "k_search_method": "bst_pre",

    # Factorization method settings.
    "use_consensus_stopping": 0,
    "nmf_method": "nmf_fro_mu",
    "perturb_type": "uniform",
}

# Candidate ranks to search: 1 through 15 inclusive.
Ks = range(1, 16)
# Run label; it appears as the prefix of the saved results directory.
name = "Example_SeaLion"
# Free-text note for the run. The model constructed below is NMFk, so the
# note names NMFk (it previously said "WNMFk", which mislabelled the run).
note = "This is an example run of NMFk"

model = NMFk(**params)

n_perturbs: 12
perturb_type: uniform
n_iters: 100
epsilon: 0.015
init: nnsvd
save_path: ../../results
save_output: True
use_gpu: False
verbose: True
nmf_verbose: False
perturb_verbose: False
transpose: False
collect_output: True
sill_thresh: 0.8
predict_k: True
predict_k_method: W_sill
n_jobs: 12
n_nodes: 1
nmf: <function nmf at 0x1746111c0>
nmf_method: nmf_fro_mu
nmf_obj_params: {}
clustering_obj_params: {}
pruned: True
calculate_error: True
consensus_mat: False
use_consensus_stopping: 0
mask: None
calculate_pac: False
simple_plot: True
get_plot_data: False
perturb_multiprocessing: False
k_search_method: bst_pre
H_sill_thresh: 0.1
factor_thresholding: None
factor_thresholding_H_regression: None
factor_thresholding_obj_params: {}
factor_thresholding_H_regression_obj_params: {}
clustering_method: kmeans
device: [-1]
lock: <unlocked _thread.lock object at 0x1743c8a80>
K_search_settings: {'lock': <unlocked _thread.lock object at 0x1743e3f80>, 'k_search_method': 'bst_pre', 'sill_thresh': 0



In [6]:
# Fit the NMFk model on X over the candidate ranks in Ks, passing the run
# name and note defined with the model settings; the returned value is kept
# in `results` for inspection and for the SeaLion report.
results = model.fit(X, Ks, name, note)

Performing K search with bst_pre. Ks=[8, 4, 2, 1, 3, 6, 5, 7, 12, 10, 9, 11, 14, 13, 15]


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 15/15 [00:03<00:00,  4.20it/s]


In [7]:
# List the entries returned by the fit.
results.keys()

dict_keys(['time', 'k_predict', 'W', 'H', 'other_results'])

In [8]:
# Show the value stored under "k_predict" — presumably the rank selected
# because predict_k=True was set; confirm against the TELF documentation.
results["k_predict"]

9

In [9]:
# Wrap this run's full save path in a list; the list is later handed to
# SeaLion through its "factorization_paths" setting.
factorization_paths = [model.save_path_full]
factorization_paths

['../../results/Example_SeaLion_12perts_100iters_0.015eps_nnsvd-init']

# Prepare SeaLion Report

In [16]:
# Column labels for the report: the first five characters of each document's eid.
cols = [eid[:5] for eid in df.eid.tolist()]

In [20]:
from TELF.post_processing.SeaLion import SeaLion
import numpy as np

# Densify X once and reuse it for the masks and the "X" entry below.
# Comparing the sparse matrix itself with 0 (`X == 0`) is inefficient in SciPy
# and is presumably what triggered the warning captured under this cell; the
# dense comparison yields the zero / non-zero coordinates without that detour.
X_dense = X.toarray()

# Keyword arguments for the SeaLion post-processor.
settings = {
    "save_path": "SeaLion_post_processing/",
    # (row, col) coordinates of zero and non-zero entries of X, respectively.
    "UNKNOWN_MASK": np.argwhere(X_dense == 0),
    "KNOWN_MASK": np.argwhere(X_dense != 0),
    "recommend_probabilities": False,
    "X": X_dense,
    "W": results["W"],
    "S": None,  # When using TriNMFk, we will have S latent factors
    "H": results["H"],
    "bu": None,
    "bi": None,
    "global_mean": 0,
    # Rows of X are vocabulary tokens; columns are documents labelled by `cols`.
    "rows": list(vocabulary),
    "cols": cols,
    "rows_name": "Words",
    "cols_name": "Documents",
    "num_top_words": 10,
    "num_top_recommendations": 10,
    "factorization_paths": factorization_paths,
    "verbose": True,
}
post_processor = SeaLion(**settings)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [21]:
# Invoke the configured SeaLion instance to run the post-processing.
post_processor()

Starting general post-processing
[Errno 17] File exists: 'SeaLion_post_processing/factorization_results/Example_SeaLion_12perts_100iters_0.015eps_nnsvd-init'
Skipping getting S mixing matrix patterns
Done
