## Retrieve Tensor Data for Learning

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# Newsgroup categories used in this experiment; len(cats) is passed as k below.
cats = [
    'sci.med',
    'sci.space',
    'talk.politics.guns',
    'alt.atheism',
]

# Training split restricted to `cats`, with headers, footers and quoted
# replies removed from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
)

# Unigram bag-of-words counts, capped at 1000 features, with English stop
# words and terms appearing in more than 80% of the documents dropped.
count_vect = CountVectorizer(
    max_features=1000,
    analyzer='word',
    ngram_range=(1, 1),
    stop_words='english',
    max_df=0.8,
)

# toarray() gives a plain ndarray directly; the previous
# np.array(sparse.todense()) detoured through the discouraged np.matrix type.
X_train_counts = count_vect.fit_transform(newsgroups_train.data).toarray()

# X stays an independent copy (as before) so it is unaffected should
# RetrieveTensorsST modify its argument in place.
X = X_train_counts.copy()

# NOTE(review): RetrieveTensorsST is not imported in this cell; presumably it
# comes from the `from lvm_util import *` executed in a later cell -- confirm
# and make sure that import has run before this cell.
M1, M2, M3, Whiten = RetrieveTensorsST(X_train_counts, k=len(cats))

In [129]:
# Cache the computed arrays on disk so later sessions can skip the
# tensor-retrieval step (np.save appends the ".npy" suffix itself).
arrays_by_file = {
    "M1_4_medspacegunath": M1,
    "M2_4_medspacegunath": M2,
    "M3_4_medspacegunath": M3,
    "Whiten_4_medspacegunath": Whiten,
}
for file_stem, array in arrays_by_file.items():
    np.save(file_stem, array)

In [3]:
# Reload the cached arrays written by the np.save cell.
M1, M2, M3, Whiten = (
    np.load(file_name)
    for file_name in (
        "M1_4_medspacegunath.npy",
        "M2_4_medspacegunath.npy",
        "M3_4_medspacegunath.npy",
        "Whiten_4_medspacegunath.npy",
    )
)
# Quick sanity check on what was loaded.
print(M1.shape)

(1000,)


# Comparison with SVTD on Tensor Data

In [4]:
# Vocabulary fitted by the CountVectorizer: maps each kept word to its
# feature (column) index in the count matrix.
word2id = count_vect.vocabulary_

In [16]:
# NOTE(review): the star import presumably supplies learn_LVM_RJD,
# learn_LVM_SVTD, learn_LVM_Tensor14 and coherence used below -- confirm.
from lvm_util import *
from time import time
from time import perf_counter

k = 4        # number of components passed to every learner
top = 15     # forwarded to coherence() as `l`
tries = 100  # number of repetitions averaged in the report

# One list of per-run measurements for each algorithm label.
algorithm_names = ("SVTD", "TPM", "DRJD")
coherence_dict_list = {name: [] for name in algorithm_names}
time_dict_list = {name: [] for name in algorithm_names}


def _timed(fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)`` and return ``(result, elapsed_seconds)``.

    perf_counter() is used instead of time(): it is monotonic and uses the
    highest-resolution clock available, which matters for runs that only
    take a few milliseconds.
    """
    started = perf_counter()
    result = fn(*args, **kwargs)
    return result, perf_counter() - started


for _ in range(tries):
    # The result names are kept as-is: later cells print the topics from the
    # final iteration (M_rjd_de / omega_rjd_de, M_svtd / omega_svtd).
    (M_rjd_de, omega_rjd_de), elapsed = _timed(
        learn_LVM_RJD, M1, M2, M3, Whiten, k, N=3
    )
    time_dict_list['DRJD'].append(elapsed)
    coherence_dict_list['DRJD'].append(coherence(X, M_rjd_de, l=top))

    (M_svtd, omega_svtd), elapsed = _timed(
        learn_LVM_SVTD, M1, M2, M3, Whiten, k
    )
    time_dict_list['SVTD'].append(elapsed)
    coherence_dict_list['SVTD'].append(coherence(X, M_svtd, l=top))

    # Unlike the other two learners, this one is not given M1.
    (M_tpm, omega_tpm), elapsed = _timed(
        learn_LVM_Tensor14, M2, M3, Whiten, k
    )
    time_dict_list['TPM'].append(elapsed)
    coherence_dict_list['TPM'].append(coherence(X, M_tpm, l=top))
def print_time_error(name, time, error, bold=False):
    """Print one LaTeX table row followed by an ``\\hline``.

    The row has the form ``name & $\\num{time}$ & $\\num{error}$\\\\`` with
    both numbers rendered via ``%f`` (six decimals).

    Args:
        name: Label for the first column; emitted verbatim.
        time: Number for the second column. (The parameter name shadows any
            imported ``time`` function inside this function only; it is kept
            for backward compatibility with keyword callers.)
        error: Number for the third column.
        bold: When true, wrap ``name`` in ``{\\bf ...}``.
    """
    label = "{\\bf " + name + "}" if bold else name
    # Format the numbers on their own: previously `name` was spliced into the
    # %-template, so a label containing '%' broke or garbled the row.
    time_cell = "$\\num{%f}$" % (time,)
    error_cell = "$\\num{%f}$" % (error,)
    # "\\hline" is spelled with an escaped backslash; the former "\h" was an
    # invalid escape sequence that newer Python versions warn about.
    print(label + " & " + time_cell + " & " + error_cell + "\\\\\n" + "\\hline")

def make_table(time_dict, md_dict):
    """Print a complete LaTeX table of runtime and coherence per algorithm.

    Args:
        time_dict: Maps algorithm name to the value shown in the
            "Avg Runtime(ms)" column; its iteration order is the row order.
        md_dict: Maps the same algorithm names to the value shown in the
            "Avg Coherence" column.
    """
    opening_lines = (
        "\\begin{table}[!hbt!]",
        "\\begin{center}",
        "\\caption{20NewsGroup - Runtime and Coherence for Single Topic Model}",
        "\\begin{tabular}{||c | c | c | |}",
        "\\hline",
        "Algorithm Name & Avg Runtime(ms) & Avg Coherence\\\\",
        "\\hline",
    )
    print("\n".join(opening_lines))

    # One row per algorithm, looked up by name in both dictionaries.
    for algorithm, runtime in time_dict.items():
        print_time_error(algorithm, runtime, md_dict[algorithm])

    closing_lines = ("\\end{tabular}", "\\end{center}", "\\end{table}")
    print("\n".join(closing_lines))
# Mean coherence over all recorded runs, per algorithm.
coherence_dict = {
    algorithm: np.mean(scores)
    for algorithm, scores in coherence_dict_list.items()
}
# Mean runtime per algorithm, scaled by 1000 to match the table's
# "Avg Runtime(ms)" column.
time_dict = {
    algorithm: 1000 * np.mean(durations)
    for algorithm, durations in time_dict_list.items()
}
make_table(time_dict, coherence_dict)

\begin{table}[!hbt!]
\begin{center}
\caption{20NewsGroup - Runtime and Coherence for Single Topic Model}
\begin{tabular}{||c | c | c | |}
\hline
Algorithm Name & Avg Runtime(ms) & Avg Coherence\\
\hline
DRJD & $\num{3.998115}$ & $\num{-196.973950}$\\
\hline
SVTD & $\num{45.869329}$ & $\num{-193.988781}$\\
\hline
TPM & $\num{298.112507}$ & $\num{-198.812939}$\\
\hline
\end{tabular}
\end{center}
\end{table}


# Show top words in topics for different methods

In [17]:
# Word -> feature index, as fitted by the CountVectorizer.
word2id = count_vect.vocabulary_
# Reverse lookup (feature index -> word) for turning indices back into words.
id2word = {index: word for word, index in word2id.items()}

In [18]:
# Print the top-words table for the model returned by learn_LVM_RJD.
# NOTE(review): presumably 10 = words listed per topic and 4 = number of
# topics (the recorded output has 10 rows x 4 columns) -- confirm against
# print_top_words_table's definition. The recorded output also contains three
# empty tabulars, which may deserve a look.
print_top_words_table(M_rjd_de, omega_rjd_de,10,4, id2word)

\begin{center}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
god & space & health & file\\
jesus & launch & hiv & gun\\
people & nasa & 1993 & congress\\
atheists & satellite & use & control\\
atheism & edu & medical & firearms\\
does & data & 10 & mr\\
matthew & commercial & number & states\\
religious & satellites & 20 & united\\
just & year & aids & rkba\\
believe & market & april & house\\
\hline
\end{tabular}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
\\
\\
\\
\\
\\
\\
\\
\\
\\
\\
\hline
\end{tabular}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
\\
\\
\\
\\
\\
\\
\\
\\
\\
\\
\hline
\end{tabular}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
\\
\\
\\
\\
\\
\\
\\
\\
\\
\\
\hline
\end{tabular}

\end{center}
[['god' 'space' 'health' 'file']
 ['jesus' 'launch' 'hiv' 'gun']
 ['people' 'nasa' '1993' 'congress']
 ['atheists' 'satellite' 'use' 'control']
 ['atheism' 'edu' 'medical' 'firearms']
 ['does' 'data' '10' 'mr']
 ['matthew' 'commercial' 'number' 'states']
 ['religious' 'satellites' '20' 'united']
 ['just'

In [19]:
# Print the top-words table for the model returned by learn_LVM_SVTD.
# NOTE(review): presumably 10 = words listed per topic and 4 = number of
# topics (the recorded output has 10 rows x 4 columns) -- confirm against
# print_top_words_table's definition.
print_top_words_table(M_svtd, omega_svtd,10,4, id2word)

\begin{center}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
jesus & space & health & file\\
god & launch & hiv & gun\\
atheists & satellite & 1993 & congress\\
atheism & commercial & use & control\\
people & market & medical & firearms\\
matthew & satellites & 10 & mr\\
religious & data & aids & states\\
religion & year & number & united\\
does & nasa & 20 & rkba\\
believe & new & april & house\\
\hline
\end{tabular}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
\\
\\
\\
\\
\\
\\
\\
\\
\\
\\
\hline
\end{tabular}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
\\
\\
\\
\\
\\
\\
\\
\\
\\
\\
\hline
\end{tabular}

\begin{tabular}{|c|c|c|c|c|c|}
\hline
\\
\\
\\
\\
\\
\\
\\
\\
\\
\\
\hline
\end{tabular}

\end{center}
[['jesus' 'space' 'health' 'file']
 ['god' 'launch' 'hiv' 'gun']
 ['atheists' 'satellite' '1993' 'congress']
 ['atheism' 'commercial' 'use' 'control']
 ['people' 'market' 'medical' 'firearms']
 ['matthew' 'satellites' '10' 'mr']
 ['religious' 'data' 'aids' 'states']
 ['religion' 'year' 'number' 'unit