In [26]:
import warnings

In [27]:
from gensim import corpora, models, similarities
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim
import os
import pickle
import pandas as pd
import numpy as np

# Put pyLDAvis into notebook mode; presumably this makes prepared visualizations
# render inline as cell output (see the pyLDAvis documentation — TODO confirm for the installed version).
pyLDAvis.enable_notebook()

def initialLDAModel(folder_path, non_german_file_path, topic_num=50, document_limit=1000, random_state=None):
    """Train an LDA model on pickled token lists and return per-document topic weights.

    Walks ``folder_path`` recursively (directories and files in sorted order so
    the selection is reproducible), unpickles every file whose identifier is not
    listed in ``non_german_file_path``, builds a gensim dictionary and
    bag-of-words corpus from the loaded token lists, and fits an ``LdaModel``.

    Args:
        folder_path: Root directory containing the pickled token lists.
        non_german_file_path: Text file with one document identifier per line;
            matching documents are skipped.
        topic_num: Number of LDA topics.
        document_limit: Maximum number of documents to load successfully.
            ``None`` or a negative value disables the limit.
        random_state: Optional seed forwarded to ``LdaModel`` for reproducible
            training.

    Returns:
        pandas.DataFrame with one row per loaded document (indexed by filename)
        and one column per topic id ``0..topic_num-1``. Topics that the model
        does not report for a document stay at 0.

    Raises:
        ValueError: If no document could be loaded.
    """
    # A set gives constant-time membership tests for the exclusion list.
    with open(non_german_file_path, 'r') as fr:
        non_german_files = {line.strip() for line in fr}

    token_lists = []
    filenames = []

    def _limit_reached():
        # Only successfully loaded documents count towards the limit.
        return document_limit is not None and 0 <= document_limit <= len(filenames)

    for root, dirs, files in os.walk(folder_path):
        # Checked per directory as well, so the limit holds across the whole tree
        # and not just inside the directory where it was first reached.
        if _limit_reached():
            break
        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
        for f in sorted(files):
            if _limit_reached():
                break
            # The last 11 characters of the filename are stripped before the lookup
            # — presumably a fixed suffix; TODO confirm against the corpus naming.
            if f[:-11] in non_german_files:
                continue
            try:
                # NOTE(security): pickle.load executes arbitrary code from the file;
                # only point this at trusted data.
                with open(os.path.join(root, f), 'rb') as fr:
                    document_tokens = pickle.load(fr)
            except Exception as exc:
                # Best effort: report the unreadable file and carry on.
                print('Error while processing: ', f, exc)
                continue
            # Record name and tokens together, only after a successful load, so
            # row i of the result always belongs to filenames[i].
            filenames.append(f)
            token_lists.append(document_tokens)

    if not token_lists:
        raise ValueError('No documents could be loaded from: ' + str(folder_path))

    gensim_dictionary = Dictionary(token_lists)
    corpus = [gensim_dictionary.doc2bow(text) for text in token_lists]
    lda = LdaModel(corpus, num_topics=topic_num, id2word=gensim_dictionary,
                   iterations=200, random_state=random_state)

    mat = np.zeros((len(filenames), topic_num))
    for doc_pos, doc in enumerate(corpus):
        # lda[doc] yields (topic_id, probability) pairs for this document.
        for topic_id, probability in lda[doc]:
            mat[doc_pos, topic_id] = probability

    return pd.DataFrame(mat, index=filenames, columns=range(topic_num))

In [3]:
# %%time
# warnings.filterwarnings("ignore") 
# non_german_file_path = './LabShare/data/non_german_files.txt'
# processed_doc_path = './LabShare/data/chui_ma/spacy_corpus'
# df = initialLDAModel(processed_doc_path, non_german_file_path, 100, 9999)

In [28]:
%%time
# Silence every warning category from here on; the filter stays installed for the rest of the kernel session.
warnings.filterwarnings("ignore") 
# Text file with one document identifier per line; those documents are excluded from training.
non_german_file_path = './LabShare/data/non_german_files.txt'
# NOTE(review): this path is absolute and user-specific while the one above is relative
# to the working directory — confirm both resolve on a fresh kernel / another machine.
data_path = '/home/bit/ma0/LabShare/data/chui_ma/presentation_LDA'
# Fit the top-level model with 20 topics; document_limit keeps its default value.
df = initialLDAModel(data_path, non_german_file_path, 20)

CPU times: user 1min 22s, sys: 3.2 s, total: 1min 26s
Wall time: 15.4 s


In [29]:
def get_Topic_Related_Doc(df, threshold=0.3):
    """Collect, for every topic column, the documents strongly related to it.

    Args:
        df: DataFrame of topic weights with one row per document and one column
            per topic. Columns are addressed by position, so any column labels
            are accepted.
        threshold: A document is related to a topic when its weight is strictly
            greater than this value.

    Returns:
        A list with one pandas Index per column (in column order). Each Index
        holds the row labels whose weight exceeds ``threshold``, ordered from
        the highest weight to the lowest.
    """
    topic_related_documents = []
    for position in range(df.shape[1]):
        # Select by position: the column label need not equal its position.
        scores = df.iloc[:, position]
        related_count = int((scores > threshold).sum())
        # The `related_count` largest values are exactly those above the threshold.
        topic_related_documents.append(scores.nlargest(related_count).index)
    return topic_related_documents

In [30]:
def hierachicalLDA(folder_path, non_german_file_path, topic_related_documents, topic_num=5, document_limit=1000, random_state=None):
    """Fit a second-level LDA model on a selected subset of documents.

    Walks ``folder_path`` recursively (directories and files in sorted order),
    unpickles every file whose name is in ``topic_related_documents`` and whose
    identifier is not listed in ``non_german_file_path``, then trains an
    ``LdaModel`` on the resulting bag-of-words corpus.

    Args:
        folder_path: Root directory containing the pickled token lists.
        non_german_file_path: Text file with one document identifier per line;
            matching documents are skipped.
        topic_related_documents: Iterable of filenames to include.
        topic_num: Number of LDA topics.
        document_limit: Accepted for signature compatibility but NOT applied —
            every selected document is loaded.
            NOTE(review): confirm whether a cap was intended here.
        random_state: Optional seed forwarded to ``LdaModel`` for reproducible
            training.

    Returns:
        Tuple ``(lda, corpus, gensim_dictionary)``.

    Raises:
        ValueError: If none of the selected documents was found.
        Any error raised while opening or unpickling a selected file propagates.
    """
    # Sets give constant-time membership tests for both lookups below.
    with open(non_german_file_path, 'r') as fr:
        non_german_files = {line.strip() for line in fr}
    selected_names = set(topic_related_documents)

    token_lists = []
    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort makes os.walk descend in a deterministic order
        for f in sorted(files):
            # The last 11 characters of the filename are stripped before the
            # exclusion lookup — presumably a fixed suffix; TODO confirm.
            if f in selected_names and f[:-11] not in non_german_files:
                # NOTE(security): pickle.load executes arbitrary code from the
                # file; only point this at trusted data.
                with open(os.path.join(root, f), 'rb') as fr:
                    token_lists.append(pickle.load(fr))

    if not token_lists:
        raise ValueError('No selected documents were found under: ' + str(folder_path))

    gensim_dictionary = Dictionary(token_lists)
    corpus = [gensim_dictionary.doc2bow(text) for text in token_lists]
    lda = LdaModel(corpus, num_topics=topic_num, id2word=gensim_dictionary,
                   iterations=200, random_state=random_state)
    return lda, corpus, gensim_dictionary

In [31]:
# One pandas Index per topic column of `df`, holding the documents whose weight for that topic exceeds 0.3.
docs = get_Topic_Related_Doc(df)

In [21]:
def visualization(data_path, docs, topic_num, non_german_path=None):
    """Fit a sub-model on ``docs`` and prepare its pyLDAvis visualization.

    Args:
        data_path: Root directory containing the pickled token lists.
        docs: Filenames of the documents to model; more than 5 are required.
        topic_num: Number of topics for the sub-model.
        non_german_path: Path to the exclusion list passed on to
            ``hierachicalLDA``. When omitted, the notebook-level variable
            ``non_german_file_path`` is used, as before.

    Returns:
        The object returned by ``pyLDAvis.gensim.prepare``, or ``None`` (after
        printing a message) when ``docs`` has 5 or fewer entries.
    """
    if len(docs) <= 5:
        print('Cannot visualize, Too less documents')
        return None

    if non_german_path is None:
        # Fall back to the notebook global only when it is actually needed, so
        # the function no longer hard-depends on hidden kernel state.
        non_german_path = non_german_file_path

    lda, corpus, dictionary = hierachicalLDA(data_path, non_german_path, docs, topic_num)
    return pyLDAvis.gensim.prepare(lda, corpus, dictionary)

In [38]:
# %%time
# warnings.filterwarnings("ignore")
# visualization(docs[0].tolist())

In [37]:
# visualization(data_path, docs[0].tolist(), 3)

In [36]:
# docs[0].tolist()