In [483]:
%matplotlib inline
import pandas as pd
import numpy as np
import gensim

from gensim import models, corpora
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import Phrases

from nltk import RegexpTokenizer
from nltk.corpus import stopwords, inaugural
from nltk.stem import PorterStemmer, WordNetLemmatizer

import warnings
warnings.filterwarnings('ignore')

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
pyLDAvis.enable_notebook()

from pymongo import MongoClient
import json

In [446]:
#Connect to db and authenticate
def connet_mongoclient(host, port=27017):
    """Create a MongoDB client and return a handle to its ``test`` database.

    Parameters
    ----------
    host : str
        Hostname or address of the MongoDB server.
    port : int, optional
        Port of the MongoDB server (default 27017).

    Returns
    -------
    The ``test`` database object of the created ``MongoClient``.
    """
    # The previous version ignored ``host`` and dropped the handle on return,
    # so no caller could ever obtain a usable database object.
    client = MongoClient(host, port)
    return client.test


# Notebook-level database handle referenced as ``db`` by the cells below.
db = connet_mongoclient('localhost')
def add_df_to_mongo(db, df, collection_name='topic_model'):
    """Insert every row of ``df`` as one document into a MongoDB collection.

    Parameters
    ----------
    db : database handle
        Object that yields a collection via ``db[collection_name]``.
    df : pandas.DataFrame
        Frame whose rows become documents (column name -> value).
    collection_name : str, optional
        Target collection; defaults to ``'topic_model'``, the collection the
        original implementation always wrote to.

    Returns
    -------
    None
    """
    # Round-tripping through JSON turns pandas/NumPy scalars into plain Python
    # values; ``orient='records'`` yields the list of row dicts directly.
    records = json.loads(df.to_json(orient='records'))
    if not records:
        # insert_many rejects an empty batch, so an empty frame is a no-op.
        return
    db[collection_name].insert_many(records)
def drop_collection(database=None, collection_name='collection'):
    """Drop one collection from a MongoDB database.

    Parameters
    ----------
    database : database handle, optional
        Object that yields a collection via ``database[collection_name]``.
        When omitted, the notebook-level ``db`` variable is used, as before.
    collection_name : str, optional
        Name of the collection to drop. Defaults to ``'collection'``, which is
        what the original ``db.collection.drop()`` targeted.

    Returns
    -------
    None
    """
    target = db if database is None else database
    target[collection_name].drop()

In [480]:
# Load corpora and model
def load_lda_model(model_name, corpus_path='papers.mm', dictionary_path='papers.dict'):
    """Load the saved LDA model together with its corpus and dictionary.

    Parameters
    ----------
    model_name : str
        Path of the saved ``LdaModel`` file.
    corpus_path : str, optional
        Path of the Matrix Market corpus (default ``'papers.mm'``).
    dictionary_path : str, optional
        Path of the saved gensim ``Dictionary`` (default ``'papers.dict'``).

    Returns
    -------
    tuple
        ``(lda_model, corpus, dictionary)``.
    """
    corpus = corpora.MmCorpus(corpus_path)
    dictionary = Dictionary.load(dictionary_path)
    lda_model = gensim.models.LdaModel.load(model_name)
    # Return the loaded objects: assigning them only to locals left the
    # notebook-level names used by later cells undefined.
    return lda_model, corpus, dictionary

lda_model, corpus, dictionary = load_lda_model('lda_topic20.model')

In [485]:
# Build the pyLDAvis visualisation for the loaded model.
# Requires `lda_model`, `corpus` and `dictionary` to exist at notebook scope.
pyLDAvis.enable_notebook()
# `vis` is reused by the display cell that follows.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# Also write the prepared visualisation to an HTML file next to the notebook.
pyLDAvis.save_html(vis,'lda_topic20.html')

In [486]:
# Show the prepared visualisation (`vis`) as this cell's output.
pyLDAvis.display(vis)

The 20 topics are defined as:
    - 1. Theorems/Proofs
    - 2. Optimization
    - 3. Graphs
    - 4. Computer Vision
    - 5. Regularization/Sparsity
    - 6. Probability Distributions
    - 7. Supervised Learning
    - 8. Reinforcement Learning
    - 9. Text Processing
    - 10. Neural Networks
    - 11. Dimension Reduction/Metric Learning
    - 12. Topic Models
    - 13. Human Cognition
    - 14. Neurobiology
    - 15. Deep Learning
    - 16. Human Visual System
    - 17. Clustering
    - 18. Adversarial Settings/Bandits
    - 19. Memory/Recurrent Networks
    - 20. Speech Processing


In [357]:
# Calculate Top-3 topics from topics distribution
topics = 20
doc_number = 6560

# Topic labels in model topic-id order (entry k labels LDA topic k), followed
# by the three derived "top topic" columns.
columns = ['graphs', 'human_visual_system', 'deep_learning', 'theorems_and_proofs', 'adversarial_settings_and_bandits',
           'regularization_and_sparsity', 'neurobiology', 'neural_networks', 'supervised_learning', 'probability_distributions',
           'speech_processing', 'clustering', 'human_cognition', 'dimension_reduction_and_metric_learning', 'reinforcement_learning',
           'optimization', 'computer_vision', 'topic_models', 'memory_and_recurrent_networks', 'text_processing',
           'main_topic', 'second_topic', 'third_topic']
topic_names = columns[:topics]
top_columns = columns[topics:]
assert len(topic_names) == topics and len(top_columns) == 3

# Document ids are the 1-based positions in the corpus, stored as strings.
documentId = [str(i) for i in range(1, doc_number + 1)]

# Dense (doc_number x topics) matrix of topic weights. `lda_model[bow]` yields
# (topic_id, weight) pairs only for the topics it reports for a document, so
# every other entry stays 0 -- the same result the old merge + fillna(0)
# produced, without performing one outer merge per document.
topic_weights = np.zeros((doc_number, topics))
for doc_idx in range(doc_number):
    for topic_id, weight in lda_model[corpus[doc_idx]]:
        topic_weights[doc_idx, topic_id] = weight

# Rank topics per document, highest weight first. The stable sort breaks ties
# in favour of the lower topic id, so re-runs give identical labels.
top_ids = np.argsort(-topic_weights, axis=1, kind='stable')[:, :len(top_columns)]
top_names = np.array(topic_names)[top_ids]

# One row per document: 20 weight columns plus main/second/third topic labels.
dis = pd.DataFrame(topic_weights, columns=topic_names)
for rank, column_name in enumerate(top_columns):
    dis[column_name] = top_names[:, rank]

df = pd.concat([pd.DataFrame(documentId, columns=['documentId']), dis], axis=1)
assert df.shape == (doc_number, len(columns) + 1)

# Insert dataframe to Mongodb (`db` must be a live database handle).
add_df_to_mongo(db, df)

In [487]:
# Insert topic and keywords proportion to Mongodb
topics = 20
num_words = 30

# One record per (topic, keyword) pair. `columns[topic_id]` is the label of
# LDA topic `topic_id`. The weight is cast to a plain float because it may
# arrive as a NumPy scalar, which the BSON encoder may not accept
# (NOTE(review): confirm against the installed gensim/pymongo versions).
keyword_records = [
    {'topic': columns[topic_id], 'keywords': word, 'prop': float(prop)}
    for topic_id in range(topics)
    for word, prop in lda_model.show_topic(topicid=topic_id, topn=num_words)
]

# NOTE(review): these documents go to `topic_model`, the same collection that
# add_df_to_mongo writes to by default, while the query cell reads from
# `topic_keywords` -- confirm which collection is intended.
# A single bulk insert replaces one insert_one round trip per keyword.
db.topic_model.insert_many(keyword_records)

In [295]:
# Query topic_distribution collection
# (pymongo syntax: operators such as "$group" and all field names are strings;
# the original Mongo-shell form is not valid Python.)
# NOTE(review): the insert cells write to `topic_model`, not `topic_dist` /
# `topic_keywords` -- confirm the collection names before relying on results.
db.topic_dist.find({"documentId": "356"})
db.topic_dist.find({"main_topic": "reinforcement_learning"})
# Number of documents per main topic.
db.topic_dist.aggregate([{"$group": {"_id": "$main_topic", "documentId": {"$sum": 1}}}])

# Query topic_keywords collection
# Cursor.sort takes (key, direction); -1 sorts `prop` in descending order.
db.topic_keywords.find({"keywords": "bound"}).sort("prop", -1)

<pymongo.cursor.Cursor object at 0x111d13f50>
