In [None]:
import os
import re
import operator
import matplotlib.pyplot as plt
import warnings
import gensim
import numpy as np
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint

import pandas as pd


%run NLP_clustering.ipynb

def evaluate_bar_graph(coherences, indices):
    """
    Draw a bar chart with one bar per coherence score.

    coherences: list of coherence values, one per bar
    indices: tick labels for the bars; must be the same length as coherences
    """
    assert len(coherences) == len(indices)
    # Bars are placed at integer positions 0 .. n-1 and labelled via tick_label.
    bar_positions = np.arange(len(coherences))
    plt.bar(bar_positions, coherences, width=0.2, tick_label=indices, align='center')
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')

def ret_top_model(threshold=0.97, max_attempts=None):
    """
    Since LDAmodel is a probabilistic model, it comes up with different topics each time we run it. To control
    the quality of the topic model we produce, we look at the coherence of the best topic and keep training
    new models until this threshold is crossed.

    Reads the module-level names `corpus`, `dictionary` and `train_texts`; they must be defined before
    this function is called.

    Parameters:
    ----------
    threshold : coherence the best topic has to reach for training to stop (default 0.97)
    max_attempts : maximum number of models to train. None (the default) retries without limit, which
                   never terminates if the threshold cannot be reached on the given corpus.

    Returns:
    -------
    lm: Final evaluated topic model. If max_attempts ran out first, the model whose best topic scored highest.
    top_topics: ranked topics of that model in decreasing order of coherence. List of (topic id, coherence) tuples

    Raises:
    ------
    ValueError: if max_attempts is given and is smaller than 1
    """
    if max_attempts is not None and max_attempts < 1:
        raise ValueError("max_attempts must be a positive integer or None")

    best_lm, best_topics = None, None
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        lm = LdaModel(corpus=corpus, id2word=dictionary)
        coherence_values = {}
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            words = [word for word, _ in topic]
            # Each topic is scored on its own, so one CoherenceModel per topic.
            cm = CoherenceModel(topics=[words], texts=train_texts, dictionary=dictionary, window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)

        # Same stop condition as the original `while ... < 0.97` loop.
        if not top_topics[0][1] < threshold:
            return lm, top_topics

        # Remember the best attempt so a bounded run still returns something useful.
        if best_topics is None or top_topics[0][1] > best_topics[0][1]:
            best_lm, best_topics = lm, top_topics

    return best_lm, best_topics


def topic_prob_extractor(gensim_hdp):
    """
    Summarise every topic of a model as the summed weight of its displayed words.

    Parameters:
    ----------
    gensim_hdp : object whose show_topics(num_topics=-1, formatted=False) returns
                 (topic id, [(word, weight), ...]) pairs

    Returns:
    -------
    pandas DataFrame with columns 'topic_id' and 'weight', one row per topic, in the order
    show_topics returned them
    """
    shown_topics = gensim_hdp.show_topics(num_topics=-1, formatted=False)
    topics_nos = [topic_id for topic_id, _ in shown_topics]
    # Iterate the pairs directly instead of using the topic id as a list index:
    # ids are not guaranteed to match positions in shown_topics.
    weights = [sum(weight for _, weight in words) for _, words in shown_topics]

    return pd.DataFrame({'topic_id': topics_nos, 'weight': weights})

def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to display num_topics - LDA graph using c_v coherence

    Trains one LDA model for every topic count in range(1, limit), scores each with
    c_v coherence, plots the scores and shows the figure.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : Tokenized documents handed to CoherenceModel
    limit : topic limit (exclusive upper bound of the topic counts tried)

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(1, limit)
    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = gensim.models.CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    plt.plot(topic_counts, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # Trailing comma matters: ("c_v") is just the string "c_v", not a
    # one-element sequence of labels.
    plt.legend(("c_v",), loc='best')
    plt.show()

    return lm_list, c_v


if __name__ == "__main__":
    # Script: download catalogue records over CSW, build a bag-of-words corpus
    # from their titles and abstracts, fit LSI / HDP / LDA topic models and
    # compare their coherence in a bar chart.
    #
    # NOTE(review): CatalogueServiceWeb, fes, prepareDescription, keepwords,
    # spacy and lemmatization are not defined in this file; presumably the
    # `%run NLP_clustering.ipynb` line brings them into scope -- confirm.

    csw = CatalogueServiceWeb('http://geocatalog.webservice-energy.org/geonetwork/srv/eng/csw')
    set_title = fes.PropertyIsLike('any', '')#SEARCH_QUERY)
    filter_list = [set_title]

    csw.getrecords2(constraints=filter_list, maxrecords=2000)

    fmt = '{:*^64}'.format
    print(fmt(' Catalog information '))
    print("CSW version: {}".format(csw.version))
    print("Number of datasets available: {}".format(len(csw.records.keys())))
    print('\n')

    original_list_of_titles = []
    preprocessed_list_of_titles = []
    identifiers = []
    word2vec_number_list = []

    for rec in csw.records:
        record = csw.records[rec]
        original_list_of_titles.append(record.title)
        identifiers.append(record.identifier)
        title = prepareDescription(record.title, keepwords)
        # Truthiness check: skips a missing (None) abstract as well as an
        # empty one, instead of handing None to prepareDescription.
        if record.abstract:
            abstract = prepareDescription(record.abstract, keepwords)
            title = title + " " + abstract

        # One token list per record (title plus abstract when present).
        preprocessed_list_of_titles.append(title.split())

    nlp = spacy.load('en', disable=['parser', 'ner'])
    preprocessed_list_of_titles = lemmatization(preprocessed_list_of_titles, nlp)
    dictionary = Dictionary(preprocessed_list_of_titles)

    corpus = [dictionary.doc2bow(doc) for doc in preprocessed_list_of_titles]

    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
    lsitopics = lsimodel.show_topics(formatted=False)
    print("------------------------------LSI----------------------------------")
    print(lsimodel.show_topics(10))
    print("-------------------------------------------------------------------")

    # id2word has to be the id -> token mapping (the Dictionary), not the
    # list of tokenized documents.
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    hdptopics = hdpmodel.show_topics(formatted=False)
    data_frame = topic_prob_extractor(hdpmodel)
    data_frame = data_frame.sort_values(by='weight', ascending=False)
    print("------------------------------HDP----------------------------------")
    print(data_frame)
    print("-------------------------------------------------------------------")

    # ret_top_model() reads corpus, dictionary and train_texts as globals.
    train_texts = preprocessed_list_of_titles

    lm, top_topics = ret_top_model()
    print("------------------------------LDA----------------------------------")
    print(top_topics[:5])
    print("-------------------------------------------------------------------")

    lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=train_texts, limit=10)
    # Index 5 is the model trained with 6 topics (topic counts start at 1).
    lmtopics = lmlist[5].show_topics(formatted=False)

    # `ldamodel` was used without being defined anywhere in this file; train
    # the plain 10-topic LDA baseline here on the same corpus and dictionary.
    ldamodel = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
    ldatopics = ldamodel.show_topics(formatted=False)

    # Reduce every topic to its list of words, which is what CoherenceModel
    # expects for the `topics` argument.
    lda_lsi_topics = [[word for word, prob in lm.show_topic(topicid)] for topicid, _ in top_topics]

    lsitopics = [[word for word, prob in topic] for topicid, topic in lsitopics]

    hdptopics = [[word for word, prob in topic] for topicid, topic in hdptopics]

    ldatopics = [[word for word, prob in topic] for topicid, topic in ldatopics]

    lmtopics = [[word for word, prob in topic] for topicid, topic in lmtopics]

    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    lda_coherence = CoherenceModel(topics=ldatopics, texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    lm_coherence = CoherenceModel(topics=lmtopics, texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    lda_lsi_coherence = CoherenceModel(topics=lda_lsi_topics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence, lm_coherence, lda_lsi_coherence],
                   ['LSI', 'HDP', 'LDA', 'LDA_Mod', 'LDA_LSI'])