In [11]:
import os
import re
import operator
import matplotlib.pyplot as plt
import warnings
import gensim
import numpy as np
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from pprint import pprint

import pandas as pd


%run NLP_clustering.ipynb

def evaluate_bar_graph(coherences, indices):
    """
    Draw a bar chart with one bar per model, its height being the coherence score.

    coherences: list of coherence values, one per model
    indices: tick labels for the bars; must have the same length as coherences
    """
    # Checked first so a mismatch is reported before anything is drawn.
    assert len(coherences) == len(indices)
    bar_positions = np.arange(len(coherences))
    plt.bar(bar_positions, coherences, width=0.2, tick_label=indices, align='center')
    plt.xlabel('Models')
    plt.ylabel('Coherence Value')

def ret_top_model(threshold=0.97, max_iterations=None):
    """
    Since LDAmodel is a probabilistic model, it comes up with different topics each time we run it. To control
    the quality of the topic model we produce, we look at the coherence of the best topic and keep training
    fresh models until this threshold is crossed.

    Reads the module-level names `corpus`, `dictionary` and `train_texts`, which must be defined before
    this function is called.

    Parameters:
    ----------
    threshold : coherence the best topic must reach for the model to be accepted (default 0.97,
                the value that used to be hard-coded)
    max_iterations : maximum number of models to train before giving up, or None (default) to keep
                     trying indefinitely. With None the call never returns if the threshold is
                     unreachable for the data.

    Returns:
    -------
    lm: Final evaluated topic model. If max_iterations ran out first, the model whose best topic scored
        highest among those trained.
    top_topics: ranked topics of the returned model in decreasing order. List of (topic_id, coherence) tuples
    """
    best_lm = None
    best_topics = None
    attempts = 0
    while True:
        lm = LdaModel(corpus=corpus, id2word=dictionary)
        coherence_values = {}
        # Score every topic on its own so the topics can be ranked individually.
        for n, topic in lm.show_topics(num_topics=-1, formatted=False):
            topic = [word for word, _ in topic]
            cm = CoherenceModel(topics=[topic], texts=train_texts, dictionary=dictionary, window_size=10)
            coherence_values[n] = cm.get_coherence()
        top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)
        attempts += 1

        # Same stopping rule as the original `while ... < 0.97` loop.
        if not top_topics[0][1] < threshold:
            return lm, top_topics

        # Remember the best attempt so a bounded run still returns something useful.
        if best_topics is None or top_topics[0][1] > best_topics[0][1]:
            best_lm, best_topics = lm, top_topics

        if max_iterations is not None and attempts >= max_iterations:
            return best_lm, best_topics


def topic_prob_extractor(gensim_hdp):
    """
    Summarise each topic of a model by the total weight of its displayed terms.

    Parameters:
    ----------
    gensim_hdp : model whose show_topics(num_topics=-1, formatted=False) returns a list of
                 (topic_id, [(term, weight), ...]) pairs

    Returns:
    -------
    pandas DataFrame with columns 'topic_id' and 'weight', one row per topic, in the order
    the model returned them
    """
    shown_topics = gensim_hdp.show_topics(num_topics=-1, formatted=False)
    topics_nos = []
    weights = []
    # Take each id together with its own term list. Using the id as a list position
    # only works when ids happen to be exactly 0..N-1 in order.
    for shown in shown_topics:
        topics_nos.append(shown[0])
        weights.append(sum(item[1] for item in shown[1]))

    return pd.DataFrame({'topic_id': topics_nos, 'weight': weights})

def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to display num_topics - LDA graph using c_v coherence

    Trains one LDA model for every topic count in range(1, limit), scores each with c_v
    coherence and plots the scores against the number of topics.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : tokenised documents passed to CoherenceModel as `texts`
    limit : topic limit (exclusive); models are trained for 1 .. limit - 1 topics

    Returns:
    -------
    lm_list : List of LDA topic models; lm_list[i] was trained with i + 1 topics
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(1, limit)
    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = gensim.models.CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    plt.plot(topic_counts, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # The trailing comma matters: ("c_v") is just a string, which legend() would
    # iterate character by character and label the line "c".
    plt.legend(("c_v",), loc='best')
    plt.show()

    return lm_list, c_v


if __name__ == "__main__":

    # Query the CSW endpoint with an empty 'like' pattern on the 'any' property,
    # capped at 2000 records. CatalogueServiceWeb, fes, prepareDescription, keepwords,
    # spacy and lemmatization are not defined in this cell; presumably they come
    # from the %run of NLP_clustering.ipynb above.
    csw = CatalogueServiceWeb('http://geocatalog.webservice-energy.org/geonetwork/srv/eng/csw')
    set_title = fes.PropertyIsLike('any', '')  # SEARCH_QUERY could be used as the pattern instead
    filter_list = [set_title]

    csw.getrecords2(constraints=filter_list, maxrecords=2000)

    fmt = '{:*^64}'.format
    print(fmt(' Catalog information '))
    print("CSW version: {}".format(csw.version))
    print("Number of datasets available: {}".format(len(csw.records.keys())))
    print('\n')

    original_list_of_titles = []
    preprocessed_list_of_titles = []
    identifiers = []
    word2vec_number_list = []

    # Build one token list per record from its cleaned title plus, when present, its cleaned abstract.
    for rec in csw.records:
        record = csw.records[rec]
        original_list_of_titles.append(record.title)
        identifiers.append(record.identifier)
        title = prepareDescription(record.title, keepwords)
        # Truthiness check skips both '' and a missing (None) abstract.
        if record.abstract:
            abstract = prepareDescription(record.abstract, keepwords)
            title = title + " " + abstract

        preprocessed_list_of_titles.append(title.split())

    nlp = spacy.load('en', disable=['parser', 'ner'])
    preprocessed_list_of_titles = lemmatization(preprocessed_list_of_titles, nlp)
    dictionary = Dictionary(preprocessed_list_of_titles)

    # Bag-of-words representation of every document.
    corpus = [dictionary.doc2bow(doc) for doc in preprocessed_list_of_titles]

    lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
    lsitopics = lsimodel.show_topics(formatted=False)
    print("------------------------------LSI----------------------------------")
    print(lsimodel.show_topics(10))
    print("-------------------------------------------------------------------")

    # id2word must be the id -> token mapping (the Dictionary), like for the other
    # models here, not the list of tokenised documents.
    hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
    hdptopics = hdpmodel.show_topics(formatted=False)
    data_frame = topic_prob_extractor(hdpmodel)
    data_frame = data_frame.sort_values(by='weight', ascending=False)
    print("------------------------------HDP----------------------------------")
    print(data_frame)
    print("-------------------------------------------------------------------")

    # ret_top_model() reads train_texts (plus corpus and dictionary) as globals.
    train_texts = preprocessed_list_of_titles

    lm, top_topics = ret_top_model()
    print("------------------------------LDA----------------------------------")
    print(top_topics[:5])
    print("-------------------------------------------------------------------")

    lmlist, c_v = evaluate_graph(dictionary=dictionary, corpus=corpus, texts=train_texts, limit=10)
    # lmlist[i] was trained with i + 1 topics, so index 5 is the 6-topic model.
    lmtopics = lmlist[5].show_topics(formatted=False)
    # NOTE(review): `ldamodel` is not defined anywhere in this cell. Presumably it is
    # provided by the %run notebook; otherwise this line raises NameError. Confirm.
    ldatopics = ldamodel.show_topics(formatted=False)

    # Reduce every model's topics to plain word lists, the form CoherenceModel takes via `topics`.
    lda_lsi_topics = [[word for word, prob in lm.show_topic(topicid)] for topicid, coherence in top_topics]

    lsitopics = [[word for word, prob in topic] for topicid, topic in lsitopics]

    hdptopics = [[word for word, prob in topic] for topicid, topic in hdptopics]

    ldatopics = [[word for word, prob in topic] for topicid, topic in ldatopics]

    lmtopics = [[word for word, prob in topic] for topicid, topic in lmtopics]

    # Score each model's topics with the same texts, dictionary and window size so the bars are comparable.
    lsi_coherence = CoherenceModel(topics=lsitopics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    hdp_coherence = CoherenceModel(topics=hdptopics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    lda_coherence = CoherenceModel(topics=ldatopics, texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    lm_coherence = CoherenceModel(topics=lmtopics, texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    lda_lsi_coherence = CoherenceModel(topics=lda_lsi_topics[:10], texts=train_texts, dictionary=dictionary, window_size=10).get_coherence()

    evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence, lm_coherence, lda_lsi_coherence],
                   ['LSI', 'HDP', 'LDA', 'LDA_Mod', 'LDA_LSI'])

********************* Catalog information **********************
CSW version: 2.0.2
Number of datasets available: 1690


------------------------------LSI----------------------------------
[(0, '0.270*"solar" + 0.215*"monthly" + 0.190*"institute" + 0.157*"dni" + 0.146*"energy" + 0.137*"model" + 0.135*"daily" + 0.134*"ghi" + 0.133*"horizontal" + 0.133*"available"'), (1, '0.205*"average" + 0.182*"university" + -0.123*"institute" + 0.121*"provide" + 0.120*"mean" + 0.118*"information" + 0.117*"service" + 0.117*"atmosphere" + 0.116*"value" + 0.115*"compute"'), (2, '-0.254*"pacific" + -0.180*"resource" + -0.163*"state" + -0.152*"technical" + -0.149*"renewable" + -0.147*"energy" + -0.147*"development" + -0.147*"project" + -0.142*"region" + -0.140*"territory"'), (3, '0.370*"wind" + 0.225*"resolution" + 0.181*"average" + 0.178*"speed" + 0.171*"simulate" + 0.154*"level" + 0.134*"term" + -0.119*"pacific" + 0.116*"year" + 0.114*"simulation"'), (4, '-0.248*"simulate" + 0.190*"spatial" + 0.165*"desc



------------------------------HDP----------------------------------
     topic_id    weight
1           1  0.228919
3           3  0.164361
0           0  0.163114
4           4  0.107357
2           2  0.084969
112       112  0.071756
49         49  0.071637
145       145  0.071298
8           8  0.071125
123       123  0.069582
127       127  0.069560
42         42  0.069547
65         65  0.069322
77         77  0.068929
108       108  0.068885
101       101  0.068661
120       120  0.068609
94         94  0.068577
26         26  0.068296
125       125  0.068211
82         82  0.068201
27         27  0.068092
52         52  0.067915
11         11  0.067680
99         99  0.067669
98         98  0.067553
140       140  0.067549
138       138  0.067464
128       128  0.067157
146       146  0.066935
..        ...       ...
7           7  0.060343
111       111  0.060337
23         23  0.060332
43         43  0.060286
41         41  0.060255
46         46  0.060165
64         64  0.060

































































































































































































































































































































































































Traceback (most recent call last):
  File "/home/iulia/anaconda3/envs/armines-pilot/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/home/iulia/anaconda3/envs/armines-pilot/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/iulia/anaconda3/envs/armines-pilot/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/iulia/anaconda3/envs/armines-pilot/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Process AccumulatingWorker-7279196:
Process AccumulatingWorker-7279199:
Traceback (most recent call last):
Traceback (most recent call last):
Process AccumulatingWorker-7279197:
Traceback (most recent call last):
  File "/home/iulia/anaconda3/envs/armines-pilot/lib/python3.6/multiprocessing/process.py", line 261, in _bootstrap
    u

KeyboardInterrupt: 