## The purpose of this notebook is to test the preliminary LDA model trained from HydroShare public data to see if the model can be used for HydroShare keyword recommendation

In [1]:
# install the needed modules
!pip install gensim
!pip install nltk
!pip install pyLDAvis
!pip install ipython_blocking

In [2]:
# import the needed modules
import re
from gensim.models import Phrases
from gensim.models.ldamodel import LdaModel
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from collections import defaultdict
from nltk import pos_tag
import ipywidgets as widgets
from IPython.display import display
from hs_restclient import HydroShare, HydroShareAuthBasic
import pyLDAvis.gensim
import ipython_blocking
import warnings
warnings.filterwarnings('ignore')

In [3]:
lemmatizer = WordNetLemmatizer()

# add stop words including custom stop words to remove from the input text
stop_words = stopwords.words('english')
custom_stop_words = ['include', 'included', 'includes', 'including', 'contain', 'contains', 'containing', 'file', 'et', 'al', 'different', 
                    'dat', 'edu', 'not', 'would', 'say', 'could', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 
                     'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'from', 'published', 
                     'run', 'need', 'even', 'right', 'line', 'also', 'may', 'take', 'come', 'used', 'using', 'use', 'public', 
                     'private', 'new', 'west', 'east', 'south', 'north', 'approximately', 'four', 'low', 'high', 'available', 'mile', 'daily', 
                     'year', 'month', 'variable', 'square', 'city', 'foot', 'area', 'within', 'provide', 'firm', 'file', 'state', 'information', 
                     'well', 'university']
stop_words.extend(custom_stop_words)

In [4]:
# load the trained LDA model
model_path = "model/hs_lda_model"
lda = LdaModel.load(model_path)

In [5]:
def process_text(text):
    # Make all the strings lowercase
    text = text.lower()
    # remove urls
    text = re.sub('http[s]?://\S+', '', text)
    # remove non alphabetic characters except dash symbol which is converted to underbar that indicates phrases
    text = re.sub('[-]', '_', text)
    text = re.sub('[^A-Za-z_]', ' ', text.lower())

    # Tokenize the text; this is, separate every sentence into a list of words
    # Since the text is already split into sentences you don't have to call sent_tokenize
    tokenized_text = word_tokenize(text)
    
    tag_map = defaultdict(lambda : wn.NOUN)    
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV    
    
    # Remove the stopwords and lemmatize each word
    clean_text = [
        lemmatizer.lemmatize(word[1:] if word.startswith('_') else word, tag_map[tag[0]]) for word, tag in pos_tag(tokenized_text)
        if word not in stop_words and len(word)>2
    ]
    # this final output is a list of words - remove stop words from lemmatized text to completely remove stop words
    return [word for word in clean_text if word not in stop_words]

### Input any abstract text for testing the model prediction of keywords. Note that the model is trained with HydroShare public data, so the scope of keyword prediction is limited to the content covered by the training data, and as more HydroShare public data become available, the model will be updated regularly with new data incorporated in training.

In [6]:
abstract_input = widgets.Textarea(
    placeholder="Input the abstract to extract keywords from", 
    disabled=False)
display(abstract_input)
#print(dir(abstract_input))
def abstract_input_callback(widget):
    global abstract
    abstract = abstract_input.value
    return
abstract_input.on_trait_change(abstract_input_callback)
button = widgets.Button(description="Continue to run the rest of cells below")
display(button)

<IPython.core.display.Javascript object>

In [9]:
%blockrun button

In [10]:
# process the input text
print("Input abstract is:", abstract)
doc = process_text(abstract)
# Add bigrams and trigrams to texts
# higher threshold means few phrases, heavily depends on scoring function, which can be set as 'default' or 'npmi'
bigram = Phrases(doc, min_count=1, threshold=1, scoring='default') 
trigram = Phrases(bigram[doc])

for token in bigram[doc]:
    if '_' in token and token not in doc:
        # Token is a bigram, add to document.
        doc.append(token)
        # print("in bigram processing: ", token)
for token in trigram[doc]:
    if '_' in token and token not in doc:
        # Token is a bigram, add to document.
        doc.append(token)
        # print("in trigram processing: ", token)

Input abstract is: The data set contains the monthly statistics for the APCPsfc variable (precipitation total) of the North American Land Data Assimilation System Version 2 (NLDAS-2) model. The period of analysis is from 1979-01-02 to 2013-12-31. The statistics for each calendar month are the mean, standard deviation, minimum, maximum, and percentiles in 0.05 interval. The data set also includes a p-value per calendar month of the Kolmogorov-Smirnov (KS) test. The p-value of the KS test shows if the computed empirical cumulative distribution function (CDF) comes from a fitted gamma distribution


### Use the model to predict topics of the input abstract text, only output the most likely topic and its keyword representation

In [11]:
# use the model to predict topics of the input abstract text, only output the most likely topic and its keyword representation
test_bows = [lda.id2word.doc2bow(doc)]
for i, row_list in enumerate(lda[test_bows]):
    row = row_list[0] if lda.per_word_topics else row_list
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:
            wp = lda.show_topic(topic_num)
            topic_keywords_list = [word for word, prop in wp]
            topic_keywords = "; ".join(topic_keywords_list)
            print("propbability of topic: ", prop_topic)
            print(topic_keywords)            
        else:
            break

propbability of topic:  0.987118766366
model; data; test; input; simulation; climate; output; landlab; resource; water


### Update the model by inrementally incorporating the new data and then recommend keywords

In [12]:
# update the model by inrementally incorporating the new data and then recommend keywords
lda.update(test_bows)
for i, row_list in enumerate(lda[test_bows]):
    row = row_list[0] if lda.per_word_topics else row_list
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:
            wp = lda.show_topic(topic_num)
            topic_keywords_list = [word for word, prop in wp]
            topic_keywords = "; ".join(topic_keywords_list)
            print("After model update")
            print("propbability of topic: ", prop_topic)
            print(topic_keywords)            
        else:
            break

After model update
propbability of topic:  0.987954944261
data; model; test; set; distribution; statistic; calendar; p_value; analysis; period


### Explore the model with original training data visually

In [13]:
import pickle
with open("model/training_corpus.dat", "rb") as fp:
    corpus = pickle.load(fp)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary=lda.id2word, sort_topics=False)
vis

### Explore the model with test abstract data visually

In [14]:
vis = pyLDAvis.gensim.prepare(lda, test_bows, dictionary=lda.id2word, sort_topics=False)
vis

## Input your feedback to have them captured and added to the HydroShare resource

In [15]:
# login to HydroShare using your username and password
username = input("Please input your HydroShare user name followed by enter key: ")

Please input your HydroShare user name: hongyi


In [16]:
import getpass
print("Please input your password followed by enter key:")
password = getpass.getpass()

········


In [17]:
auth = HydroShareAuthBasic(username=username, password=password)
hs = HydroShare(auth=auth)

In [18]:
# input your feedback 
feedback_input = widgets.Textarea(
    placeholder="Input your feedback comment which will be captured and added to the HydroShare resource", 
    disabled=False)
display(feedback_input)

feedback_file_name = "feedback_{}.txt".format(username)
#print(dir(abstract_input))
def feedback_input_callback(widget):
    global feedback_text
    feedback_text = feedback_input.value
    return
feedback_input.on_trait_change(feedback_input_callback)
cont_button = widgets.Button(description='Continue to run the rest of cells below')
display(cont_button)

<IPython.core.display.Javascript object>

In [21]:
%blockrun cont_button

In [22]:
# write user feedback to file
f = open(feedback_file_name, "w")
f.write(feedback_text)
f.close()


In [24]:
# add feedback into feedback resource
feedback_res_id = "b0abfd43ba1949ef8b69bead62785788"
hs.addResourceFile(feedback_res_id, feedback_file_name)

{'file_name': 'feedback_hongyi.txt',
 'file_path': 'feedback_hongyi.txt',
 'resource_id': 'b0abfd43ba1949ef8b69bead62785788'}