In [1]:
import os

import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

### Elasticsearch Client

In [2]:
# The connection URL (including any credentials) is read from the environment so
# that secrets are never stored in the notebook. Set it before starting the kernel:
#   export ELASTICSEARCH_URL='https://<user>:<password>@<host>:9243'
# A missing variable raises KeyError here rather than failing later on first use.
# NOTE(review): the credentials that were previously inlined in this cell should be rotated.
es = Elasticsearch([os.environ['ELASTICSEARCH_URL']], timeout=30)

### Generate Training Data and Labels

In [3]:
def create_training_data_and_labels(es, index_name, size=20000):
    """Fetch document ids and texts from an Elasticsearch index.

    Despite the name, no labels are produced: the function returns only the
    document ids and the corresponding texts.

    Args:
        es: Elasticsearch client; only ``search(index=..., size=...)`` is used.
        index_name: Name of the index to read from.
        size: Maximum number of hits requested in the single search call.

    Returns:
        A ``(doc_ids, training_data)`` tuple: the list of ``_id`` values and a
        NumPy array holding each hit's ``_source['text']``, in the same order.

    Raises:
        KeyError: If a hit has no ``text`` field in its ``_source``.
    """
    # NOTE(review): a single search is limited by the index's result-window
    # setting (presumably 10,000 unless raised) — confirm the index allows
    # `size`, or switch to a scroll/scan helper for larger indices.
    res = es.search(index=index_name, size=size)
    hits = res['hits']['hits']

    doc_ids, texts = [], []
    for hit in tqdm(hits, position=0, desc='creating training data/labels'):
        doc_ids.append(hit['_id'])
        texts.append(hit['_source']['text'])
    return doc_ids, np.array(texts)

def create_and_train_tokenizer(train_file_ids):
    """Train a Punkt sentence tokenizer on the concatenated raw text of the given files.

    The raw text of every id in ``train_file_ids`` is fetched with
    ``gutenberg.raw`` and joined into one string, a ``PunktTrainer`` with
    ``INCLUDE_ALL_COLLOCS`` enabled is trained on it, and a
    ``PunktSentenceTokenizer`` built from the trainer's parameters is returned.

    NOTE(review): ``gutenberg``, ``PunktTrainer`` and ``PunktSentenceTokenizer``
    are not imported in the visible cells (presumably they come from nltk) —
    confirm they are in scope before calling this function.
    """
    corpus_text = "".join(gutenberg.raw(file_id) for file_id in train_file_ids)

    punkt_trainer = PunktTrainer()
    # The flag is set before training, so it is in effect while the trainer runs.
    punkt_trainer.INCLUDE_ALL_COLLOCS = True
    punkt_trainer.train(corpus_text)

    return PunktSentenceTokenizer(punkt_trainer.get_params())

In [4]:
# Fetch document ids and their texts from the 'duc-2001' index; the texts are
# the corpus that gets vectorized for topic modelling.
doc_ids, training_data = create_training_data_and_labels(es, 'duc-2001')

creating training data/labels: 100%|██████████| 308/308 [00:00<00:00, 164002.24it/s]


In [5]:
# TF-IDF features over the corpus: vocabulary capped at 10,000 terms, English stop words removed.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
# .toarray() gives a plain ndarray. The previous .todense() returned np.matrix,
# which newer scikit-learn estimators refuse as input.
training_vectors = vectorizer.fit_transform(training_data).toarray()

### LDA

In [6]:
# Fit a 20-component LDA model (online learning, fixed seed) and keep the
# per-document topic weights returned by fit_transform.
# NOTE(review): the input here is TF-IDF weights; LDA is usually fit on raw
# term counts — confirm this is intended.
lda = LatentDirichletAllocation(
    n_components=20,
    learning_method='online',
    learning_offset=50,
    max_iter=5,
    random_state=0,
)
doc_topics_distribution = lda.fit_transform(training_vectors)

### NMF

In [7]:
# Fit a 20-component NMF model on the same vectors for comparison.
# Its per-document weights get their own name: reusing `doc_topics_distribution`
# overwrote the LDA result, so the bulk updates below paired LDA topic words
# (`model=lda`) with NMF document weights whose topic ids do not correspond.
# NOTE(review): `alpha` is presumably superseded by `alpha_W`/`alpha_H` in newer
# scikit-learn releases — confirm against the installed version.
nmf = NMF(n_components=20, random_state=1, alpha=.1, l1_ratio=.5)
nmf_doc_topics_distribution = nmf.fit_transform(training_vectors)

### Update Topics and Top Words

In [8]:
def update_top_k_words(k, model, vectorizer, doc_ids, index_name='duc-2001', topic_index=None):
    """Yield bulk ``update`` actions that attach every topic's top-k words to each document.

    Args:
        k: Number of highest-weighted words kept per topic.
        model: Fitted topic model; ``model.components_`` is iterated row by row,
            one row of word weights per topic.
        vectorizer: Fitted vectorizer whose feature names are looked up by the
            column positions of ``model.components_``.
        doc_ids: Ids of the documents to update.
        index_name: Index that holds the documents.
        topic_index: Value stored in each document's ``topic_index`` field;
            defaults to ``index_name + '-topics'``.

    Yields:
        One action dict per document id. Every action carries the same
        ``topics`` list: for each topic, its ``topic_id`` and ``top_words``
        (word and weight, highest weight first).

    NOTE(review): the value stored under ``"probability"`` is the raw entry of
    ``model.components_``; for some models this is presumably an unnormalized
    weight rather than a probability — confirm whether normalization is wanted.
    """
    # get_feature_names() is gone from newer scikit-learn releases in favour of
    # get_feature_names_out(); accept whichever the installed version provides.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    if topic_index is None:
        topic_index = '{}-topics'.format(index_name)

    topic_bodies = []
    for topic_id, word_weights in enumerate(model.components_):
        # Ascending argsort reversed -> indices of the k largest weights, largest first.
        top_indices = word_weights.argsort()[::-1][:k]
        topic_bodies.append({
            "topic_id": topic_id,
            "top_words": [
                # Plain str/float keep the payload JSON-serializable whatever the array dtype.
                {"word": str(feature_names[i]), "probability": float(word_weights[i])}
                for i in top_indices
            ],
        })

    progress_label = 'bulk update top {} words per topic'.format(k)
    for doc_id in tqdm(doc_ids, position=0, desc=progress_label):
        yield {
            '_index': index_name,
            '_op_type': 'update',
            '_id': doc_id,
            'doc': {
                'topic_index': topic_index,
                'topics': topic_bodies
            }
        }


def update_top_k_topics_per_doc(k, distribution, doc_ids, index_name='duc-2001'):
    """Yield bulk ``update`` actions that store each document's top-k topics.

    Args:
        k: Number of highest-weighted topics kept per document.
        distribution: Sequence of per-document topic-weight rows; row ``i``
            belongs to ``doc_ids[i]``.
        doc_ids: Document ids, in the same order as the rows of ``distribution``.
        index_name: Index that holds the documents.

    Yields:
        One action dict per row, setting ``doc_topics`` to a list of
        ``{"topic": <topic index as str>, "probability": <weight>}`` entries,
        highest weight first.

    Raises:
        ValueError: If there are fewer ``doc_ids`` than rows. This is checked
            before the first action is produced, so a mismatch cannot leave a
            bulk request half-emitted.
    """
    if len(doc_ids) < len(distribution):
        raise ValueError(
            'got {} doc ids for {} distribution rows'.format(len(doc_ids), len(distribution))
        )

    # Iterate the argument itself: the loop used to read the notebook-level
    # `doc_topics_distribution`, silently ignoring whatever the caller passed in.
    rows = tqdm(distribution, position=0, desc='bulk update top topics per document')
    for topic_weights, doc_id in zip(rows, doc_ids):
        # Ascending argsort reversed -> indices of the k largest weights, largest first.
        top_indices = topic_weights.argsort()[::-1][:k]
        topic_bodies = [
            {"topic": str(topic), "probability": float(topic_weights[topic])}
            for topic in top_indices
        ]

        yield {
            '_index': index_name,
            '_op_type': 'update',
            '_id': doc_id,
            'doc': {
                'doc_topics': topic_bodies
            }
        }

In [9]:
# Bulk-update every document with the top 10 words of each topic of the LDA model.
bulk(es, update_top_k_words(k=10, model=lda, vectorizer=vectorizer, doc_ids=doc_ids))

bulk update top 10 words per topic: 100%|██████████| 308/308 [00:00<00:00, 2457.30it/s]


(308, [])

In [10]:
# Bulk-update every document with its 5 highest-weighted topics.
bulk(es, update_top_k_topics_per_doc(k=5, distribution=doc_topics_distribution, doc_ids=doc_ids))

bulk update top topics per document: 100%|██████████| 308/308 [00:00<00:00, 13597.09it/s]


(308, [])