In [3]:
import os
from pprint import pprint

import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

### Elasticsearch Client

In [4]:
# The connection URL (including any username/password) is read from the
# environment so that secrets are never stored in the notebook itself.
# Set ELASTICSEARCH_URL before starting the kernel; a missing variable
# raises KeyError here rather than failing later on the first request.
es = Elasticsearch([os.environ['ELASTICSEARCH_URL']], timeout=30)

### Generate Training Data and Labels

In [5]:
def category_name_to_label_map(filepath):
    """Map each category name found under ``filepath`` to an integer label.

    Every entry returned by ``os.listdir(filepath)`` is treated as a category
    name. Names are sorted before being numbered because ``os.listdir`` returns
    entries in arbitrary order; sorting makes the name -> label assignment
    reproducible across runs and machines.

    Args:
        filepath: Directory whose entries are the category names.

    Returns:
        dict mapping category name (str) to its index (int) in sorted order.
        An empty directory yields an empty dict.
    """
    category_names = sorted(os.listdir(filepath))
    return {category_name: index for index, category_name in enumerate(category_names)}

def create_training_data_and_labels(es, index_name, label_map):
    """Collect document ids, texts and integer labels from an index.

    Runs one ``es.search`` call (``size=20000``) against ``index_name`` and
    walks the returned hits. For every hit the ``_id``, the ``text`` field and
    the label looked up from the ``category`` field are collected.

    Args:
        es: Client object exposing ``search(index=..., size=...)``.
        index_name: Name of the index to read.
        label_map: dict mapping a category name to its integer label.

    Returns:
        Tuple ``(doc_ids, training_data, training_labels)`` where ``doc_ids``
        and ``training_labels`` are lists and ``training_data`` is a NumPy
        array built from the texts.
    """
    response = es.search(index=index_name, size=20000)
    ids, texts, labels = [], [], []
    for record in tqdm(response['hits']['hits'], position=0, desc='creating training data/labels'):
        ids.append(record['_id'])
        source = record['_source']
        texts.append(source['text'])
        labels.append(label_map[source['category']])
    return ids, np.array(texts), labels

In [6]:
# Build the category -> label mapping from the training directory, then pull
# ids, texts and labels for every document stored in the '20-ng' index.
label_map = category_name_to_label_map(filepath='./20NG/20news-bydate-train')
doc_ids, training_data, training_labels = create_training_data_and_labels(
    es=es,
    index_name='20-ng',
    label_map=label_map,
)

creating training data/labels: 100%|██████████| 15417/15417 [00:00<00:00, 585461.03it/s]


In [7]:
# TF-IDF features restricted to the 10000 highest-frequency terms.
vectorizer = TfidfVectorizer(max_features=10000)
# Keep the result sparse. The previous `.todense()` produced an `np.matrix`
# (rejected by recent scikit-learn releases) and materialised every zero of a
# documents x 10000 matrix in memory. Both LatentDirichletAllocation and NMF
# accept the sparse matrix directly.
training_vectors = vectorizer.fit_transform(training_data)

### LDA

In [10]:
# Fit a 20-topic LDA model with the online learning method and keep the
# per-document topic distribution returned by fit_transform.
lda = LatentDirichletAllocation(
    n_components=20,
    max_iter=5,
    learning_method='online',
    learning_offset=50,
    random_state=0,
)
doc_topics_distribution = lda.fit_transform(training_vectors)

### NMF

In [41]:
# Fit a 20-component NMF model on the same vectors.
# NOTE: this rebinds `doc_topics_distribution`, replacing the value assigned in
# the LDA cell; whichever of the two cells ran last determines what later cells see.
# NOTE(review): `alpha` is a legacy NMF argument; newer scikit-learn releases
# use `alpha_W` / `alpha_H` instead - confirm against the installed version.
nmf = NMF(
    n_components=20,
    random_state=1,
    alpha=0.1,
    l1_ratio=0.5,
)
doc_topics_distribution = nmf.fit_transform(training_vectors)

### Update Topics and Top Words

In [33]:
def update_top_k_words(k, model, vectorizer, doc_ids, index_name='20-ng', topic_index='20-ng-topics'):
    """Yield bulk 'update' actions that attach the top k words of every topic.

    The same list of topics is written to every document in ``doc_ids``: for
    each row of ``model.components_`` the ``k`` largest entries are selected
    and paired with the corresponding vocabulary terms. The stored
    "probability" is the raw value read from ``model.components_``; no
    normalisation is applied here.

    Args:
        k: Number of highest-weighted words to keep per topic.
        model: Fitted topic model exposing ``components_`` (one row per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or,
            on older scikit-learn releases, ``get_feature_names()``.
        doc_ids: Iterable of document ids to update.
        index_name: Index that holds the documents (default ``'20-ng'``).
        topic_index: Value stored in each document's ``topic_index`` field
            (default ``'20-ng-topics'``).

    Yields:
        One action dict per document id, in the shape expected by
        ``elasticsearch.helpers.bulk``.
    """
    # get_feature_names() is gone from current scikit-learn; prefer the
    # replacement and only fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    topic_bodies = []
    for topic_id, words_prob in enumerate(model.components_):
        # Indices of the k largest weights, largest first.
        top_words_indices = words_prob.argsort()[::-1][:k]
        word_bodies = [
            {
                "word": str(feature_names[index]),
                # Plain float so the payload contains no NumPy scalar types.
                "probability": float(words_prob[index]),
            }
            for index in top_words_indices
        ]
        topic_bodies.append({
            "topic_id": topic_id,
            "top_words": word_bodies,
        })

    # The progress label reflects the k actually requested.
    for doc_id in tqdm(doc_ids, position=0, desc=f'bulk update top {k} words per topic'):
        yield {
            '_index': index_name,
            '_op_type': 'update',
            '_id': doc_id,
            'doc': {
                'topic_index': topic_index,
                'topics': topic_bodies
            }
        }


def update_top_k_topics_per_doc(k, distribution, doc_ids, index_name='20-ng'):
    """Yield bulk 'update' actions holding the top k topics of each document.

    Row ``i`` of ``distribution`` is paired with ``doc_ids[i]``. For each row
    the ``k`` largest entries are selected (largest first) and written to the
    document's ``doc_topics`` field as ``{"topic": <index as str>,
    "probability": <value>}`` entries.

    Args:
        k: Number of highest-valued topics to keep per document.
        distribution: Sequence of per-document topic weight rows; each row
            must support ``argsort()`` and integer indexing.
        doc_ids: Sequence of document ids, aligned with ``distribution``.
        index_name: Index that holds the documents (default ``'20-ng'``).

    Yields:
        One action dict per document, in the shape expected by
        ``elasticsearch.helpers.bulk``.

    Raises:
        ValueError: If ``distribution`` and ``doc_ids`` differ in length
            (raised when the generator is first advanced).
    """
    if len(distribution) != len(doc_ids):
        raise ValueError(
            'distribution has %d rows but %d document ids were given'
            % (len(distribution), len(doc_ids))
        )

    # Iterate the argument that was passed in, not a notebook-level global.
    rows = tqdm(distribution, position=0, desc='bulk update top topics per document')
    for doc_id, topic_probs in zip(doc_ids, rows):
        # Indices of the k largest values, largest first.
        top_topics_indices = topic_probs.argsort()[::-1][:k]
        topic_bodies = [
            {
                "topic": str(topic_index),
                # Plain float so the payload contains no NumPy scalar types.
                "probability": float(topic_probs[topic_index]),
            }
            for topic_index in top_topics_indices
        ]

        yield {
            '_index': index_name,
            '_op_type': 'update',
            '_id': doc_id,
            'doc': {
                'doc_topics': topic_bodies
            }
        }

In [35]:
# Stream one update action per document, attaching the LDA model's top 10
# words per topic; the bulk() result is left as the cell's displayed value.
bulk(
    es,
    update_top_k_words(k=10, model=lda, vectorizer=vectorizer, doc_ids=doc_ids),
)

bulk update top 10 words per topic: 100%|██████████| 15417/15417 [02:12<00:00, 116.19it/s]


(15417, [])

In [43]:
# Stream one update action per document with its 5 highest-valued topics;
# the bulk() result is left as the cell's displayed value.
bulk(
    es,
    update_top_k_topics_per_doc(k=5, distribution=doc_topics_distribution, doc_ids=doc_ids),
)

bulk update top topics per document: 100%|██████████| 15417/15417 [00:58<00:00, 263.72it/s]


(15417, [])

In [9]:
# Query the 'duc-2001' index and keep the first returned hit (if any) as a
# sample document for inspection.
res = es.search(index='duc-2001', size=20000)
hits = res['hits']['hits']
if hits:
    example_hit = hits[0]

creating training data/labels:   0%|          | 0/308 [00:00<?, ?it/s]


In [15]:
# Pretty-print the stored fields of the sample hit. `pprint` is imported in
# the notebook's import cell rather than here, so all dependencies live in
# one place.
pprint(example_hit['_source'])

ic_index': 'duc-2001-topics',
 'topics': [{'top_words': [{'probability': 2.0324799894381815,
                            'word': 'police'},
                           {'probability': 1.633573858257123,
                            'word': 'cellrule'},
                           {'probability': 1.3244433766884554,
                            'word': 'tablecell'},
                           {'probability': 1.0827756735703482, 'word': 'gates'},
                           {'probability': 1.0619211225445406, 'word': 'said'},
                           {'probability': 0.9944573003136186,
                            'word': 'brutality'},
                           {'probability': 0.9679122871877702,
                            'word': 'officers'},
                           {'probability': 0.9435435501434049,
                            'word': 'rescue'},
                           {'probability': 0.9353218554530526,
                            'word': 'commission'},
                          