In [None]:
from elasticsearch import Elasticsearch
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import pyLDAvis
from pyLDAvis import sklearn as sklearn_lda
import pickle
import re
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## Connect to ES & Query

In [None]:
# Client for the Elasticsearch node reachable at host "elasticsearch", port 9200.
# NOTE(review): the host name is presumably a container/service name — confirm for your setup.
es = Elasticsearch(['elasticsearch:9200'])

In [None]:
# Resolve the first concrete index name matching the "fluentd-*" pattern.
# `index=` is passed by keyword so the call does not rely on positional arguments.
try:
    fluentdIndex = list(es.indices.get_alias(index="fluentd-*").keys())[0]
except Exception as exc:
    # A bare `except:` that only printed the hint left `fluentdIndex` undefined
    # (or stale from an earlier run), so the failure surfaced later as an
    # unrelated NameError. Fail here, with the hint attached to the real cause.
    raise RuntimeError(
        "Please, run any container with Fluentd as the log driver first"
    ) from exc

In [None]:
# Ask for the `n` newest documents of the Fluentd index: match everything,
# order by "@timestamp" descending, and cap the result at `n` hits.
n = 1000
body = dict(
    query=dict(match_all=dict()),
    size=n,
    sort=[{"@timestamp": dict(order="desc")}],
)

response = es.search(index=fluentdIndex, body=body)

In [None]:
# Pivot the hits into a column-oriented mapping {field name -> list of values},
# with exactly one entry per hit in every column.
source_docs = [hit["_source"] for hit in response["hits"]["hits"]]

# Collect field names in first-seen order so the column order stays stable.
fields = {}
for doc in source_docs:
    for key in doc:
        fields.setdefault(key, [])

# Fill each column from every document; a document lacking a field contributes
# None, which keeps all columns the same length (required by pd.DataFrame).
# Plain lists are used instead of repeated np.append, which copies the whole
# array on every call and coerces mixed value types to a common dtype.
fields = {key: [doc.get(key) for doc in source_docs] for key in fields}

In [None]:
# Build a DataFrame with one column per collected `_source` field, then preview the first rows.
elastic_df = pd.DataFrame(fields)
elastic_df.head()

## Data Cleaning

In [None]:
# Remove the two columns that are not used by the analysis below.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# (or a result set lacking either field) raises KeyError.
elastic_df = elastic_df.drop(columns=['container_id', '@log_name'], errors='ignore')

In [None]:
# Replace common punctuation/separator characters with spaces so tokens split cleanly.
# The pattern is a raw string: '\.' in a normal string literal is an invalid escape sequence.
elastic_df["processed_log"] = elastic_df['log'].map(lambda x: re.sub(r'[-=:/,\.!?]', ' ', str(x)))

In [None]:
# Lower-case every processed log line.
elastic_df["processed_log"] = elastic_df["processed_log"].map(str.lower)

In [None]:
# Display the cleaned log text to check the result of the two processing steps above.
elastic_df["processed_log"]

## Exploratory Data Analysis

In [None]:
# Merge all processed log lines into one comma-separated string for the word cloud.
long_string = ','.join(elastic_df["processed_log"].values)

# Configure the word cloud, feed it the merged text, and render it as an image.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(long_string)
wordcloud.to_image()

In [None]:
def plot_10_most_common_words(count_data, count_vectorizer):
    """Plot a bar chart of the ten most frequent vocabulary terms.

    Parameters
    ----------
    count_data
        Sparse document-term count matrix (one row per document, one column
        per vocabulary term), as produced by ``count_vectorizer.fit_transform``.
    count_vectorizer
        The fitted vectorizer that produced ``count_data``; supplies the
        vocabulary terms for the column indices.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); support both so old and new versions work.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # Column-wise sum gives the corpus-wide count per term in one sparse
    # operation, instead of densifying and adding every row in a Python loop.
    # np.asarray(...).ravel() flattens the (1, n_terms) result to 1-D.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    # Stable sort by count, descending; keep the first ten.
    top_pairs = sorted(zip(words, total_counts), key=lambda pair: pair[1], reverse=True)[0:10]
    words = [pair[0] for pair in top_pairs]
    counts = [pair[1] for pair in top_pairs]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # x/y are passed by keyword: newer seaborn releases reject positional data arguments.
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer on the processed log lines and transform them into a term-count matrix
count_data = count_vectorizer.fit_transform(elastic_df["processed_log"])

# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

## Latent Dirichlet Allocation

In [None]:
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model
        Fitted topic model exposing ``components_`` (one row of term weights
        per topic).
    count_vectorizer
        Fitted vectorizer whose vocabulary order matches the columns of
        ``model.components_``.
    n_top_words : int
        Number of words to print per topic.

    Returns
    -------
    None
        Output is written to stdout: a blank line, a ``Topic #<i>:`` header,
        then the words separated by single spaces.
    """
    # get_feature_names() was removed from newer scikit-learn releases in favour
    # of get_feature_names_out(); support both so old and new versions work.
    if hasattr(count_vectorizer, "get_feature_names_out"):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the first n_top_words indices.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in top_indices]))

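Tweak the two parameters below: `number_topics` sets how many topics the LDA model extracts, and `number_words` sets how many top words are printed per topic.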

In [None]:
# Number of topics, passed to the LDA model as n_components.
number_topics = 5
# Number of top words printed for each topic by print_topics.
number_words = 10

Create and fit the LDA model

In [None]:
# Fit LDA on the term-count matrix. A fixed random_state makes the fitted
# topics reproducible across "Restart & Run All"; without it every run
# yields a different topic assignment.
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=42)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [None]:
# Build the pyLDAvis visualisation data from the fitted LDA model, the term-count matrix and the vectorizer.
log_vis_data = sklearn_lda.prepare(lda, count_data, count_vectorizer)

In [None]:
# Enable pyLDAvis notebook rendering, then show the prepared visualisation.
pyLDAvis.enable_notebook()
pyLDAvis.display(log_vis_data)

In [None]:
# Same inputs as the previous prepare() call, but with mds='tsne'.
# NOTE(review): presumably this lays out the topics with t-SNE instead of the default method — confirm in the pyLDAvis docs.
log_tsne = sklearn_lda.prepare(lda, count_data, count_vectorizer, mds='tsne')

In [None]:
# Show the visualisation prepared with mds='tsne'.
pyLDAvis.display(log_tsne)