In [1]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from pandasticsearch import Select, DataFrame

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Notebook - Kibana lookalike

In [3]:
# Display a remote image, referenced by its URL, as the output of this cell.
from IPython.display import Image

image = "https://i0.wp.com/sisypheanhigh.com/foot/wp-content/uploads/2015/08/sisyphus-red.png"
Image(url=image)

In [4]:
# Elasticsearch endpoint that every query in this notebook is sent to.
hostname = "http://noname-sms.us.cray.com:30200"

# Client handle used by the cells below; HTTP compression is switched on.
client = Elasticsearch(hostname, http_compress=True)

In [5]:
# Ask the cluster for its health report and show it as the cell output.
cluster_health = client.cluster.health()
cluster_health

{'cluster_name': 'elasticsearch',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'active_primary_shards': 6,
 'active_shards': 6,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 6,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 50.0}

In [6]:
# Call client.info() and keep the response in client_info.
client_info = client.info()

# NOTE(review): the value displayed by this cell is the client object itself,
# not client_info -- confirm which of the two was meant to be shown.
client

<Elasticsearch([{'host': 'noname-sms.us.cray.com', 'port': 30200}])>

## Read the data from elasticsearch

__Note:__ We use a wildcard on the `shasta-logs-*` index pattern, so we are pulling from _ALL_ data sources.

In [57]:
# Query every index matching "shasta-logs-*" for documents whose `timereported`
# falls on 2019-07-29 and bucket them with a 30-second date histogram.
# The cells below read only resp['hits']['total'] and the buckets of the
# aggregation named "2".
resp = client.search(
    index="shasta-logs-*",
    body={
        "size": 0,
        "query": {
            "bool": {
                "must": [
                    {"match_all": {}},
                    {
                        "range": {
                            "timereported": {
                                "gte": "2019-07-29",
                                "lte": "2019-07-29",
                                "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis",
                            }
                        }
                    },
                ],
                "must_not": [],
            }
        },
        "_source": {"excludes": []},
        "aggs": {
            # The aggregation key "2" is what later cells use to find the buckets.
            "2": {
                "date_histogram": {
                    "field": "timereported",
                    "interval": "30s",
                    "time_zone": "America/Chicago",
                    "min_doc_count": 1,
                }
            }
        },
    },
)

In [59]:
# Report the total hit count carried in the search response.
total_hits = resp['hits']['total']
print("Number of responses:%d." % total_hits)

Number of responses:5942356.


In [60]:
# Pull the bucket list out of the date histogram (aggregation "2").
histogram = resp['aggregations']['2']
table = histogram['buckets']

In [61]:
# Turn the histogram buckets into a DataFrame and discard the "key" column;
# the remaining columns are key_as_string and doc_count.
pd_table = pd.DataFrame.from_dict(table).drop(columns="key")
pd_table.head(5)

Unnamed: 0,key_as_string,doc_count
0,2019-07-29T03:02:30.000-05:00,6
1,2019-07-29T03:03:00.000-05:00,752
2,2019-07-29T03:03:30.000-05:00,207
3,2019-07-29T03:04:00.000-05:00,27
4,2019-07-29T03:04:30.000-05:00,578


Generate a statistical summary of the message counts.

In [62]:
# Descriptive statistics of the number of documents per histogram bucket.
doc_counts = pd_table['doc_count']
doc_counts.describe()

count     1842.000000
mean      3226.034745
std       2407.366949
min          6.000000
25%       1680.250000
50%       2293.500000
75%       4727.250000
max      30955.000000
Name: doc_count, dtype: float64

In [63]:
#pd_table.columns

In [64]:
#pd_table.shape

In [65]:
#pd_table.hist(column='doc_count', color='#86bf91',grid=False,bins=100)

In [66]:
# Bokeh libraries
from bokeh.io import output_file, output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.layouts import row, column, gridplot
from bokeh.models.widgets import Tabs, Panel
from bokeh.models.tools import HoverTool
from bokeh.models import DatetimeTickFormatter
from math import pi

In [67]:
# Render Bokeh output inline in the Jupyter notebook.
output_notebook()

In [68]:
# Parse the bucket labels in key_as_string (e.g. 2019-07-29T03:02:30.000-05:00)
# into timezone-aware datetimes, in place, so the plots can use a datetime axis.
#
# The dtype guard makes the cell safe to re-run: once the column has been
# converted, astype(str) would produce strings that no longer match the
# format below and pd.to_datetime would raise.
if not pd.api.types.is_datetime64_any_dtype(pd_table['key_as_string']):
    pd_table['key_as_string'] = pd.to_datetime(
        pd_table['key_as_string'].astype(str),
        format='%Y-%m-%dT%H:%M:%S.%f%z',
    )

In [69]:
# Number of doc_count entries, i.e. histogram buckets available for plotting.
doc_count_column = pd_table['doc_count']
count = doc_count_column.count()
print("Number of entries: %d." % count)

Number of entries: 1842.


In [70]:
# Plot at most SAMPLE_SIZE randomly chosen buckets (presumably to keep the
# figures light). A fixed random_state makes the sample, and therefore the
# plots built from it, reproducible when the notebook is re-run.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42
sample = pd_table.sample(n=min(int(count), SAMPLE_SIZE), random_state=SAMPLE_SEED)

# Column values as plain lists; these become the x / y data of the plots.
X = sample['key_as_string'].tolist()
Y = sample['doc_count'].tolist()

In [85]:
# Scatter plot of the sampled per-bucket message counts over time.
p = figure(x_axis_type="datetime", plot_width=1200, plot_height=400)
source = ColumnDataSource(data={
    'datetime': X,
    'count': Y,
})
p.circle(x='datetime', y='count', source=source, size=5, color='darkred', alpha=0.3)

# The histogram interval requested in the query is "30s", so the title and the
# x-axis label say seconds, matching the hover tooltip label.
p.title.text = 'Message Counts per 30 seconds'
p.background_fill_color = "#f5f5f5"
p.grid.grid_line_color = "white"
p.yaxis.axis_label = 'Count'
p.xaxis.axis_label = 'timereported per 30 seconds'
p.xaxis.formatter = DatetimeTickFormatter(
    hours=["%d %B %Y"],
    days=["%d %B %Y"],
    months=["%d %B %Y"],
    years=["%d %B %Y"],
)
p.xaxis.major_label_orientation = pi/4

# NOTE(review): newer Bokeh releases expect the formatter keys to carry the
# '@' prefix ('@count', '@datetime') -- confirm against the installed version.
hover = HoverTool(
    tooltips=[
        ('Count', '@count{%d}'),
        ('timereported per 30 seconds', '@datetime{%Y-%m-%d %H:%M:%S.%3N}')
    ],
    formatters={
        'count': 'printf',
        'datetime': 'datetime',
    },
    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline'
)

p.add_tools(hover)
show(p)

## Let's try a bar (vbar) graph

In [89]:
# Bar chart of the same sampled data, read from the existing `source`.
# On a datetime axis Bokeh measures bar width in milliseconds, so each bar is
# given the width of one 30-second histogram bucket.
BUCKET_WIDTH_MS = 30 * 1000
p2 = figure(x_axis_type="datetime", plot_width=1200, plot_height=400, title='Message Counts per 30 seconds')
p2.vbar(x='datetime', top='count', source=source, width=BUCKET_WIDTH_MS, color='darkred', alpha=0.3)
# Hide the vertical grid lines and start the y axis at zero.
p2.xgrid.grid_line_color = None
p2.y_range.start = 0
show(p2)