# Analysis of Google Apps for Work Logs: Tutorial (2)
> Using the low-level Elasticsearch Python client to connect to the repository

> Using Bokeh (http://bokeh.pydata.org/en/latest/) to visualize the result

# Step 1: Connect to the repository via the low-level Elasticsearch Python client

In [4]:
from elasticsearch import Elasticsearch

import numpy as np
import os  # for os.path.basename
import json

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import MDS
from sklearn.cluster import KMeans

import pandas as pd

from bokeh.charts import Scatter, output_file, show
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource

# Connect to the Elasticsearch host and run one search against the "google"
# index. The result is kept in `response` for the following cells.
es_address = '114.32.24.166'
client = Elasticsearch(es_address)

# Query part: exclude documents whose actor_email is the empty string and
# require event_type to be "access".
# Alternative `must` clauses that can be swapped in:
#   [{"terms": {"event_type": ["acl_change"]}}]
#   [{"terms": {"event_type": ["access"]}},
#    {"terms": {"event_name": ["upload", "download"]}}]
#   [{"terms": {"event_type": ["access"]}},
#    {"terms": {"event_name": ["edit", "rename", "preview", "view", "move", "copy"]}}]
event_filter = {
    "bool": {
        "must_not": [{"term": {"actor_email": ""}}],
        "must": [{"terms": {"event_type": ["access"]}}],
    }
}

# Aggregation part: a terms aggregation on actor_email (size 100) with a
# nested terms aggregation on event_name (size 100) inside each actor bucket.
per_event = {
    "event_name": {
        "terms": {"field": "event_name", "size": 100},
    }
}
per_actor = {
    "actor_name": {
        "terms": {"field": "actor_email", "size": 100},
        "aggs": per_event,
    }
}

response = client.search(
    index="google",
    body={"query": event_filter, "aggs": per_actor},
)

# Step 2: Create matrix from query result
> Input: JSON result

> Output: Python nested list

In [6]:
# Build a dense actor x event-name matrix from the nested aggregation result.
#   rowIndexList     - actor keys in first-seen order (row labels)
#   colIndexList     - event-name keys in first-seen order (column labels)
#   eventCountByUser - {actor key: {event-name key: doc_count}}
#   matrix           - nested list; matrix[r][c] is the doc_count for that
#                      actor/event pair, or 0.0 when the pair has no bucket
eventCountByUser = {}
rowIndexList = []
colIndexList = []

# Label -> position maps. They replace the repeated `in list` membership
# tests and `list.index()` calls, which made the build quadratic in the
# number of actors and events.
row_position = {}
col_position = {}

for account in response["aggregations"]["actor_name"]["buckets"]:
    actor = account["key"]
    if actor not in row_position:
        row_position[actor] = len(rowIndexList)
        rowIndexList.append(actor)
    for e in account["event_name"]["buckets"]:
        event = e["key"]
        if event not in col_position:
            col_position[event] = len(colIndexList)
            colIndexList.append(event)
        # setdefault creates the per-actor dict on first use, so one
        # assignment covers both the "new actor" and "known actor" cases.
        eventCountByUser.setdefault(actor, {})[event] = e["doc_count"]

# Start from an all-zero grid (one independent row list per actor), then
# fill in the counts that were actually observed.
matrix = [[0.0] * len(colIndexList) for _ in rowIndexList]
for actor, counts_by_event in eventCountByUser.items():
    target_row = matrix[row_position[actor]]
    for event, doc_count in counts_by_event.items():
        target_row[col_position[event]] = doc_count

# Step 3: Convert the Python matrix to a NumPy array, then compute similarities, MDS coordinates, and clusters

In [7]:
# Convert the count matrix to a NumPy array and derive from it:
#   sim    - pairwise cosine similarity between rows
#   dist   - 1 - sim, used as the precomputed dissimilarity for MDS
#   xs, ys - 2-D MDS coordinates of each row
#   labels - KMeans cluster label of each row (8-cluster estimator)
data = np.array(matrix)
# np.linalg.norm without an axis on a 2-D array is the Frobenius norm, so
# this rescales the whole matrix by a single scalar (not row by row).
data = data / np.linalg.norm(data)

# Compute the similarity matrix once and derive the distance from it,
# instead of calling cosine_similarity twice on the same input.
sim = cosine_similarity(data)
dist = 1 - sim

# (The bare `MDS()` call that used to precede this line constructed an
# estimator and discarded it; it had no effect and has been removed.)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)
xs, ys = pos[:, 0], pos[:, 1]

# Candidate clusterers; only 'k_means_saas_8' is fitted below.
# NOTE(review): none of these set random_state, so the cluster labels may
# differ from run to run - confirm whether reproducibility is needed.
estimators = {'k_means_saas_3': KMeans(n_clusters=3),
              'k_means_saas_8': KMeans(n_clusters=8),
              'k_means_saas_bad_init': KMeans(n_clusters=3, n_init=1, init='random')}
labels = estimators["k_means_saas_8"].fit(data).labels_

# Step 4: Plot the clustering result

In [8]:
# Assemble one record per clustered row (MDS x/y position, cluster label,
# actor e-mail), load the records into a DataFrame and draw a scatter chart
# colored by cluster label.
plotData = [
    {
        "x": xs[idx],
        "y": ys[idx],
        "c": cluster,
        "actor_email": rowIndexList[idx],
    }
    for idx, cluster in enumerate(labels)
]

df = pd.DataFrame(plotData)

chart_text = dict(
    title="Scatter Plot of Google Users by \"Access\"",
    xlabel="1st DIM",
    ylabel="2nd DIM",
)
p = Scatter(df, x='x', y='y', color='c', **chart_text)

# Render inline in the notebook.
output_notebook()
show(p)

# Step 5: Transform the data into node-link format

In [8]:
# Re-express the clustering result as a node-link structure:
#   nodes - one {"name", "group"} dict per DataFrame row
#   links - one {"source", "target", "value"} dict for every (i, j) cell of
#           the similarity matrix, including i == j and both directions
# NOTE: `data` is rebound here; it no longer refers to the NumPy array that
# carried this name in Step 3.
nodes = [
    {"name": df["actor_email"][row], "group": df["c"][row]}
    for row in range(len(df))
]

links = []
for src, sim_row in enumerate(sim):
    for dst, weight in enumerate(sim_row):
        links.append({"source": src, "target": dst, "value": weight})

data = {"nodes": nodes, "links": links}

# Step 6: Plot the user-user similarity matrix

In [9]:
# Draw the node-link data as a square grid: one cell per (node, node) pair,
# tinted with the group color when both nodes share a group and light grey
# otherwise, with opacity driven by the link value. The figure is written to
# google_sim.html and shown.
nodes = data['nodes']
# Axis categories: node names ordered by group label.
names = [entry['name'] for entry in sorted(nodes, key=lambda entry: entry['group'])]

# Dense N x N table of link values, written symmetrically.
N = len(nodes)
counts = np.zeros((N, N))
for edge in data['links']:
    src, dst, weight = edge['source'], edge['target'], edge['value']
    counts[src, dst] = weight
    counts[dst, src] = weight

colormap = ["#444444", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99",
            "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"]

# Every ordered pair of nodes, in row-major order so that the per-cell lists
# line up with counts.flatten().
pairs = [
    (row, row_node, col, col_node)
    for row, row_node in enumerate(nodes)
    for col, col_node in enumerate(nodes)
]

xname = [row_node['name'] for _, row_node, _, _ in pairs]
yname = [col_node['name'] for _, _, _, col_node in pairs]
# NOTE(review): the link values come from the cosine-similarity matrix built
# in Step 5, so value / 4.0 presumably never reaches the 0.9 cap and alpha
# stays in a narrow band just above 0.1 - confirm the divisor is intended.
alpha = [min(counts[row, col] / 4.0, 0.9) + 0.1 for row, _, col, _ in pairs]
color = [
    colormap[row_node['group']]
    if row_node['group'] == col_node['group']
    else 'lightgrey'
    for _, row_node, _, col_node in pairs
]

temp = {
    "xname": xname,
    "yname": yname,
    "colors": color,
    "alphas": alpha,
    "count": counts.flatten(),
}

source = ColumnDataSource(data=temp)

p = figure(title="Google User-User Similarity Matrix",
           x_axis_location="above", tools="hover,save",
           x_range=list(reversed(names)), y_range=names)

# Figure size.
p.plot_width = 800
p.plot_height = 800

# Strip grid and axis decoration; keep small, rotated tick labels.
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3

# One 0.9 x 0.9 rectangle per cell, positioned by the categorical names.
p.rect('xname', 'yname', 0.9, 0.9, source=source,
       color='colors', alpha='alphas', line_color=None)

hover = p.select_one(HoverTool)
hover.tooltips = [
    ('names', '@yname, @xname'),
    ('count', '@count'),
]

output_file("google_sim.html", title="google similarity matrix")

show(p)  # show the plot