# Google Apps for Work Log Analysis Tutorial (1)

In this tutorial, we use the *pyes* Python library to retrieve *Google Apps for Work* log data for clustering, aggregation and visualization.

We transform the log into a user-user matrix and use bokeh to visualize that matrix.

## Step 1: Connect to Elasticsearch via **pyes**

### Installation of pyes
pip install pyes==0.99.2 (the `Search` class used below lives in `pyes.query`; see http://pydoc.net/Python/pyes/0.99.5/pyes.query)

We want a summary of the login locations (countries) for each user.

To get it, we aggregate on two fields: "actor_email" and "geoip_countryName".


In [12]:
import pyes
import json

# Fields to aggregate on: the outer aggregation buckets by `r` (user e-mail)
# and the nested aggregation buckets by `c` (login country).
r = 'actor_email'
c = 'geoip_countryName'
# Elasticsearch endpoint, written as "host:port".
es_address = '114.32.24.166:9200'

conn = pyes.es.ES(es_address)
# The query itself selects everything; the summarising is done by the
# aggregations attached to the search below.
q = pyes.MatchAllQuery()

# 'user' is the outer terms aggregation; 'country' is appended to its
# sub-aggregations so the response nests country buckets under each user
# bucket (later cells read rs.aggs['user']['buckets'][n]['country']).
tagg = pyes.aggs.TermsAgg('user', field=r, sub_aggs=[])
tagg1 = pyes.aggs.TermsAgg('country', field=c)
tagg.sub_aggs.append(tagg1)

qsearch = pyes.query.Search(q)
qsearch.agg.add(tagg)

# NOTE(review): pyes documents the document-type filter of search() as
# `doc_types`; confirm that passing `type=` here is honoured as intended.
rs = conn.search(query=qsearch, indices='google', type="drive")
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(json.dumps(rs.aggs, indent=2))

NoServerAvailable: list index out of range

Set up the row and column index lists that will be used to fill data into the matrix.

In [4]:
# Collect the matrix labels in first-seen order: rowIndexList holds the user
# keys (rows) and colIndexList holds the country keys (columns).
rowIndexList = []
colIndexList = []
# Companion sets make the "already collected?" test O(1); testing membership
# against the lists themselves rescans the whole list on every bucket.
seen_users = set()
seen_countries = set()
for user in rs.aggs['user']['buckets']:
    user_key = user['key']
    # Skip blank accounts and accounts already placed in rowIndexList.
    if user_key != "" and user_key not in seen_users:
        seen_users.add(user_key)
        rowIndexList.append(user_key)

    # Countries are collected from every user bucket, including the ones
    # whose account key was skipped above.
    for country in user['country']['buckets']:
        country_key = country['key']
        if country_key not in seen_countries:
            seen_countries.add(country_key)
            colIndexList.append(country_key)

# Single-argument print(...) works on both Python 2 and Python 3.
print(len(rowIndexList))
print(len(colIndexList))


36
9


Create an empty matrix. In Python, a matrix can be represented as a nested list, i.e., [[...],[...],...]

In [5]:
# One row per entry of rowIndexList and one column per entry of colIndexList,
# every cell starting at 0.0. Each row is built as its own list object, so
# writing into one row never changes another.
matrix = [[0.0] * len(colIndexList) for _ in rowIndexList]

Using rowIndexList and colIndexList, copy each user's login count per country from rs.aggs into the matrix.

In [6]:
# Label -> position lookups, built once. setdefault keeps the FIRST position
# of a label, which is what list.index() would return, while avoiding a scan
# of the whole list for every cell that gets written.
row_position = {}
for position, label in enumerate(rowIndexList):
    row_position.setdefault(label, position)
col_position = {}
for position, label in enumerate(colIndexList):
    col_position.setdefault(label, position)

# Copy each (user, country) doc_count into its matrix cell as a float.
for user in rs.aggs['user']['buckets']:
    row = row_position.get(user['key'])
    if row is None:
        # This user key is not a matrix row (it is absent from rowIndexList).
        continue
    for country in user['country']['buckets']:
        col = col_position.get(country['key'])
        if col is not None:
            matrix[row][col] = float(country['doc_count'])

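For the computations that follow we rely on the numpy package, which handles linear algebra and related maths for us, so we convert `matrix` from a nested Python list into a numpy array.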

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# data1: how many times each user (row) logged in from each source country
# (column), as a numpy array built from the nested list.
data1 = np.array(matrix)

# data_norm: the same counts divided by a single scalar, the norm of the
# whole array (for a 2-D array np.linalg.norm defaults to the Frobenius
# norm), which brings the values into the 0..1 range.
# NOTE(review): this scales the matrix as a whole, not each user row
# separately - confirm that is the intended normalisation.
whole_matrix_norm = np.linalg.norm(data1)
data_norm = data1 / whole_matrix_norm

# dist_data: pairwise cosine similarity between user rows of the raw counts.
# NOTE(review): despite the "dist" prefix this holds similarities, not
# distances; it is the array the later cells turn into link values.
dist_data = cosine_similarity(data1)

# dist_data_norm: user-user distance matrix (proximity / adjacency matrix),
# computed as 1 - cosine similarity of the scaled counts.
similarity_of_scaled = cosine_similarity(data_norm)
dist_data_norm = 1 - similarity_of_scaled

<img src="images/user-user.png" />

In [8]:
# Every user label becomes one node; all nodes are placed in group 1.
nodes = [{'group': 1, 'name': user} for user in rowIndexList]

# One link per (row, column) cell of dist_data, in row-major order. 'source'
# and 'target' are the cell's indices, and 'value' is the cell converted to a
# plain Python float.
links = [
    {'source': i, 'target': j, 'value': float(value)}
    for i, row in enumerate(dist_data)
    for j, value in enumerate(row)
]

# Node-link structure consumed by the plotting cell.
data = {'links': links, 'nodes': nodes}

In [9]:
import numpy as np

from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource



nodes = data['nodes']
# Axis labels: node names ordered by group (sorted() is stable, so nodes
# that share a group keep their original relative order).
names = [entry['name'] for entry in sorted(nodes, key=lambda entry: entry['group'])]

# Dense N x N value matrix filled from the link list; each link is written
# in both directions.
N = len(nodes)
counts = np.zeros((N, N))
for edge in data['links']:
    src, dst, weight = edge['source'], edge['target'], edge['value']
    counts[src, dst] = weight
    counts[dst, src] = weight

colormap = ["#444444", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99",
            "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"]

# Flatten the N x N grid, row-major, into the parallel columns bokeh expects.
xname = [first['name'] for first in nodes for _ in nodes]
yname = [second['name'] for _ in nodes for second in nodes]
# Same-group cells take the group's colormap entry; mixed cells are grey.
color = [
    colormap[first['group']] if first['group'] == second['group'] else 'lightgrey'
    for first in nodes
    for second in nodes
]
# Opacity grows with the cell value: value / 4 capped at 0.9, plus a 0.1
# floor so that zero-valued cells remain faintly visible.
alpha = [
    min(counts[row, col] / 4.0, 0.9) + 0.1
    for row in range(N)
    for col in range(N)
]

source = ColumnDataSource(data=dict(
    xname=xname,
    yname=yname,
    colors=color,
    alphas=alpha,
    count=counts.flatten(),
))

# NOTE(review): the title and output file name mention "Les Mis" although the
# data plotted here is the user-user matrix - confirm whether that is intended.
p = figure(title="Les Mis Occurrences",
           x_axis_location="above", tools="hover,save",
           x_range=list(reversed(names)), y_range=names)

# Square canvas with grid lines, axis lines and tick marks hidden, leaving
# only small labels (x labels rotated by pi/3).
p.plot_width = 800
p.plot_height = 800
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3

# One 0.9 x 0.9 rectangle per (xname, yname) pair, coloured and faded by the
# 'colors' and 'alphas' columns of the data source.
p.rect('xname', 'yname', 0.9, 0.9, source=source,
       color='colors', alpha='alphas', line_color=None)

# Hovering a cell shows the two names and the underlying value.
p.select_one(HoverTool).tooltips = [
    ('names', '@yname, @xname'),
    ('count', '@count'),
]

output_file("les_mis.html", title="les_mis.py example")

show(p)  # show the plot