# 範例宗旨

我們使用下列的程式碼來說明如何將ElasticSearch Aggregation的結果轉換成矩陣。

In [6]:
import pyes
import json

# Client for the local Elasticsearch node; the base query matches every document.
conn = pyes.es.ES('localhost:9200')
q = pyes.MatchAllQuery()

# Two-level terms aggregation over actor_email and geoip_countryName:
# bucket by account first, then by the country each login came from.
# Expected result: userA logged in N times from "us", M times from "tw", ...
# The country aggregation is created first and handed to the account
# aggregation as its only sub-aggregation.
tagg1 = pyes.aggs.TermsAgg('country', field='geoip_countryName')
tagg = pyes.aggs.TermsAgg('user', field='actor_email', sub_aggs=[tagg1])

# `Search` lives in pyes.query, see http://pydoc.net/Python/pyes/0.99.5/pyes.query
qsearch = pyes.query.Search(q)
qsearch.agg.add(tagg)

# Run the search and pretty-print the aggregation part of the response.
rs = conn.search(query=qsearch, indices='google', type="drive")
print(json.dumps(rs.aggs, indent=2))

IndexMissingException: [google] missing

建立row與col的index List，用來對應後續es雙層aggregation的資料，以便填入矩陣中

In [None]:
# Build the row (account) and column (country) index lists. A label's
# position in its list is the matrix row / column it will occupy.
rowIndexList = []
colIndexList = []
# Sets shadow the two lists so each "already seen?" check is a hash lookup
# instead of a scan over the whole list.
seen_users = set()
seen_countries = set()
for user in rs.aggs['user']['buckets']:
    user_key = user['key']
    # Blank accounts get no row, so countries that only they use would
    # become all-zero columns; skip the whole bucket.
    if user_key == "":
        continue
    if user_key not in seen_users:
        seen_users.add(user_key)
        rowIndexList.append(user_key)

    for country in user['country']['buckets']:
        country_key = country['key']
        if country_key not in seen_countries:
            seen_countries.add(country_key)
            colIndexList.append(country_key)

# Call form of print: valid on both Python 2 and Python 3.
print(len(rowIndexList))
print(len(colIndexList))


建立並初始化一個空白矩陣。在 Python 中，矩陣以「List 包 List」的形式表示：[[...], [...], ...]，
每個元素的初始值為 0.0。

In [None]:
# Zero-filled count matrix: one row per entry of rowIndexList, one column
# per entry of colIndexList. Every row is a separate list object, so
# writing one cell never affects another row.
matrix = [[0.0] * len(colIndexList) for _ in rowIndexList]

借重rowIndexList與colIndexList, 將rs.aggs使用者與國家的次數，填入matrix裡面

In [None]:
# Copy the per-(account, country) document counts from the aggregation
# result into matrix.
#
# Label -> position maps are built once, so each lookup below is a dict
# access instead of an `in` scan plus a list.index() scan per bucket.
# setdefault keeps the first position of a label, exactly like list.index().
row_of = {}
for idx, key in enumerate(rowIndexList):
    row_of.setdefault(key, idx)
col_of = {}
for idx, key in enumerate(colIndexList):
    col_of.setdefault(key, idx)

for user in rs.aggs['user']['buckets']:
    row = row_of.get(user['key'])
    if row is None:
        # This account has no row in rowIndexList; nothing to fill in.
        continue
    for country in user['country']['buckets']:
        col = col_of.get(country['key'])
        if col is not None:
            matrix[row][col] = float(country['doc_count'])

為了後續運算，我們借重 numpy 套件來進行線性代數等相關數學處理，因此將 matrix 這個雙層 Python List 轉換為 numpy 的陣列（ndarray）格式。

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Login counts per account (rows) and source country (columns).
data1 = np.array(matrix)
# Counts rescaled into the 0..1 range by dividing by the norm of the whole
# matrix. If every count is zero the norm is 0 and the division would fill
# the array with NaN, which cosine_similarity rejects; keep the zeros then.
total_norm = np.linalg.norm(data1)
data_norm = data1 / total_norm if total_norm else data1.copy()
# Account-to-account cosine distance matrix (1 - cosine similarity).
dist_data_norm = 1 - cosine_similarity(data_norm)

# NOTE: despite the name, dist_data holds cosine *similarity* (larger means
# more alike), not a distance.
dist_data = cosine_similarity(data1)

In [None]:
# One node per account in rowIndexList; every node is put in group 1.
nodes = [{'group': 1, 'name': user} for user in rowIndexList]

# One link per ordered pair (i, j) of dist_data, the diagonal included.
# The link value is the matching dist_data entry as a plain Python float.
links = [
    {'source': i, 'target': j, 'value': float(value)}
    for i, row in enumerate(dist_data)
    for j, value in enumerate(row)
]

# Bundle both lists into a single node/link dictionary.
data = {'links': links, 'nodes': nodes}

In [None]:
import numpy as np

from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource



# Nodes in their original order, plus the axis labels ordered by group.
nodes = data['nodes']
nodes_by_group = sorted(nodes, key=lambda node: node['group'])
names = [node['name'] for node in nodes_by_group]

# Dense N x N matrix of link values; each link is written in both directions.
N = len(nodes)
counts = np.zeros((N, N))
for link in data['links']:
    src, dst, weight = link['source'], link['target'], link['value']
    counts[src, dst] = weight
    counts[dst, src] = weight

colormap = [
    "#444444", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99",
    "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a",
]

# One entry per (i, j) cell in row-major order, the same order that
# counts.flatten() produces below.
cells = [(i, j) for i in range(N) for j in range(N)]
xname = [nodes[i]['name'] for i, j in cells]
yname = [nodes[j]['name'] for i, j in cells]
# Opacity grows with the cell value and is capped at 1.0.
alpha = [min(counts[i, j] / 4.0, 0.9) + 0.1 for i, j in cells]
# Cells whose two nodes share a group take that group's colour from
# colormap; cells that mix groups are drawn light grey.
color = [
    colormap[nodes[i]['group']]
    if nodes[i]['group'] == nodes[j]['group']
    else 'lightgrey'
    for i, j in cells
]

source = ColumnDataSource(data={
    'xname': xname,
    'yname': yname,
    'colors': color,
    'alphas': alpha,
    'count': counts.flatten(),
})

# NOTE(review): the title and the output file name/title below still say
# "Les Mis" - presumably carried over from an example; confirm before renaming.
p = figure(
    title="Les Mis Occurrences",
    x_axis_location="above",
    tools="hover,save",
    x_range=names[::-1],
    y_range=names,
)

# Canvas size.
p.plot_width = 800
p.plot_height = 800

# No grid lines, axis lines or ticks; small, rotated labels.
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi/3

# One 0.9 x 0.9 rectangle per cell, coloured and shaded from the data source.
p.rect(
    'xname', 'yname', 0.9, 0.9,
    source=source,
    color='colors',
    alpha='alphas',
    line_color=None,
)

# Hover shows the pair of names and the cell value.
hover = p.select_one(HoverTool)
hover.tooltips = [
    ('names', '@yname, @xname'),
    ('count', '@count'),
]

output_file("les_mis.html", title="les_mis.py example")

show(p)  # write the HTML file and display the plot