In [88]:
import pyes
import time
import re

conn = pyes.es.ES("127.0.0.1:9200")
 
#term query (text)
tq1 = pyes.query.TermQuery(field="text", value="security")
tq2 = pyes.query.TermQuery(field="text", value="infosec")
tq3 = pyes.query.TermQuery(field="text", value="detection")
tq4 = pyes.query.TermQuery(field="text", value="malware")
tq5 = pyes.query.TermQuery(field="text", value="virus")
tq6 = pyes.query.TermQuery(field="text", value="hacker")
tq7 = pyes.query.TermQuery(field="text", value="anti")
tq8 = pyes.query.TermQuery(field="text", value="defense")

#convert time to timestamp
start_time = int(time.mktime(time.strptime('2010-01-01 00:00:00', '%Y-%m-%d %H:%M:%S'))) * 1000
end_time = int(time.mktime(time.strptime('2010-12-31 23:59:00', '%Y-%m-%d %H:%M:%S'))) * 1000

#range query (time)
ESR = pyes.ESRange(field="created_at_linux_timestamp", from_value=start_time, to_value=end_time, 
                   include_lower=True ,include_upper=False)
rq = pyes.query.RangeQuery(qrange=ESR)

#bool query (must/must_not/should)
q = pyes.query.BoolQuery(must=[rq], should=[tq1, tq2, tq3, tq4, tq5, tq6, tq7, tq8])

#search result
result = conn.search(query=q , indices='twitter2' , doc_types='tweet')


mon = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
#with url or without url inside text
url = {'t':0,'f':0}
#url impression
url_imp = '(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'

#set by user id
uid_set = set()
#set by user id per month
mon_uid_set = {}


for i in mon:
    mon_uid_set[i] = {'twitter':[],'tweet':[]}
    mon_uid_set[i]['twitter'] = set()

for i in result:
    uid_set.add(i['uid'])
    m = i['created_at'].split(' ')[1]
    mon_uid_set[m]['twitter'].add(i['uid'])
    mon_uid_set[m]['tweet'].append(i)
    if re.findall(url_imp, i['text']):
        url['t'] += 1
    else:
        url['f'] += 1    

print("The amount of twitter:", len(uid_set))
print()

print("The amount of tweets:",len(result))
print()

print("Month:\tTwitter\ttweets")
for i in mon:
    print(i, ":\t", len(mon_uid_set[i]['twitter']), "\t", len(mon_uid_set[i]['tweet']))

print()

print("With URL:\t", url['t'])
print("Without URL:\t", url['f'])

The amount of twitter: 35

The amount of tweets: 1309

Month:	Twitter	tweets
Jan :	 15 	 63
Feb :	 14 	 65
Mar :	 21 	 104
Apr :	 18 	 95
May :	 24 	 109
Jun :	 22 	 94
Jul :	 17 	 71
Aug :	 22 	 115
Sep :	 27 	 144
Oct :	 24 	 154
Nov :	 24 	 137
Dec :	 21 	 158

With URL:	 997
Without URL:	 312


In [89]:
import numpy as np

#five important number for each month
mon_num = {}
#use to draw boxplot
data_mon = {'month':[], 'number':[]}
#the id may be an expert
maybe_expert = []

for i in mon:
    mon_num[i] = {'min':0,'q1':0,'avg':0,'q3':0,'max':0}
    user_mon_tweet_amount = {}
    for j in mon_uid_set[i]['twitter']:
        user_mon_tweet_amount[j] = 0
    for j in mon_uid_set[i]['tweet']:
        user_mon_tweet_amount[j['uid']] += 1
    num = list(user_mon_tweet_amount.values())
    
    for j in num:
        data_mon['month'].append(i)
        data_mon['number'].append(j)
        
    #sort num from minimum to maximum
    num.sort()
    qu = np.percentile(num, np.arange(0, 100, 25))
    mon_num[i]['min'] = num[0]
    mon_num[i]['q1'] = qu[1]
    mon_num[i]['avg'] = round(len(mon_uid_set[i]['tweet'])/len(mon_uid_set[i]['twitter']),2)
    mon_num[i]['q3'] = qu[3]
    mon_num[i]['max'] = num[len(num)-1]
    
    for j in mon_uid_set[i]['twitter']:
        if user_mon_tweet_amount[j] > mon_num[i]['avg']:
            maybe_expert.append(j)

for i in mon:
    print(i,"\t",mon_num[i])

Jan 	 {'max': 21, 'min': 1, 'q1': 1.0, 'avg': 4.2, 'q3': 4.0}
Feb 	 {'max': 14, 'min': 1, 'q1': 1.0, 'avg': 4.64, 'q3': 5.75}
Mar 	 {'max': 17, 'min': 1, 'q1': 2.0, 'avg': 4.95, 'q3': 7.0}
Apr 	 {'max': 15, 'min': 2, 'q1': 3.0, 'avg': 5.28, 'q3': 6.75}
May 	 {'max': 15, 'min': 1, 'q1': 1.0, 'avg': 4.54, 'q3': 7.25}
Jun 	 {'max': 13, 'min': 1, 'q1': 1.0, 'avg': 4.27, 'q3': 5.0}
Jul 	 {'max': 12, 'min': 1, 'q1': 2.0, 'avg': 4.18, 'q3': 6.0}
Aug 	 {'max': 25, 'min': 1, 'q1': 2.0, 'avg': 5.23, 'q3': 8.0}
Sep 	 {'max': 25, 'min': 1, 'q1': 1.0, 'avg': 5.33, 'q3': 7.0}
Oct 	 {'max': 33, 'min': 1, 'q1': 1.0, 'avg': 6.42, 'q3': 9.0}
Nov 	 {'max': 27, 'min': 1, 'q1': 1.0, 'avg': 5.71, 'q3': 7.25}
Dec 	 {'max': 34, 'min': 1, 'q1': 2.0, 'avg': 7.52, 'q3': 9.0}


In [90]:
from bokeh.charts import Bar, output_notebook, show
from bokeh.charts.attributes import cat, color
from bokeh.layouts import row

# best support is with data in a format that is table-like
data = {'month':[], 'twitter':[], 'tweet':[]}

for i in mon:
    data['month'].append(i)
    data['twitter'].append(len(mon_uid_set[i]['twitter']))
    data['tweet'].append(len(mon_uid_set[i]['tweet']))

twitter_bar = Bar(data, values='twitter', label=cat(columns='month', sort=False), 
                  title="The number of Twitter in 2010", legend='top_right', color='blue', 
                  plot_width=400, plot_height=400)

tweet_bar = Bar(data, values='tweet', label=cat(columns='month', sort=False), 
                title="The number of Tweets in 2010", legend='top_right', color='brown',
                plot_width=400, plot_height=400)

output_notebook()
show(row(twitter_bar, tweet_bar))

In [91]:
from bokeh.charts import Donut

pie = Donut([url['f'],url['t']])

output_notebook()
show(pie)

In [92]:
from bokeh.charts import BoxPlot

#data_mon is built in the second block

box = BoxPlot(data_mon, values='number', label=cat(columns='month', sort=False), 
              title="The number of tweets of each twitter in 2010", color='month',
              plot_width=400, plot_height=400)

output_notebook()
show(box)

In [93]:
#find expert
expert = set()
for i in maybe_expert:
    expert.add(i)
    
ex_num = {'id':[],'num':[]}

for i in expert:
    ex_num['id'].append(i)
    ex_num['num'].append(maybe_expert.count(i))

expert_bar = Bar(ex_num, values='num', label='id', 
                title="The possible experts of InfoSec in 2010", legend='top_right', color='id')

output_notebook()
show(expert_bar)

In [54]:
analysis = {'twitter':[],'label':[0,0,0,1,1,0,0,1,2,1,1,1,1,0,0,0,1,0,0,1,0,0],
            'keyword':['security','infosec','detection','malware','virus','hacker','anti','defense']}

res_for_a = {}

for i in expert:
    analysis['twitter'].append(i)
    res_for_a[i] = []
    
    #search the tweets the twitter said
    aq = pyes.query.TermQuery(field="uid", value=i)
    bq = pyes.query.BoolQuery(must=[rq,aq], should=[tq1, tq2, tq3, tq4, tq5, tq6, tq7, tq8])
    
    res = conn.search(query=bq , indices='twitter2' , doc_types='tweet')
    for j in res:
        res_for_a[i].append(j['text'])
    
    #show the description of the twitter -- for labelling
    #print(i,'\t',res[0]['user']['description'])

{'label': [0, 0, 0, 1, 1, 0, 0, 1, 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0], 'twitter': [17775619, 21501463, 9956632, 40072739, 16935717, 97450156, 13850552, 15270591, 19206209, 16103879, 14780493, 14118608, 18252252, 20492381, 18476766, 38956896, 100361060, 83342567, 17604714, 3247471, 135907568, 14415986], 'keyword': ['security', 'infosec', 'detection', 'malware', 'virus', 'hacker', 'anti', 'defense']}


In [132]:
matrix = [[0 for x in range(len(analysis['keyword']))] for y in range(len(analysis['twitter']))]

for i in range(len(analysis['twitter'])):
    for j in range(len(analysis['keyword'])):
        for k in res_for_a[analysis['twitter'][i]]:
            if re.findall(analysis['keyword'][j], k):
                matrix[i][j] = 1
                break

print(Matrix)

[[1, 0, 0, 1, 0, 1, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 1], [1, 1, 1, 1, 0, 0, 0, 1], [1, 1, 1, 1, 1, 1, 0, 0], [1, 0, 0, 0, 1, 1, 0, 1], [1, 0, 1, 0, 0, 0, 0, 0], [1, 1, 0, 1, 1, 1, 1, 0], [1, 0, 0, 1, 1, 1, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 1, 1], [1, 0, 0, 0, 0, 1, 1, 1], [1, 1, 0, 1, 0, 1, 1, 1], [1, 0, 1, 1, 1, 1, 0, 0], [1, 0, 0, 1, 0, 1, 1, 0], [1, 0, 1, 1, 0, 0, 0, 1], [1, 0, 1, 1, 1, 1, 1, 0], [1, 0, 0, 0, 1, 0, 1, 1], [1, 1, 0, 1, 1, 0, 0, 1], [1, 1, 0, 1, 1, 1, 1, 0], [1, 1, 1, 0, 0, 0, 1, 0], [1, 1, 1, 1, 1, 1, 1, 0]]


In [134]:
#decision tree algorithm
from sklearn import tree as t

mtoa = np.array(matrix)
ltoa = np.array(analysis['label'])

analysis_data = {'label_name':[0,1,2],'data':mtoa,'label':ltoa,
                 'keyword':['security','infosec','detection','malware','virus','hacker','anti','defense']}
x = mtoa
y = ltoa

dt = t.DecisionTreeClassifier(max_depth = 5)
dt = dt.fit(x,y)
print(dt)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')


In [143]:
from bokeh.plotting import figure

colormap = {0: 'red', 1: 'green', 2: 'blue'}
colors = [colormap[x] for x in analysis_data['label']]


fig_data = {}
for i in analysis_data['keyword']:
    fig_data[i] = []
for i in range(len(matrix)):
    for j in range(len(matrix[i])):
        fig_data[analysis_data['keyword'][j]].append(matrix[i][j])

for i in analysis_data['keyword']:
    fig = figure(title = "Analysis",plot_width=400, plot_height=400)
    fig.xaxis.axis_label = 'Label'
    fig.yaxis.axis_label = i

    fig.circle(analysis_data['label'], fig_data[i],
                color=colors, fill_alpha=0.2, size=10)

    output_notebook

    show(fig)

In [130]:
from IPython.display import Image

dot_data = t.export_graphviz(dt, out_file=None,
                             feature_names=analysis_data['keyword'],  
                             class_names=analysis_data['label_name'])  
graph = pydotplus.graph_from_dot_data(dot_data)  
Image(graph.create_png())

AttributeError: 'dict' object has no attribute 'keyword'

In [None]:
from bokeh.charts import HeatMap

hm = HeatMap()