In [1]:
import os
import pickle 
import numpy as np
from elasticsearch import Elasticsearch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime 
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

%matplotlib inline

In [2]:
# Connect to the MMKG Elasticsearch server.
es = Elasticsearch(["http://130.220.208.86:9200"])

# Human-readable topic name -> topic code (the code is the suffix of the index name).
topic_dict = dict([
    ("beef ban", "crl01"),
    ("gun control", "csc02"),
    ("gay marriage", "chr01"),
    ("climate change", "cst01"),
    ("refugee", "cbp02"),
])

# Concept types that get their own flag column; 'Other' and 'NA' are the
# catch-all buckets for unlisted types and for entities without any type.
# Deliberately excluded:
#   'DBpedia:Settlement' (subclass of PopulatedPlace)
#   'DBpedia:Agent' (superclass of person and organization)
concept_types = (
    'DBpedia:Place',
    'DBpedia:Country',
    'DBpedia:City',
    'DBpedia:Organisation',
    'DBpedia:Person',
    'DBpedia:Company',
    'DBpedia:Work',
    'DBpedia:OfficeHolder',
    'DBpedia:Event',
    'DBpedia:EthnicGroup',
    'DBpedia:Disease',
    'DBpedia:MilitaryConflict',
    'DBpedia:MusicalWork',
    'Other',
    'NA',
)

In [3]:
# Scan parameters (stretch goal: enable scrolling).
tpc_str = 'beef ban'    # key into topic_dict
max_docs = 40000        # upper bound on the number of documents to walk through
page_size = 500         # hits requested per page

# Index name for the chosen topic, and a query that matches every document in it.
index = "mmkg-doc-%s" % topic_dict[tpc_str]
query = dict(size=page_size, query={"match_all": {}})

In [None]:
# Open a scroll (kept alive for one hour) and fetch the first page of hits.
res = es.search(body=query, index=index, scroll="1h", _source=True)
# Handle that later es.scroll() calls pass back to request the following pages.
scoll_id = res["_scroll_id"]

In [None]:

# goal 1 - produce concept frequency graphs
#
# Walk the scroll that starts with the page currently held in `res`, count how
# often each concept is mentioned and record which type flags each concept has.
# Produces:
#   df_concept       - one row per concept: 'count', one flag column per entry of
#                      concept_types (1 where set, NaN elsewhere) and a
#                      'SurfaceForms' column that is not filled in yet
#   no_concept_count - {document type: number of documents without 'entities'}
#   cum_doc_cnt      - number of documents actually processed

# 'SurfaceForms' belongs in the column list (it used to be added to the DataFrame)
concept_columns = ['count'] + list(concept_types) + ['SurfaceForms']
#df_type = pd.DataFrame(data=None, columns=['count'])
#df_type.ix["NA", 'count'] = 0

cum_doc_cnt = 0
no_concept_count = {}
concept_counts = {}   # concept -> number of mentions, in first-seen order
concept_flags = {}    # concept -> set of flag columns, decided at first sighting

hits = res['hits']['hits']
# Stop on an empty page as well as on max_docs: once the index is exhausted the
# scroll only returns empty pages and the document counter would never advance.
while hits and cum_doc_cnt < max_docs:
    print("{} processing docs {:10.0f} to {:10.0f}".format(datetime.now(), cum_doc_cnt+1, cum_doc_cnt+len(hits)) )

    for doc in hits:
        d = doc['_source']
        if 'entities' in d:
            for e in d['entities']:
                concept = e['concept']
                if concept not in concept_counts:
                    # first sighting: register the concept and its type flags
                    concept_counts[concept] = 0
                    if 'types' in e:
                        # types outside concept_types are pooled under "Other"
                        concept_flags[concept] = {tt if tt in concept_types else "Other"
                                                  for tt in e['types']}
                    else:
                        concept_flags[concept] = {"NA"}
                # increment concept count
                concept_counts[concept] += 1
        else:
            no_concept_count[d['type']] = no_concept_count.get(d['type'], 0) + 1

    # count the page that was just processed, not the one about to be fetched
    cum_doc_cnt += len(hits)

    # get the next page, always continuing from the most recent scroll id
    res = es.scroll(scroll='1h', scroll_id=scoll_id)
    scoll_id = res['_scroll_id']
    hits = res['hits']['hits']

# Build the table in one go; growing a DataFrame row by row inside the loop is
# slow and depended on the removed `.ix` indexer.
concept_rows = []
for concept, n_mentions in concept_counts.items():
    row = dict.fromkeys(concept_flags[concept], 1)
    row['count'] = n_mentions
    concept_rows.append(row)
df_concept = pd.DataFrame(concept_rows, index=list(concept_counts), columns=concept_columns)

# goal 2 - color by type
# goal 3 - concept frequencies over time
#          (will need the document timestamp, e.g. d['timestamp'][:8])


2017-09-20 13:19:17.132983 processing docs          1 to        500
2017-09-20 13:19:20.114562 processing docs        501 to       1000
2017-09-20 13:19:23.237830 processing docs       1001 to       1500
2017-09-20 13:19:26.566448 processing docs       1501 to       2000
2017-09-20 13:19:28.474424 processing docs       2001 to       2500
2017-09-20 13:19:30.777282 processing docs       2501 to       3000
2017-09-20 13:19:33.226301 processing docs       3001 to       3500
2017-09-20 13:19:35.235437 processing docs       3501 to       4000
2017-09-20 13:19:37.616728 processing docs       4001 to       4500
2017-09-20 13:19:39.124385 processing docs       4501 to       5000
2017-09-20 13:19:41.061077 processing docs       5001 to       5500
2017-09-20 13:19:42.682882 processing docs       5501 to       6000
2017-09-20 13:19:43.217035 processing docs       6001 to       6500
2017-09-20 13:19:43.991564 processing docs       6501 to       7000
2017-09-20 13:19:45.526665 processing docs      

In [None]:
# Report how many documents of each type had no 'entities' field, then save the
# concept table to ../data/concept_stats.<topic>.<document count>.pkl
print(no_concept_count)
pkl_path = os.path.join('../data', 'concept_stats.'+tpc_str+'.'+str(cum_doc_cnt)+'.pkl')
# the context manager flushes and closes the file even if pickling fails
with open(pkl_path, 'wb') as pkl_file:
    pickle.dump({'df_concept':df_concept}, pkl_file)

In [None]:
# Rank-frequency plot of the concepts on log-log axes.

# seaborn context/style are rcParams that are largely read when the axes is
# created, so they have to be set before the figure is built
sns.set_context("talk", font_scale=1.)
sns.set_style("whitegrid")

plt.figure(figsize=(15, 5))
ax1 = plt.subplot(1, 1, 1)

# order the counts so that position really is the rank (rank 1 = most frequent);
# ranks start at 1 because 0 cannot be drawn on a logarithmic axis
ranked_counts = df_concept['count'].astype(float).sort_values(ascending=False).values
concept_ranks = np.arange(1, len(ranked_counts) + 1)

ax1.autoscale(enable=True, tight=True)
ax1.loglog(concept_ranks, ranked_counts)
ax1.set_xlabel('concept rank')
ax1.set_ylabel('concept frequency')

In [None]:
# Inspect the concept table: zero-fill the unset cells, then print slices of the
# frequency ranking.
df_concept.fillna(value=0, inplace=True)

# rank on a copy so the row order of df_concept itself is left untouched
df_ranked = df_concept.sort_values(by="count", axis=0, ascending=False)
# a bare expression in the middle of a cell is discarded, so print explicitly
print( df_ranked.head(20) )

# print out a few percentile spots of the table
num_concept = len(df_ranked)
print(num_concept)
window = 10
for offset in [0, 100, 500, 10000]:
    if offset >= num_concept:
        # the table is shorter than this offset; nothing further to show
        break
    # a slice is clipped to the table length, whereas an explicit list of
    # positions raises as soon as one of them is out of bounds
    print( df_ranked.iloc[offset:offset + window, 0] )

In [None]:
# NOTE(review): df_type is not created in any visible cell (its setup is commented
# out in the concept-counting cell), so this cell presumably relies on leftover
# kernel state - confirm before running on a fresh kernel.
df_type.sort_values(by="count", ascending=False, axis=0, inplace=True)
print( df_type.head(60) )

plt.figure(figsize=(10, 5))
ax1 = plt.subplot(1, 1, 1)

sns.set_context("talk", font_scale=1.)
sns.set_style("white")

# count per type against its position in the sorted table, log-scaled y axis
ax1.semilogy(list(range(len(df_type))), df_type['count'])
#ax1.set_semilogy(True)

clrs = sns.color_palette("muted")



In [None]:
# Sanity-check the page currently held in `res`.
page_hits = res['hits']['hits']
print ( len(page_hits) )                   # number of hits on the page
first_source = page_hits[0]['_source']
print ( len(first_source['entities']) )    # number of entities on the first hit
print ( first_source['timestamp'] )        # timestamp of the first hit
#res['hits']['hits'][0]['_source']['entities']

In [None]:
# Scratch: open a second scroll with two hits per page and a five-minute
# keep-alive. This overwrites both `query` and `res`.
query = dict(size=2, query={"match_all": {}})
res = es.search(body=query, index=index, scroll="5m", _source=True)

In [None]:
# Number of rows in the concept table (one row per distinct concept).
len(df_concept)

In [None]:
# Keep the scroll id of the response currently in `res` for the es.scroll() calls below.
scoll_id = res['_scroll_id']

In [None]:
# Display the stored scroll id.
scoll_id

In [None]:
# Request the next page of the scroll identified by scoll_id (5-minute keep-alive).
res = es.scroll(scroll='5m', scroll_id=scoll_id)

In [None]:
# Display the raw response held in `res`.
res

In [None]:
# Request one more page with the same scroll id (5-minute keep-alive).
res = es.scroll(scroll='5m', scroll_id=scoll_id)

In [None]:
# Display the raw response held in `res`.
res

In [None]:
# Scratch check: adding a scalar to an array shifts every element (-5..4 becomes 0..9).
np.arange(-5, 5) + 5