In [1]:
%matplotlib inline
import IPython
import pylab

import matplotlib.pyplot as plt
import numpy as np
import pylab as py
import scipy as sp

import scipy.signal

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.connections import connections
from datetime import datetime, timedelta
from wordcloud import WordCloud

# Register the default elasticsearch_dsl connection with a generous timeout
# (180) so that slow aggregation queries are not cut short.
connections.create_connection(timeout=180)
# Default figure size as (width, height): wide and short.
pylab.rcParams['figure.figsize'] = (16, 4)
# NOTE: the former `plt.hold(True)` call was dropped on purpose. pyplot.hold
# was deprecated in Matplotlib 2.0 and removed in 3.0, where calling it raises
# AttributeError; adding artists to the existing axes is the default anyway.
plt.close()

In [2]:
# Low-level Elasticsearch client built with the library defaults; term_vectors()
# uses it for the mtermvectors call.
client = Elasticsearch()

In [3]:
def parse_iso_date(str):
    '''Parse a timestamp such as '2016-03-04T05:06:07.250Z' into a naive datetime.

    The fractional-seconds part and the trailing literal 'Z' are mandatory;
    anything else makes datetime.strptime raise ValueError.
    '''
    iso_layout = '%Y-%m-%dT%H:%M:%S.%fZ'
    parsed = datetime.strptime(str, iso_layout)
    return parsed

In [4]:
def hist_data(terms=None, start='1900-01-01', end='2020-01-01'):
    '''Return the per-day document counts of the 'records-hep' index.

    terms -- optional text matched against the `abstract` field; when falsy
             no text query is applied.
    start -- inclusive lower bound on `earliest_date`.
    end   -- exclusive upper bound on `earliest_date`.

    Returns the buckets of the `group_by_date` date histogram.
    '''
    # The [0:0] slice requests no hits; only the aggregation is of interest.
    search = Search(index='records-hep')[0:0]
    if terms:
        search = search.query('match', abstract=terms)

    date_window = {'gte': start, 'lt': end}
    search = search.filter('range', earliest_date=date_window)

    histogram_options = {
        'field': 'earliest_date',
        'interval': 'day',
        'format': 'date_optional_time',
    }
    search.aggs.bucket('group_by_date', 'date_histogram', **histogram_options)

    response = search.execute()
    return response.aggregations.group_by_date.buckets

In [5]:
def unzip(data):
    '''Split histogram buckets into parallel arrays of dates and counts.

    data -- iterable of buckets exposing `key_as_string` (parsed with
            parse_iso_date) and `doc_count`.

    Returns a pair (dates, counts) of numpy arrays. An empty input yields two
    empty arrays instead of failing on the tuple unpacking of an empty zip.
    '''
    pairs = [(parse_iso_date(e.key_as_string), e.doc_count) for e in data]
    if not pairs:
        # zip(*[]) produces nothing to unpack, so handle "no buckets" here.
        return np.array([], dtype=object), np.array([], dtype=int)
    x, y = zip(*pairs)
    return np.array(x), np.array(y)

In [6]:
def hist_data_precise(terms=None, start='1900-01-01', end='2020-01-01', show_query=False):
    '''Per-day document counts of 'hep-slim', skipping first-of-month dates.

    A script filter keeps only documents whose `earliest_date` does not fall
    on day 1 of a month (presumably because imprecise dates default to the
    1st -- confirm against the data).

    terms      -- optional text matched against the `abstract` field.
    start      -- inclusive lower bound on `earliest_date`.
    end        -- exclusive upper bound on `earliest_date`.
    show_query -- when true, print the request body before executing it.

    Returns the buckets of the `group_by_date` date histogram.
    '''
    not_first_of_month = "d = doc['earliest_date'].date; d.getDayOfMonth() != 1"
    date_window = {'gte': start, 'lt': end}

    # The [0:0] slice requests no hits; only the aggregation is of interest.
    search = Search(index='hep-slim')[0:0]
    search = search.filter("script", script=not_first_of_month)
    search = search.filter('range', earliest_date=date_window)

    search.aggs.bucket(
        'group_by_date', 'date_histogram',
        field='earliest_date', interval='day', format='date_optional_time'
    )

    if terms:
        search = search.query('match', abstract=terms)
    if show_query:
        print(search.to_dict())

    response = search.execute()
    return response.aggregations.group_by_date.buckets

In [7]:
def reverse_date_references(show_query=False):
    '''Map each referenced recid to the minimum dates of the papers citing it.

    Documents of 'hep-slim' whose `earliest_date` is not on day 1 of a month
    are grouped by `references.recid`; for every group the minimum of four
    date fields is computed.

    show_query -- when true, print the request body before executing it.

    Returns a dict keyed by recid whose values are dicts with the keys
    'earliest_date', 'preprint_date', 'modification_date' and 'creation_date',
    each a datetime or None when the aggregation produced no value.
    '''
    not_first_of_month = "d = doc['earliest_date'].date; d.getDayOfMonth() != 1"
    search = Search(index='hep-slim')[0:0].filter("script", script=not_first_of_month)

    by_reference = search.aggs.bucket(
        'references', 'terms', field='references.recid', size=0
    )
    # (aggregation name, source field) for every minimum that is collected.
    minimum_specs = (
        ('min_earliest_date', 'earliest_date'),
        ('min_preprint_date', 'preprint_date'),
        ('min_modification_date', 'creation_modification_date.modification_date'),
        ('min_creation_date', 'creation_modification_date.creation_date'),
    )
    for agg_name, source_field in minimum_specs:
        by_reference.bucket(agg_name, 'min', field=source_field)

    if show_query:
        print(search.to_dict())
    buckets = search.execute().aggregations.references.buckets

    def extract_date(elem):
        # A min aggregation without a value carries no `value_as_string`.
        if "value_as_string" in elem:
            return parse_iso_date(elem.value_as_string)
        return None

    return {
        bucket.key: {
            'earliest_date': extract_date(bucket.min_earliest_date),
            'preprint_date': extract_date(bucket.min_preprint_date),  # may be missing
            'modification_date': extract_date(bucket.min_modification_date),
            'creation_date': extract_date(bucket.min_creation_date),
        }
        for bucket in buckets
    }

In [8]:
def precise_recid(show_query=False):
    '''Collect (recid, earliest date) pairs for 'hep-slim' documents.

    Only documents whose `earliest_date` is not on day 1 of a month are
    scanned. The first value of `self_recid` and of `earliest_date` is taken
    from each hit; the date is parsed with the '%Y-%m-%d' layout.

    show_query -- when true, print the request body before scanning.

    Returns a list of (recid, datetime) tuples.
    '''
    not_first_of_month = "d = doc['earliest_date'].date; d.getDayOfMonth() != 1"

    search = Search(index='hep-slim')
    search = search.fields(['self_recid', 'earliest_date'])
    search = search.filter("script", script=not_first_of_month)

    if show_query:
        print(search.to_dict())

    return [
        (hit.self_recid[0], datetime.strptime(hit.earliest_date[0], '%Y-%m-%d'))
        for hit in search.scan()
    ]

In [9]:
def hist_data_selected(terms=None, start='2013-02-01', end='2016-06-01', granularity='day'):
    '''Document counts of 'hep-recent' per time bucket, skipping January 1st.

    The script filter drops only documents whose `earliest_date` is both on
    day 1 and in month 1.

    terms       -- optional text matched against the `abstract` field.
    start       -- inclusive lower bound on `earliest_date`.
    end         -- exclusive upper bound on `earliest_date`.
    granularity -- interval handed to the date histogram (default 'day').

    Returns the buckets of the `group_by_date` date histogram.
    '''
    not_january_first = (
        "d = doc['earliest_date'].date; d.getDayOfMonth() != 1 || d.getMonthOfYear() != 1"
    )

    # The [0:0] slice requests no hits; only the aggregation is of interest.
    search = Search(index='hep-recent')[0:0]
    search = search.filter("script", script=not_january_first)
    if terms:
        search = search.query('match', abstract=terms)

    date_window = {'gte': start, 'lt': end}
    search = search.filter('range', earliest_date=date_window)

    search.aggs.bucket(
        'group_by_date', 'date_histogram',
        field='earliest_date', interval=granularity, format='date_optional_time'
    )

    response = search.execute()
    return response.aggregations.group_by_date.buckets

In [10]:
def interval_ids(date, days=7):
    '''List the document ids of 'hep-analysis' records inside a date window.

    date -- window start as a 'YYYY-MM-DD' string (inclusive).
    days -- window length in days; the end of the window is exclusive.

    Only documents that have an `abstracts.value` field are considered.
    Returns the `meta.id` of every scanned hit as a list.
    '''
    window_start = datetime.strptime(date, '%Y-%m-%d')
    window_end = window_start + timedelta(days=days)

    search = Search(index='hep-analysis').fields(['self_recid'])
    search = search.filter('range', earliest_date={'gte': window_start, 'lt': window_end})
    search = search.filter('exists', field='abstracts.value')

    return [hit.meta.id for hit in search.scan()]

In [11]:
def term_vectors(ids, field, chunk=100):
    '''Fetch the term vectors of `field` for the given 'hep-analysis' documents.

    ids   -- sequence of document ids.
    field -- name of the field whose terms are requested.
    chunk -- number of ids sent per mtermvectors request.

    Returns a list with the `terms` mapping of each document, in response
    order. Raises AssertionError when the number of collected vectors differs
    from len(ids), e.g. because a document has no term vector for `field`.
    '''
    collected = []
    for offset in range(0, len(ids), chunk):
        batch = ids[offset:offset + chunk]
        response = client.mtermvectors(
            index='hep-analysis',
            doc_type='hep',
            ids=batch,
            fields=[field],
            field_statistics=False,
            term_statistics=True,
            offsets=False,
            payloads=False,
            positions=False,
            realtime=True
        )
        collected.extend(
            doc['term_vectors'][field]['terms']
            for doc in response['docs']
            if field in doc['term_vectors']
        )
    # Every requested id must have produced exactly one vector.
    assert len(ids) == len(collected)
    return collected

In [12]:
def fold_vectors(vectors):
    '''Merge per-document term vectors into one table of per-word statistics.

    vectors -- iterable of mappings word -> {'ttf', 'doc_freq', 'term_freq'}.

    Returns a dict word -> {
        'term_total': 'ttf' of the first vector containing the word (estimate),
        'doc_total':  'doc_freq' of that same first vector (estimate),
        'term_freq':  sum of 'term_freq' over all given vectors,
        'doc_freq':   number of given vectors containing the word,
    }
    '''
    folded = {}
    for vector in vectors:
        for term, stats in vector.items():
            entry = folded.get(term)
            if entry is None:
                # First sighting: index-wide totals are copied once and never
                # revised by later vectors.
                folded[term] = {
                    'term_total': stats['ttf'],
                    'doc_total': stats['doc_freq'],
                    'term_freq': stats['term_freq'],
                    'doc_freq': 1,
                }
                continue
            entry['term_freq'] += stats['term_freq']
            entry['doc_freq'] += 1
    return folded