# Reference Matcher Generation

In [1]:
from __future__ import absolute_import, division, print_function

from collections import defaultdict

from elasticsearch.helpers import scan
from flask import Flask
from six import iteritems

from invenio_search import InvenioSearch, current_search_client as es

from inspire_matcher import InspireMatcher, match
from inspire_utils.record import get_value

In [2]:
# Default matcher configuration: match_reference() passes it to
# inspire_matcher.match() together with a reference.  It holds a single
# algorithm step with six queries against the 'records-hep' index, and only
# 'control_number' is requested back from each hit.
# NOTE(review): presumably the queries are tried in the order listed --
# confirm against the inspire_matcher documentation.
config = dict(
    algorithm=[
        dict(
            queries=[
                # Identifier lookups: one reference field against one index field.
                dict(
                    path='reference.arxiv_eprint',
                    search_path='arxiv_eprints.value.raw',
                    type='exact',
                ),
                dict(
                    path='reference.dois',
                    search_path='dois.value.raw',
                    type='exact',
                ),
                dict(
                    path='reference.isbn',
                    search_path='isbns.value.raw',
                    type='exact',
                ),
                dict(
                    path='reference.report_numbers',
                    search_path='report_numbers.value.fuzzy',
                    type='exact',
                ),
                # Journal lookups: title + volume + article id ...
                dict(
                    paths=[
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.artid',
                    ],
                    search_paths=[
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.artid',
                    ],
                    type='nested',
                ),
                # ... or title + volume + first page.
                dict(
                    paths=[
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.page_start',
                    ],
                    search_paths=[
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.page_start',
                    ],
                    type='nested',
                ),
            ],
        ),
    ],
    doc_type='hep',
    index='records-hep',
    source=[
        'control_number',
    ],
)

In [3]:
# Matcher configuration that match_reference() uses when the reference's
# journal title is 'JCAP' or 'JHEP'.  It mirrors `config` except that both
# nested journal queries additionally include 'publication_info.year'.
# NOTE(review): presumably the year is needed because volume numbers of these
# journals are not unique across years -- confirm with the data owners.
config_for_jcap_and_jhep = dict(
    algorithm=[
        dict(
            queries=[
                # Identifier lookups: one reference field against one index field.
                dict(
                    path='reference.arxiv_eprint',
                    search_path='arxiv_eprints.value.raw',
                    type='exact',
                ),
                dict(
                    path='reference.dois',
                    search_path='dois.value.raw',
                    type='exact',
                ),
                dict(
                    path='reference.isbn',
                    search_path='isbns.value.raw',
                    type='exact',
                ),
                dict(
                    path='reference.report_numbers',
                    search_path='report_numbers.value.fuzzy',
                    type='exact',
                ),
                # Journal lookups: title + volume + year + article id ...
                dict(
                    paths=[
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.year',
                        'reference.publication_info.artid',
                    ],
                    search_paths=[
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.year',
                        'publication_info.artid',
                    ],
                    type='nested',
                ),
                # ... or title + volume + year + first page.
                dict(
                    paths=[
                        'reference.publication_info.journal_title',
                        'reference.publication_info.journal_volume',
                        'reference.publication_info.year',
                        'reference.publication_info.page_start',
                    ],
                    search_paths=[
                        'publication_info.journal_title.raw',
                        'publication_info.journal_volume',
                        'publication_info.year',
                        'publication_info.page_start',
                    ],
                    type='nested',
                ),
            ],
        ),
    ],
    doc_type='hep',
    index='records-hep',
    source=[
        'control_number',
    ],
)

In [4]:
def match_reference(reference):
    """Return the control numbers of the records ``reference`` may point to.

    Every code path returns a list, so callers can safely take ``len()`` of
    the result:

    * If the reference carries truthy ``legacy_curated`` and ``recid``
      values, the existing ``recid`` is trusted and returned as a
      one-element list without querying the matcher.
    * If the journal title is ``'JCAP'`` or ``'JHEP'``, the reference is
      matched with ``config_for_jcap_and_jhep``.
    * Otherwise, or if the JCAP/JHEP attempt raised ``StopIteration``, the
      reference is matched with ``config``.

    Args:
        reference (dict): one element of a record's ``references`` list.
            For JCAP/JHEP references it is modified in place:
            ``reference.publication_info.year`` is converted to ``str``.

    Returns:
        list: the ``_source.control_number`` of every hit yielded by
        ``match``; empty when the matcher raised ``StopIteration``.
    """
    if reference.get('legacy_curated') and reference.get('recid'):
        # Wrapped in a list to keep the same shape as the matcher results
        # below; a bare int cannot be passed to len() by the caller.
        return [reference['recid']]

    journal_title = get_value(reference, 'reference.publication_info.journal_title')
    if journal_title in ['JCAP', 'JHEP']:
        try:
            if get_value(reference, 'reference.publication_info.year'):
                # NOTE(review): presumably the indexed year is compared as a
                # string, hence the cast -- confirm against the ES mapping.
                publication_info = reference['reference']['publication_info']
                publication_info['year'] = str(publication_info['year'])
            return [
                get_value(result, '_source.control_number')
                for result in match(reference, config_for_jcap_and_jhep)
            ]
        except StopIteration:
            # Fall through to the generic configuration.
            pass

    try:
        return [
            get_value(result, '_source.control_number')
            for result in match(reference, config)
        ]
    except StopIteration:
        # Explicit "no match" instead of an implicit None.
        return []
    
def _as_match_list(matches):
    """Normalise a matcher result (None, a single recid or a collection) to a list."""
    if matches is None:
        return []
    if isinstance(matches, (list, tuple, set, frozenset)):
        return list(matches)
    return [matches]


def compare_matches(reference_matches, previous_reference_matches):
    """Pick a single record id for a reference, or 0 when it stays ambiguous.

    Args:
        reference_matches: candidate record ids for the current reference.
            ``None`` is treated as "no candidates" and a single id as a
            one-element list.
        previous_reference_matches: candidate record ids of the reference
            that precedes the current one in the same record, normalised
            the same way.

    Returns:
        * the only candidate, when there is exactly one;
        * otherwise, when the previous reference had exactly one candidate,
          the first current candidate that is also that previous candidate;
        * ``0`` in every other case, which the caller treats as "no match".
    """
    candidates = _as_match_list(reference_matches)
    if len(candidates) == 1:
        # Use the argument itself rather than a same-named global variable.
        return candidates[0]

    previous = _as_match_list(previous_reference_matches)
    if len(previous) == 1:
        # Plain loop instead of indexing filter(), which is an iterator on
        # Python 3 and cannot be subscripted.
        for recid in candidates:
            if recid in previous:
                return recid
    return 0

In [5]:
# Minimal Flask application; the notebook later runs the Elasticsearch scan
# and the matching inside `app.app_context()`.
app = Flask(__name__)
# Register the search extension on the app.
# NOTE(review): presumably this is what makes `current_search_client` usable
# inside the app context -- confirm against the invenio-search documentation.
InvenioSearch(app)
# Register the matcher extension.  As the last expression of the cell, its
# return value (the extension instance) is what the notebook displays.
InspireMatcher(app)

<inspire_matcher.ext.InspireMatcher at 0x7f551e2a4b90>

In [6]:
%%time

# Maps a cited record id to the set of control numbers of the records citing it.
citations = defaultdict(set)

# Fetch only records that have a `references` field, and only the two fields
# that the loop below reads.
references_query = {
    '_source': [
        'control_number',
        'references',
    ],
    'query': {
        'exists': {
            'field': 'references',
        },
    },
}

with app.app_context():
    hits = scan(
        es,
        query=references_query,
        index='records-hep',
        doc_type='hep',
        scroll='2d',
    )

    # One TSV row per reference:
    # citing control number, recid already on the reference (0 if none),
    # recid chosen by compare_matches (0 if none), repr of the reference.
    with open('new-citations.tsv', 'w') as tsv_file:
        for hit in hits:
            source = hit['_source']
            control_number = source['control_number']
            references = source['references']

            # Match every reference of the record first, so that each one can
            # then be compared with the matches of the reference before it.
            record_reference_matches = [
                match_reference(reference) for reference in references
            ]

            previous_matches = []  # the first reference has no predecessor
            for reference, matches in zip(references, record_reference_matches):
                expected = reference.get('recid') or 0
                result = compare_matches(matches, previous_matches)

                tsv_file.write('%d\t%d\t%d\t%r\n' % (control_number, expected, result, reference))

                if result:
                    citations[result].add(control_number)

                previous_matches = matches
print()


CPU times: user 10h 54min 16s, sys: 1h 2min 56s, total: 11h 57min 13s
Wall time: 21h 57min 14s


In [7]:
# Write one "<cited record id>\t<number of distinct citing records>" row per
# cited record, ordered by record id.
with open('new-citation-counts.tsv', 'w') as counts_file:
    counts_file.writelines(
        '%d\t%d\n' % (cited_recid, len(citations[cited_recid]))
        for cited_recid in sorted(citations)
    )