In [38]:
!pwd

/mnt/synology/retina/projects/bart/website


In [39]:
!ls

Parse+diag.bib.ipynb  Scratch_Parse diag.bib.ipynb  diag.bib
Rescale Photo.ipynb   bibtex_parser.ipynb


In [112]:
!sudo pip install latexcodec

[sudo] password for user: 


In [2]:
import latexcodec
with open('diag.bib', 'rb') as f:
    diag = f.read().decode('utf-8-sig')
diag_lines = diag.split('\r\n\r\n')

## Helper functions

In [3]:
def get_blocks(content, start_character, delim=('{','}')):
    '''
    yields all top-level blocks (entries enclosed by the specified delimiters)

    content: the string to scan
    start_character: string searched for backwards from the opening delimiter;
        the text from its last occurrence up to (not including) the opening
        delimiter is the first element of each result. An empty string, or a
        string that does not occur before the block, gives an empty prefix.
    delim: tuple (opening delimiter, closing delimiter)

    the result will be a tuple of two strings: from start character to start of
    the block, and the block content (outermost delimiters removed, nested
    delimiters kept). Closing delimiters without a matching opener are ignored
    and blocks that are never closed are not yielded.
    '''
    delim_start, delim_end = delim
    stack = []  # positions of the currently open (unclosed) opening delimiters
    for i, c in enumerate(content):
        if c == delim_start:
            stack.append(i)
        elif c == delim_end and stack:
            start = stack.pop()
            if not stack:
                # only a block that closes at nesting depth 0 is reported
                start_index = content.rfind(start_character, 0, start)
                if start_index < 0:
                    # start_character not found: report an empty prefix explicitly
                    start_index = start
                yield content[start_index:start], content[start + 1:i]

assert [x for x in get_blocks('abc = {test}, bac = {test2}', 'a')] == [('abc = ', 'test'), ('ac = ', 'test2')]

In [4]:
def tokenize(string, delim=('{', '}'), trim=False):
    '''
    Generator producing tokens rather than single characters.

    Characters outside any delimited group are yielded one by one; a complete
    top-level delimited group is yielded as a single token.

    string: the text to tokenize
    delim: tuple (opening delimiter, closing delimiter)
    trim: when True the first and last character of string are dropped before
          scanning and group tokens are yielded without their delimiters;
          when False group tokens include their delimiters
    '''
    opener, closer = delim
    body = string[1:-1] if trim else string
    open_positions = []
    for pos, char in enumerate(body):
        if char == opener:
            # entering a (possibly nested) group: nothing is emitted yet
            open_positions.append(pos)
            continue
        if char == closer and open_positions:
            begin = open_positions.pop()
            if not open_positions:
                # the outermost group just closed: emit it as one token
                yield body[begin + 1:pos] if trim else body[begin:pos + 1]
            continue
        if not open_positions:
            # plain character outside of every group (includes unmatched closers)
            yield char
            
assert [x for x in tokenize('a{abc}c', delim=('{', '}'))] == ['a', '{abc}', 'c']
assert [x for x in tokenize('{a{abc}c{def}gh{i}}', delim=('{', '}'), trim=True)] == ['a', 'abc', 'c', 'def', 'g', 'h', 'i']

In [5]:
def list_split(lst, token):
    '''
    Generator that cuts a sequence at every element equal to token.

    Each yielded piece is a slice of lst between two separators (the
    separators themselves are dropped); consecutive separators give empty
    pieces, and the remainder after the last separator is always yielded.
    '''
    piece_start = 0
    for position, element in enumerate(lst):
        if element == token:
            yield lst[piece_start:position]
            piece_start = position + 1
    yield lst[piece_start:]

assert [x for x in list_split([1, 2, 3, 4], 2)]  == [[1], [3, 4]]

In [6]:
def token_split(token_list, pattern=' and '):
    '''
    splits a tokenlist on a pattern
    yields individual parts of the tokenlist, split by the pattern

    token_list: indexable, sliceable sequence of tokens
    pattern: sequence that must match consecutive tokens element by element
             (a multi-character token never equals a single pattern element,
             so text kept together as one token is not split)

    Matches are taken left to right and do not overlap. Every candidate start
    position is tested, so an occurrence that follows a partial match (e.g.
    the second space in 'x  and y') is still found. An empty pattern yields
    the whole token_list unchanged.
    '''
    width = len(pattern)
    if width == 0:
        yield token_list
        return
    total = len(token_list)
    part_start = 0  # index of the first token of the part being collected
    i = 0
    while i <= total - width:
        if all(token_list[i + j] == pattern[j] for j in range(width)):
            yield token_list[part_start:i]
            i += width  # continue after the separator; matches do not overlap
            part_start = i
        else:
            i += 1
    yield token_list[part_start:]
    
assert [x for x in token_split([1, 2, 1, 'test', 2, 'test'], pattern = [1, 'test'])] == [[1, 2], [2, 'test']]

In [7]:
def rindex(lst, token):
    '''
    returns the last index of token in lst

    lst: an indexable sequence (list, str, ...)
    raises ValueError when token does not occur, mirroring list.index; letting
    StopIteration escape instead would turn into a RuntimeError when this is
    called from inside a generator.
    '''
    for i in range(len(lst) - 1, -1, -1):
        if lst[i] == token:
            return i
    raise ValueError('{!r} is not in sequence'.format(token))

assert rindex('abc abc', 'a') == 4

## Parse author names 

In [8]:
def parse_name(name, omit=('{', '}')):
    '''
    Split one author name into its components.

    The name is expected in one of the layouts described at
    https://tex.stackexchange.com/questions/557/how-should-i-type-author-names-in-a-bib-file
    chosen by the number of commas:
      "First von Last", "von Last, First" or "von Last, Jr, First".
    name may be a string or a list of tokens; characters listed in 'omit' are
    removed from every component and surrounding whitespace is stripped.

    returns a tuple (first, von, last, jr)
    '''
    def strip_omitted(name_part):
        # flatten tokens into letters, dropping the unwanted characters
        kept = []
        for token in name_part:
            for letter in token:
                if letter not in omit:
                    kept.append(letter)
        return ''.join(kept).strip()

    def split_von_last(von_last):
        # everything before the last space is 'von', the remainder is 'last'
        cut = rindex(von_last, ' ') if ' ' in von_last else 0
        return von_last[:cut], von_last[cut:]

    pieces = list(list_split(name, ','))
    piece_count = len(pieces)
    jr = ''
    if piece_count == 1:  # "First von Last"
        if ' ' in name:
            first_gap, last_gap = name.index(' '), rindex(name, ' ')
        else:
            first_gap = last_gap = 0
        first = name[:first_gap]
        von = name[first_gap:last_gap]
        last = name[last_gap:]
    elif piece_count == 2:  # "von Last, First"
        von, last = split_von_last(pieces[0])
        first = pieces[1]
    elif piece_count == 3:  # "von Last, Jr, First"
        von, last = split_von_last(pieces[0])
        jr = pieces[1]
        first = pieces[2]
    else:
        print('warning! bibtex format error in name "{}"'.format(''.join(name)))
        first, von, last = '', '', name

    return tuple(strip_omitted(part) for part in (first, von, last, jr))

assert parse_name('Bart Liefers') == ('Bart', '', 'Liefers', '')
assert parse_name('Bart von Liefers') == ('Bart', 'von', 'Liefers', '')
assert parse_name('Liefers, Bart') == ('Bart', '', 'Liefers', '')
assert parse_name('von Liefers, Bart') == ('Bart', 'von', 'Liefers', '')
assert parse_name('von Liefers, Jr, Bart') == ('Bart', 'von', 'Liefers', 'Jr')

In [9]:
def parse_authors(author_line):
    '''
    returns a list of author tuples (first, von, last, jr)

    author_line: the raw bibtex author value, normally enclosed in braces.
    '~' (non-breaking space in LaTeX) is treated as a regular space. Authors
    are separated by ' and '; an ' and ' inside braces does not separate.
    '''
    author_line = author_line.replace('~', ' ')
    # remove enclosing braces
    try:
        cleaned_line = next(get_blocks(author_line, ''))[1]
    except StopIteration:
        # get_blocks found no brace-delimited block; only this expected case is
        # handled so that unrelated errors (or Ctrl-C) are no longer swallowed
        print('error in author line: {}'.format(author_line))
        cleaned_line = author_line[1:-1]

    # split on ' and ' tokens (but not between brackets!) to split out the separate authors
    authors = token_split(list(tokenize(cleaned_line)), pattern=' and ')
    return [parse_name(author) for author in authors]

assert parse_authors('{a and bandc and d { and } e}') == [('', '', 'a', ''), ('', '', 'bandc', ''), ('d', 'and', 'e', '')]

## Parse entries

In [10]:
def get_entry_content(content):
    '''
    returns a dict mapping the field names of an entry to their raw values

    content: the body of one bibtex entry, one "name = value" field per line.
    Lines without '=' are skipped. A value starting with '{' is cut off after
    its last '}' on that line (dropping a trailing comma); other values only
    lose a trailing comma. Both '\\r\\n' and '\\n' line endings are accepted.
    '''
    def get_key_value(lines):
        for line in lines:
            if '=' not in line:
                continue
            i = line.index('=')
            key = line[:i].strip()
            value = line[i + 1:].strip()
            if value.startswith('{'):
                closing = value.rfind('}')
                # no closing brace on this line (e.g. a value continued on the
                # next line): keep what is there instead of raising
                if closing >= 0:
                    value = value[:closing + 1]
            elif value.endswith(','):
                value = value[:-1]
            yield key, value
    # normalise Windows line endings first so both conventions split the same way
    lines = content.replace('\r\n', '\n').split('\n')
    return dict(get_key_value(lines))

def get_entry(entry):
    '''
    Split one entry into its bibkey and its fields.

    entry: entry text starting with the bibkey, followed by a comma and the
           fields; str.index raises ValueError when there is no comma.
    returns a tuple (bibkey, dict of the key-values in this entry)
    '''
    comma_position = entry.index(',')
    return entry[:comma_position], get_entry_content(entry[comma_position:])

## Indexing 

In [11]:
from collections import defaultdict

In [12]:
def decode_latex(input_string, print_error_key=False):
    '''
    Convert latex character sequences in input_string to unicode using the
    'latex' codec.

    print_error_key: when truthy, a decoding failure is reported on stdout
                     together with this key and the exception.
    Returns the decoded string, or input_string unchanged when anything goes
    wrong while encoding/decoding.
    '''
    try:
        raw_bytes = input_string.encode('utf-8')
        decoded = raw_bytes.decode('latex')
    except Exception as error:
        if print_error_key:
            print('{} : warning: encoding error!!!'.format(print_error_key))
            print(error)
        return input_string
    return decoded

In [13]:
def authors_to_string(authors):
    '''
    Format a list of (first, von, last, jr) tuples as one display string.

    First names are abbreviated to initials unless they already look like
    initials. Names are joined as "A, B and C"; a single author is returned
    on its own and an empty list gives an empty string.
    '''
    def parse_author(author):
        first, von, last, jr = author

        if len(first) >= 2 and first[1] == '.':
            # first probably contains initials
            initials = first
        else:
            initials = '.'.join(x[0] for x in first.split(' ') if x)
            if initials:
                initials += '.'

        result = initials
        if von != '':
            if len(von) >= 2 and von[1] == '.':
                # von probably contains some initials
                result += von
            else:
                result += ' ' + von
        result += ' ' + last
        if jr != '':
            result += ' ' + jr
        # without initials the pieces above would start with a blank
        return result.strip()

    names = [parse_author(a) for a in authors]
    if not names:
        return ''
    if len(names) == 1:
        # a lone author must not be prefixed with ' and '
        return names[0]
    return ', '.join(names[:-1]) + ' and ' + names[-1]

def clean_bib_string(bib_string):
    '''
    Remove the enclosing braces and all grouping braces from a bibtex value.

    A warning is printed when bib_string is not enclosed in '{' ... '}'; the
    first and last character are dropped regardless (tokenize with trim=True).
    '''
    # `not a and b` would parse as `(not a) and b`, so the check is spelled out
    if not (bib_string.startswith('{') and bib_string.endswith('}')):
        print('unexpected content:', bib_string)
    return ''.join(tokenize(bib_string, trim=True))

In [14]:
class BibItem:
    '''
    One bibliography entry with formatted, memoized attribute access.

    key: the bibkey, entry: dict of raw field values, entry_type: e.g.
    '@Article'. Reading a field as an attribute (item.title, item.year, ...)
    returns its formatted value, or '' when the entry has no such field.
    Formatted values are cached in self.values; after changing self.entry the
    matching cache item has to be removed by the caller.
    '''

    # templates that turn an identifier into a clickable address
    url_syntax = {
        'doi': 'http://dx.doi.org/{}',
        'pmid': 'http://www.ncbi.nlm.nih.gov/pubmed/{}',
        'url': '{}'
    }

    def __init__(self, key, entry, entry_type):
        self.key = key
        self.entry = entry
        self.entry_type = entry_type
        self.values = {}

    def __getattr__(self, key):
        # __getattr__ only runs when normal lookup failed. Private/dunder names
        # (probed by copy and pickle) and the instance attributes themselves
        # must raise: answering '' breaks those protocols, and touching
        # self.values on an uninitialised instance would recurse forever.
        if key.startswith('_') or key in ('key', 'entry', 'entry_type', 'values'):
            raise AttributeError(key)

        if key in self.values:
            # memoization
            return self.values[key]

        if key == 'author':
            result = self._get_authors()
        elif key in ('journal', 'booktitle', 'title', 'series'):
            result = self._get_string_rule_or_decode(key)
        elif key in ('year', 'volume', 'pages', 'number'):
            result = self._get_simple_value(key).replace('--', '-')
        elif key in self.url_syntax:
            result = self._get_url(key)
        elif key in self.entry:
            # any other field present in the entry (school, nationality,
            # optnumber, ...) is returned without its enclosing braces
            result = self._get_simple_value(key)
        else:
            result = ''
        self.values[key] = result
        return result

    def _get_authors(self):
        '''formatted author string, '' when the entry has no author field'''
        if 'author' not in self.entry:
            return ''
        authors = parse_authors(decode_latex(self.entry['author']))
        return authors_to_string(authors)

    def _get_string_rule_or_decode(self, item):
        '''braced values are decoded and cleaned; bare values (string rule names) are returned as is'''
        if item not in self.entry:
            return ''
        result = self.entry[item]
        if result.startswith('{'):
            return clean_bib_string(decode_latex(result))
        return result

    def _get_simple_value(self, key):
        '''raw value of a field without its enclosing braces, '' when missing'''
        if key not in self.entry:
            return ''
        value = self.entry[key]
        if value.startswith('{'):
            return value[1:-1]
        return value

    def _get_url(self, key):
        '''address built from url_syntax for a doi/pmid/url field, '' when missing'''
        url = self._get_simple_value(key)
        if url:
            return self.url_syntax[key].format(url)
        return ''

# Formatting 

In [15]:
def converter(value_convert, bib_item):
    '''
    Render the fields of bib_item through their format templates.

    value_convert: dict mapping a field name to a template with one '{}'
    returns a dict with the same keys; a field that is absent from
    bib_item.entry maps to '', otherwise to the template filled with the
    attribute value of bib_item passed through get_from_string_rule.
    '''
    rendered = {}
    for field, template in value_convert.items():
        if field in bib_item.entry:
            rendered[field] = template.format(get_from_string_rule(getattr(bib_item, field)))
        else:
            rendered[field] = ''
    return rendered

def HTML_proceedings_formatter(bib_item):
    '''
    HTML fragment describing where a proceedings paper appeared: book title,
    volume, series, year, number and pages, each only when present.
    '''
    templates = {
        'booktitle': 'in: <i>{}</i>',
        'volume': ', volume {}',
        'series': ' of {}',
    }
    # the remaining fields are all rendered as a plain comma-separated value
    templates.update(dict.fromkeys(('year', 'number', 'pages'), ', {}'))
    rendered = converter(templates, bib_item)
    field_order = ('booktitle', 'volume', 'series', 'year', 'number', 'pages')
    return ''.join(rendered[field] for field in field_order)

def HTML_abstract_formatter(bib_item):
    '''
    HTML fragment for a conference abstract: book title followed by the year,
    each only when present.
    '''
    templates = {'booktitle': 'in: {}', 'year': ', {}'}  # italics for booktitle?
    rendered = converter(templates, bib_item)
    return rendered['booktitle'] + rendered['year']

def HTML_article_formatter(bib_item):
    '''
    HTML fragment for a journal article: ", <i>Journal</i> year;volume(number):pages".

    A journal value starting with 'arXiv' is rendered as a link to the arXiv
    abstract page instead of an italic journal name. Parts that are missing
    from the entry are left out.
    '''
    value_convert = {
        'year': ' {}',
        'volume': ';{}',
        'number': '({})',
        'pages': ':{}'
    }
    values = converter(value_convert, bib_item)
    journal = get_from_string_rule(bib_item.journal)
    if not journal:
        # no journal at all: do not emit an empty ", <i></i>"
        values['journal'] = ''
    elif journal.startswith('arXiv'):
        # the identifier is what follows the 'arXiv' prefix and its separator
        # (journal[:6] would be the prefix itself, not the identifier)
        arxiv_id = journal[len('arXiv'):].lstrip(': ')
        values['journal'] = ', <a href="https://arxiv.org/abs/{}">{}</a>'.format(arxiv_id, journal)
    else:
        values['journal'] = ', <i>{}</i>'.format(journal)
    return '{journal}{year}{volume}{number}{pages}'.format(**values)

def get_HTML_Thesis_formatter(name):
    '''
    Build a formatter for thesis entries.

    name: the label shown in italics (e.g. 'PhD thesis')
    returns a function taking a bib item and returning the label followed by
    the school and the year, each only when present.
    '''
    templates = {'school': ', {}', 'year': ', {}'}

    def func(bib_item):
        rendered = converter(templates, bib_item)
        return '<i>{}</i>{}{}'.format(name, rendered['school'], rendered['year'])

    return func

def HTML_Patent_formatter(bib_item):
    '''
    HTML fragment for a patent: year, nationality and patent number, each only
    when present.
    '''
    templates = {
        'year': '{}',
        'nationality': ', {}',
        'optnumber': ', patent number {}',
    }
    rendered = converter(templates, bib_item)
    return ''.join(rendered[field] for field in ('year', 'nationality', 'optnumber'))

def HTML_formatter(bib_item):
    '''
    Format the publication details of bib_item as an HTML fragment.

    The formatter is chosen by bib_item.entry_type. Entry types are compared
    case-insensitively so that spelling variants such as '@MastersThesis' and
    '@Mastersthesis' select the same formatter.
    returns the HTML string, or None for an entry type without a formatter.
    '''
    type_formatters = {
        '@inproceedings': HTML_proceedings_formatter,
        '@conference': HTML_abstract_formatter,
        '@article': HTML_article_formatter,
        '@phdthesis': get_HTML_Thesis_formatter('PhD thesis'),
        '@mastersthesis': get_HTML_Thesis_formatter('Mastersthesis'),
        '@patent': HTML_Patent_formatter
    }
    formatter = type_formatters.get(bib_item.entry_type.lower())
    if formatter is None:
        return None
    return formatter(bib_item)
 
  

# Make index 

In [63]:
from collections import Counter

In [64]:
Counter(v.entry_type for v in global_index.values())

Counter({'@Article': 2903,
         '@Book': 20,
         '@Conference': 212,
         '@Electronic': 1,
         '@InBook': 2,
         '@InCollection': 2,
         '@InProceedings': 346,
         '@MastersThesis': 4,
         '@Misc': 3,
         '@Patent': 3,
         '@PhdThesis': 170})

In [16]:
# index: entry type -> {bibkey: BibItem}; global_index: bibkey -> BibItem;
# string_rules: @String name -> replacement text
index = defaultdict(dict)
global_index = {}
string_rules = {}
for type_name, entry in get_blocks(diag, start_character='@'):
    # the two special block types are matched case-insensitively
    block_kind = type_name.strip().lower()
    if block_kind == '@comment':
        continue
    if block_kind == '@string':
        # split on the first '=' only, so a replacement text may contain '='
        rule_name, separator, rule_value = entry.partition('=')
        if not separator:
            print('skipping malformed @String rule: {}'.format(entry))
            continue
        string_rules[rule_name.strip()] = rule_value.strip()
        continue
    key, entry_dict = get_entry(entry)
    bib_item = BibItem(key, entry_dict, type_name)
    global_index[key] = bib_item
    index[type_name][key] = bib_item

In [17]:
def find_key(text):
    '''
    returns the bibkey of the first entry in global_index whose cleaned title
    contains text, or None when no title matches

    Entries without a title field are skipped instead of raising KeyError.
    '''
    for k, bib_item in global_index.items():
        raw_title = bib_item.entry.get('title')
        if raw_title is None:
            continue
        if text in clean_bib_string(raw_title):
            return k
    return None

In [18]:
find_key('Non-solid and Part-solid Nodules: Compa'), find_key('Automatic Cerebrospinal Fluid Segmentation')

('Silv17a', 'Pate17a')

In [19]:
global_index['Pate17a'].entry_type

'@InProceedings'

In [58]:
sorted(global_index.items())

[('04', <__main__.BibItem at 0x7f5a67730d30>),
 ('04a', <__main__.BibItem at 0x7f5a67730f60>),
 ('Aald17', <__main__.BibItem at 0x7f5a676d77f0>),
 ('Aald17a', <__main__.BibItem at 0x7f5a676dcd68>),
 ('Aalt11', <__main__.BibItem at 0x7f5a7c05a668>),
 ('Aarn11', <__main__.BibItem at 0x7f5a7c05cb70>),
 ('Aarn12', <__main__.BibItem at 0x7f5a7c060898>),
 ('Aarn12a', <__main__.BibItem at 0x7f5a7c05c358>),
 ('Aarn12b', <__main__.BibItem at 0x7f5a7c05cfd0>),
 ('Aarn13', <__main__.BibItem at 0x7f5a7c05c780>),
 ('Aarn13a', <__main__.BibItem at 0x7f5a7c05aac8>),
 ('Aarn13b', <__main__.BibItem at 0x7f5a7c060470>),
 ('Aarn16', <__main__.BibItem at 0x7f5a7c0b5e48>),
 ('Aars89', <__main__.BibItem at 0x7f5a7c060c50>),
 ('Aart06', <__main__.BibItem at 0x7f5a7c066c88>),
 ('Aart06a', <__main__.BibItem at 0x7f5a7c063780>),
 ('Aart07', <__main__.BibItem at 0x7f5a7c0657b8>),
 ('Aart07a', <__main__.BibItem at 0x7f5a7c066470>),
 ('Aart08', <__main__.BibItem at 0x7f5a7c063b70>),
 ('Aart08a', <__main__.BibItem 

In [59]:
entry = global_index['04']

In [61]:
entry.entry


{'abstract': '{Gene therapy delivers a functional gene into a target cell to restore the physiological levels of the deficient gene (1, 2). This therapeutic methodology has been used in the clinic to treat diseases and cancers induced by genetic disorders (3). For example, a cytotoxic gene can destroy a specific type of cancer cell without excessive damage to normal tissue (4). Because viruses are inherently capable of packaging nucleic acids and delivering them to susceptible cells with high efficiency, they become a preferred system for natural gene therapy delivery (5). Viral vectors can be engineered with a gene expression cassette or converted to a targeted vector. Both methods allow delivery of the gene of interest (transgene) to the desired cells or tissues (3). As a gene therapy vector, the viruses need to meet three criteria: safety, high transfer efficiency, and reliable transgene expression. Assessment of the development of viral vectors in vivo becomes an important step in 

In [60]:
entry.entry_type

'@Article'

In [71]:
entry.entry

{'abstract': '{Gene therapy delivers a functional gene into a target cell to restore the physiological levels of the deficient gene (1, 2). This therapeutic methodology has been used in the clinic to treat diseases and cancers induced by genetic disorders (3). For example, a cytotoxic gene can destroy a specific type of cancer cell without excessive damage to normal tissue (4). Because viruses are inherently capable of packaging nucleic acids and delivering them to susceptible cells with high efficiency, they become a preferred system for natural gene therapy delivery (5). Viral vectors can be engineered with a gene expression cassette or converted to a targeted vector. Both methods allow delivery of the gene of interest (transgene) to the desired cells or tissues (3). As a gene therapy vector, the viruses need to meet three criteria: safety, high transfer efficiency, and reliable transgene expression. Assessment of the development of viral vectors in vivo becomes an important step in 

# Checks

In [105]:

def check(bib_item):
    '''
    Validate a bib item; raises Exception('wrong title') when the raw title
    ends with a period just before the closing brace. Returns None otherwise.
    '''
    fields = bib_item.entry
    if 'title' not in fields:
        return
    if fields['title'].endswith('.}'):
        raise Exception('wrong title')

In [110]:
# interactively offer to repair every entry that fails check()
for bib_key, bib_item in global_index.items():
    try:
        check(bib_item)
    except Exception as e:
        print(e, bib_key)
        print(bib_item.title)
        fix = input("Fix? ")
        if fix in ('y', 'yes', 'j', 'ja'):
            # drop the period in front of the closing brace
            bib_item.entry['title'] = bib_item.entry['title'][:-2] + '}'
            # printing bib_item.title above memoized the old title in
            # bib_item.values; forget it so the repaired title is used
            bib_item.values.pop('title', None)


wrong title Stul11
Dot1 binding induces chromatin rearrangements by histone methylation-dependent and -independent mechanisms.
Fix? y
wrong title Sans97
Dehydration in the elderly: strategies for prevention and management.


KeyboardInterrupt: 

In [111]:
global_index['Stul11'].entry

{'abstract': '{Methylation of histone H3 lysine 79 (H3K79) by Dot1 is highly conserved among species and has been associated with both gene repression and activation. To eliminate indirect effects and examine the direct consequences of Dot1 binding and H3K79 methylation, we investigated the effects of targeting Dot1 to different positions in the yeast genome. Targeting Dot1 did not activate transcription at a euchromatic locus. However, chromatin-bound Dot1 derepressed heterochromatin-mediated gene silencing over a considerable distance. Unexpectedly, Dot1-mediated derepression was established by both a H3K79 methylation-dependent and a methylation-independent mechanism; the latter required the histone acetyltransferase Gcn5. By monitoring the localization of a fluorescently tagged telomere in living cells, we found that the targeting of Dot1, but not its methylation activity, led to the release of a telomere from the repressive environment at the nuclear periphery. This probably contr

In [None]:
def checks(global_index):
    '''
    Report the entries of global_index that lack fields listed in the
    module-level include_keys for their entry type.

    Entry types not listed in include_keys are skipped silently. Prints one
    line per incomplete entry and returns None.
    NOTE(review): the original body had no loop (`continue` outside a loop is a
    syntax error) and printed 'missing fields' in the type-skip branch; the
    message is assumed to belong to the incomplete-information check.
    '''
    for bib_key, bib_item in global_index.items():
        # skip some types
        if bib_item.entry_type not in include_keys:
            continue

        # report and skip if incomplete information
        missing = [k for k in include_keys[bib_item.entry_type] if k not in bib_item.entry]
        if missing:
            print('missing fields', bib_key, missing)
            continue

# Write

In [92]:
include_keys = {
    '@Article': ('author', 'title', 'journal', 'year', 'url'),
    '@InProceedings': ('author', 'booktitle', 'journal', 'year')
}

def fwrite_line(f, line=''):
    '''
    Write line followed by a newline to the binary file object f, encoded as
    UTF-8. Without a line argument an empty line is written.
    '''
    f.write((line + '\n').encode('utf-8'))
    
def write_bib(filename, string_rules, global_index):
    '''
    Write a reduced bib file to filename (UTF-8).

    All string_rules are written as @String lines, followed by the entries of
    global_index sorted by bibkey. Only entry types listed in the module-level
    include_keys are written, and of each entry only the fields listed there
    that are present. Each entry is followed by an empty line.
    '''
    with open(filename, 'wb') as f:
        fwrite_line(f, '% Encoding: UTF-8\n\n')
        for k, v in string_rules.items():
            fwrite_line(f, "@String{" + k + " = " + v + '}')

        for k, bib_item in sorted(global_index.items()):
            if bib_item.entry_type not in include_keys:
                continue

            fwrite_line(f, bib_item.entry_type + '{' + k + ',')

            for keyword in include_keys[bib_item.entry_type]:
                if keyword not in bib_item.entry:
                    continue
                content = bib_item.entry[keyword]
                fwrite_line(f, '  ' + keyword + ' = ' + content + ',')
            fwrite_line(f, '}')
            # empty separator line; the text is passed explicitly because
            # fwrite_line takes the line as a positional argument
            fwrite_line(f, '')

In [93]:
write_bib('diag_bib_out.bib', string_rules, global_index)

In [48]:
!less diag.bib

% Encoding: UTF-8

@String{AA                = _Age_and_Ageing_}
@String{AAC               = _Antimicrobial_Agents_and_Chemotherapy_}
@String{AACC              = _AACN_Advanced_Critical_Care_}
@String{AAPM              = _American_Association_of_Physicists_in_Medicine_}
@String{ABDI              = _Abdominal_Imaging_}
@String{ACAEMEMED         = _Academic_Emergency_Medicine_}
@String{ACHA              = _Applied_and_Computational_Harmonic_Analysis_}
@String{ACMCS             = _ACM_Computing_Surveys_}
@String{ACMTG             = _ACM_Transactions_on_Graphics_}
@String{ACMTIS            = _ACM_Transactions_on_Information_Systems_}
@String{ACMTMS            = _ACM_Transactions_on_Mathematical_Software_}
@String{ACR               = _Anticancer_Research_}
@String{ACSCHENEU         = _ACS_Chemical_Neuroscience_}
@String{ACTAB             = _Acta_Anaesthesiologica_Belgica_}
@String{ACTANASCA         = _Acta_Anaesthesiologica_Scandinavica_}
@String{ACTBIOMAT         = _Acta_Biomaterialia_}
@S

In [36]:
global_index['Oei16a'].entry

{'author': '{Marcel T. H. Oei and Frederick J.A. Meijer and Willem-Jan {van der Woude} and Ewoud J. Smit and Bram {van Ginneken} and Rashindra Manniesing and Mathias Prokop}',
 'doi': '{10.1007/s00330-016-4592-z}',
 'journal': 'ER',
 'number': '{6}',
 'pages': '{2411-2418}',
 'title': '{Interleaving Cerebral {CT} Perfusion with Neck {CT} Angiography. {Part II}: Clinical Implementation and Image Quality}',
 'volume': '{27}',
 'year': '{2017}'}

In [37]:
global_index.keys()

dict_keys(['Mert14', 'Ritt90', 'Kan05', 'Rene03', 'Mare14', 'Peet09', 'Zijl94', 'Schr13a', 'Kort94a', 'Reek02', 'Nune12', 'Band17', 'Kobu14a', 'Stul11', 'Schw15', 'Thij14', 'Ciet14', 'Klei15a', 'Terr15', 'Stee94a', 'Sans97', 'Lave15', 'Sche08', 'Hesk14', 'Seti15a', 'Witt12b', 'Aarn13a', 'Bree00', 'Vuka09a', 'Vuka10', 'Bart15', 'Stou01', 'Galp11', 'Voog11', 'Rudy14', 'Hoog16', 'Voge07a', 'Naga16', 'Kall16a', 'Berk94a', 'Scia11', 'Bejn15', 'Hoek13b', 'Wier07', 'Terr16', 'Pete13b', 'Scho02a', 'Hoog15', 'Wand15b', 'Mord16b', 'LinE05', 'Schi97', 'Adri11c', 'Stil14', 'Stee98a', 'Arzh09b', 'Gatt14', 'Hesk12a', 'Mord17', 'Oost13', 'Hesk13a', 'Tall01', 'Veld10a', 'Puig00', 'Vos08b', 'Mets12', 'Rikx11a', 'Zand01', 'Tann11a', 'Dekk12', 'Brom13', 'Kok07', 'Brom10', 'Dale04', 'Rikx07a', 'Weij10', 'Geor16', 'Arnt16', 'Kars89', 'Kars06', 'Kuip08', 'Boog13', 'Mus17', 'Wate12', 'Scha05a', 'Obde14', 'Thij05', 'Kamm01', 'Surc14a', 'Srin10b', 'Aart09', 'Hans10a', 'Scha16', 'Nune13', 'Koui16a', 'Reic90a', 

In [21]:
def get_from_string_rule(value):
    '''
    Expand value through the module-level string_rules.

    When value is the name of a string rule, the rule text is returned with
    underscores replaced by spaces and surrounding whitespace stripped;
    any other value is returned unchanged.
    '''
    if value not in string_rules:
        return value
    expanded = string_rules[value]
    return expanded.replace('_', ' ').strip()

def print_item(key):
    '''
    Print a summary of the entry stored under key in global_index: the key,
    author, title, formatted publication details, doi and pmid, followed by an
    empty line.
    '''
    print(key)
    item = global_index[key]
    for attribute in ('author', 'title'):
        print(getattr(item, attribute))
    print(HTML_formatter(item))
    for attribute in ('doi', 'pmid'):
        print(getattr(item, attribute))

    print()
    
for key in ('Hump17', 'Riel17', 'Oei16a', 'Ghaf17b', 'Mele17'):
    print_item(key)


Hump17
G.E. Humpire Mamani, A. Arinda Adiyoso Setio, B. van Ginneken and C. Jacobs
Organ detection in thorax abdomen CT using multi-label convolutional neural networks
in: <i>Medical Imaging</i>, volume 10134 of Proceedings of the SPIE, 2017
http://dx.doi.org/10.1117/12.2254349


Riel17
S.J. van Riel, F. Ciompi, C. Jacobs, M.M. Winkler Wille, E.T. Scholten, M. Naqibullah, S. Lam, M. Prokop, C. Schaefer-Prokop and B. van Ginneken
Malignancy risk estimation of screen-detected nodules at baseline CT: comparison of the PanCan model, Lung-RADS and NCCN guidelines
, <i>European Radiology</i> 2017
http://dx.doi.org/10.1007/s00330-017-4767-2
http://www.ncbi.nlm.nih.gov/pubmed/28293773

Oei16a
M.T. H. Oei, F.J.A. Meijer, W. van der Woude, E.J. Smit, B. van Ginneken, R. Manniesing and M. Prokop
Interleaving Cerebral CT Perfusion with Neck CT Angiography. Part II: Clinical Implementation and Image Quality
, <i>European Radiology</i> 2017;27(6):2411-2418
http://dx.doi.org/10.1007/s00330-016-4592-z

## Index authors

In [22]:
# author_index: last name -> set of bibkeys of the entries that author appears in
author_index = defaultdict(set)
for bib_key, bib_item in global_index.items():
    raw_authors = bib_item.entry.get('author')
    if raw_authors is None:
        # entries without an author field are reported, as before
        print('--------------')
        print(bib_key)
        print('--------------')
        continue
    # BibItem has no `authors` attribute (its __getattr__ answers '' for
    # unknown names, which left the index empty); parse the raw field instead
    for first, von, last, jr in parse_authors(decode_latex(raw_authors)):
        author_index[last].add(bib_key)

In [23]:
parse_authors(global_index['Hoss16']['author'])

TypeError: 'BibItem' object is not subscriptable

In [233]:
global_index['Amin11']['author']

'{S. Amin and J.G. Goldin and M.R. Zeidler and E. Kleerup and P. Lu and M. Galperin-Aizenberg and E. M. van Rikxoort and D. Gjertson and D. Ross}'

In [201]:
for type_name, entries in index.items():
    print(type_name)
    print(entries[0])

@InProceedings
Abas05a,
  author    = {D. Ab\'asolo and C. G\'omez and J. Poza and M. Garc\'ia and C. I. S\'anchez and M. L\'opez},
  title     = {{EEG} background activity analysis in {A}lzheimer's disease patients with sample entropy},
  booktitle = {International Conference on Computational Bioengineering},
  year      = {2005},
  pages     = {1067--1076},
  optnote   = {DIAG, RADIOLOGY},
  owner     = {clarisa},
  timestamp = {2010.06.24},

@Conference
Amin11,
  author    = {S. Amin and J.G. Goldin and M.R. Zeidler and E. Kleerup and P. Lu and M. Galperin-Aizenberg and E. M. van Rikxoort and D. Gjertson and D. Ross},
  title     = {Air trapping on {HRCT} assessed by quantitative image analysis as an early predictor of bronchiolitis obliterans syndrome in lung transplant recipients},
  booktitle = ATS,
  year      = {2011},
  abstract  = {{RATIONALE:} {C}urrent diagnosis of bronchiolitis obliterans syndrome ({BOS}) in lung transplant recipients by spirometry identifies

In [117]:
def decode_latex(input_string, print_error_key=None):
    '''
    Replace latex characters in input_string with unicode via the 'latex' codec.

    print_error_key: when truthy, a failure is reported on stdout together
                     with this key and the exception.
    Returns input_string unchanged when encoding/decoding fails.
    NOTE(review): a function with the same name is defined earlier in this
    notebook; running this cell replaces it.
    '''
    result = input_string
    try:
        result = input_string.encode('utf-8').decode('latex')
    except Exception as problem:
        if print_error_key:
            print('{} : warning: encoding error!!!'.format(print_error_key))
            print(problem)
    return result

In [None]:
def get_entry(entry):
    '''
    Split one entry into its bibkey and its fields, with decoded content.

    The author field is replaced by the parsed list of author tuples, and the
    title and abstract fields are latex-decoded.
    returns a tuple (bibkey, dict of the key-values in this entry)
    NOTE(review): a function with the same name is defined earlier in this
    notebook and returns the raw, unparsed values; running this cell replaces it.
    '''
    comma_position = entry.index(',')
    fields = get_entry_content(entry)
    if 'author' in fields:
        fields['author'] = parse_authors(decode_latex(fields['author']))
    for text_field in ('title', 'abstract'):
        if text_field in fields:
            fields[text_field] = decode_latex(fields[text_field])
    return entry[:comma_position], fields