# Search for DOI and DOI references

Load modules and define functions. To load wosplus the `drive.cfg` must be defined first

## Prepare search

In [3]:
%%writefile drive.cfg
[FILES]
faltantes_udea.csv=14UKfewQ_5vitPkKIUZu_EBmRrUpbWvsj8SGVkVrg6TM
udea_dois_api.xlsx=1f-ZWbXwqwb0oXW5bBKqa_GqhzxWWHeXM

Writing drive.cfg


In [1]:
#load modules
import pandas as pd
import unidecode
import requests
import wosplus as wp
import re

# new functions
get_close_matches_Levenshtein=wp.get_close_matches_Levenshtein

In [3]:
def lower_unidecode_keep_alphanumeric__space(s):
    import unidecode
    import re
    return unidecode.unidecode( re.sub( '([^\s\w])+', '',s ) ).lower()

def backwards_comptibility_titles(check_text,check_mesagges_key,similarity,kwargs,item):
    """
    Arguments:
        check_text=None: only value of title='value' implemented
        check_mesagges_key=None: only 'title' key implemented
        similarity=0.6: See backwards_comptibility_titles(...) for details
        kwargs: Cross ref query API options
        item: Cross refs dictionary (json) output
        
    Compare kwargs['title']  and item['title'][0] with a similarity cutoff
    and return boolean
    """
    import sys
    titles_match=False
    if not check_text:
        try:
            check_text=kwargs.get('title').lower()
        except AttributeError:
            check_text=None
        else:
            check_text=str(check_text) # Be sure that is string

    if not check_mesagges_key:
        check_mesagges_key='title' #'container-title'

    if check_mesagges_key=='title':    
        try:
            item_title=item[check_mesagges_key][0]
        except KeyError:
            item_title=None
        if type(item_title)==str:
            item_title=lower_unidecode_keep_alphanumeric__space(item_title)

        if check_text: # Is already an string or None!
            check_text=lower_unidecode_keep_alphanumeric__space(check_text)

            chk=get_close_matches_Levenshtein(check_text,item_title,n=1,cutoff=similarity)
            if chk: # Not empty if cutoff >= similarity
                titles_match=True
    else:
        sys.exit('ERROR {}, not yet implemented',check_mesagges_key)
        
    return titles_match

def get_doi(DOI=None, #order does not matter
            backwards_compatibility=True, #If falste return full dictionary and ignore next options ====
            check_text=None,
            check_mesagges_key=None,
            similarity=0.6,
            JSON=False, # END of backwards_compatibility options ===============================
            **kwargs # CrossRef query API options: https://github.com/CrossRef/rest-api-doc#field-queries
           ):
    '''
    For DOI=None (Default):
        
        Use the API for queries of CrossRef: 
        
        https://github.com/CrossRef/rest-api-doc#field-queries
        
        in **kwargs
        
        Example kwargs:
        title='room at the bottom', author='richard feynman', ...
        
    For DOI:
        
        Search for a DOI and get the full metadata info (including references!). 
        
        Use the API for queries of CrossRef in **kwargs
        
        Example kwargs: title, author,...
           
    BACKWARDS COMPATIBILITY: Returns only matching DOI if titles are not similar! ============
        backwards_compatibility=True, #If falste return full dictionary and ignore next options ====
        check_text=None: only value of title='value' implemented
        check_mesagges_key=None: only 'title' key implemented
        similarity=0.6: See backwards_comptibility_titles(...) for details
        JSON=False: Returns only DOI if titles does not match

        The checking is doing by comparing check_text with the check_mesagges_key from the full info.
        By default the given 'title' is used for the check.
        
        The checking is doing by removing all the non-alphanumeric characters but keeping spaces.
        Also with lower and unidecode (see: lower_unidecode_keep_alphanumeric__space)
        
        See: backwards_comptibility_titles(...) for details
        ==========================================================================
    
    EXAMPLES:
        * get_doi(title='room at the bottom', author='richard feynman')
        * get_doi('10.1103/physrevd.87.095010')          
    '''
    import re
    import requests
    import time
    import random
    #DEBUG
    #print(kwargs)
        
    query='https://api.crossref.org/works'
    if DOI:
        query=query+'/'+DOI
    else:
        query=query+'?'
        for k in kwargs.keys():
            q=kwargs[k]
            if type(q)==str:
                q=re.sub('\s+','+',q)
                query=query+'query.{}={}&'.format(k, q )
                
        query=re.sub('\&$','',query) # drop the last &
        
    #query is either a /DOI or a ?search of **kwargs
    
    #DEBUG
    #print(query)
    
    r=requests.get(query)
    try:
        item=r.json()['message']
        if item.get('items'): # It is a list a items orded by score. Pick the first one:
            #TODO: loop the list
            item=item['items']#[0]
        elif not DOI:
            item=[item]
    except:
        item={}
        
    #BACKWARDS compatibility====================
    if not DOI and backwards_compatibility: # returns only matched DOI if titles are not similar
        for ii in range(len(item)):
            check_titles=backwards_comptibility_titles(check_text,check_mesagges_key,similarity,kwargs,item[ii])

            if check_titles:
                if ii>0:
                    f=open('cr.log','a')
                    f.write('WRONG order at: {}\n'.format(i))
                    f.close()
                item=item[ii]
                break        
                
        #check_titles=backwards_comptibility_titles(check_text,check_mesagges_key,similarity,kwargs,item)
        if not JSON and not check_titles: #if JSON=True force full item output
            try:
                item=item[ii]['DOI']
            except KeyError:
                item=''
    #==============================================  
    
    time.sleep( random.randint(1,3) ) # Avoids robots.txt
    return item

## Run articles
We need a pandas DF of articles with at least the Title and journal information. Currenly only the Title is being used

In [7]:
ua=wp.read_drive_excel('faltantes_udea.csv')

In [8]:
ua.shape

(1584, 12)

Prepare search columns

In [6]:
ua['title']=ua.TITULO.str.lower().map(unidecode.unidecode)
ua['journal']=ua.REVISTA.str.lower().map(unidecode.unidecode)

### Loop

In [9]:
similarity=0.9
#LOG FILE=====
# clean log file
logfile='kkk.txt'
f=open(logfile,'w')
f.write('')
f.close()
#=============
#DEBUG------------
#i=3
#if True:
#----------
for i in ua.index:
    #LOG FILE========
    f=open(logfile,'a')
    f.write('{}\n'.format(i)) # check from the terminal with: tail -f kkk.txt
    f.close()
    #================
    j=get_doi(title=ua.loc[i,'title'],similarity=similarity)
    #Get references
    refs=''
    print(i,end='\r')
    try:
        for refd in j['reference']:
            sep=';'
            refs=refs+refd['DOI']+sep
            
        refs=re.sub('{}$'.format(sep),'',refs) #drop the last sep
    except:
        pass
            

    try: 
        ua.loc[i,'DOI']=j['DOI']
        ua.loc[i,'CR_title']=j['title'][0]
    except:
        ua.loc[i,'DOI']=''
        if not j:
            ua.loc[i,'Failed']='Yes'

    ua.loc[i,'REFS']=refs 
    ( ua.drop(['title','journal'],axis='columns').fillna('') ).to_excel('udea_dois_api_loop.xlsx',index=False)

1583

## Save file with results
And upload to Google Drive (see `drive.cfg`)

In [None]:
ua.drop(['title','journal'],axis='columns').to_excel('udea_dois_api.xlsx',index=False)

## Recover file with results

In [4]:
dfr=wp.read_drive_excel('udea_dois_api.xlsx')
dfr=dfr.fillna('')

In [6]:
dfr[dfr.REFS!='']

Unnamed: 0,IDINST,INST,CVEREVTIT,ANIO,IDREV,ISSN,IDNUM,VOLUMEN,NUMERO,PAIS,TITULO,REVISTA,DOI,REFS,CR_title
0,15344,Universidad de Antioquia,396742056003,2014,3967,1980-5411,42056,24,1,Brasil,Sacrificio cortoplacista adaptativo 2opt (SCA_2opt): Una heurística inspirada en el pensamiento sistémico,Production,10.1590/s0103-65132013005000033,10.1109/5326.725338;,Sacrificio cortoplacista adaptativo 2opt (SCA_2opt): Una heurística inspirada en el pensamiento sistémico
1,15344,Universidad de Antioquia,219130127012,2014,2191,1982-5765,30127,19,1,Brasil,DISTINTAS LECTURAS DE LA PREGUNTA COMO MEDIACIÓN DIDÁCTICA PARA LA TRADUCCIÓN DE SABERES EN LA EDUCACIÓN SUPERIOR O ACERCA DE UN ESTADO EN CUESTIÓN,Avaliação: Revista da Avaliação da Educação Superior,10.1590/s1414-40772014000100012,10.1023/A:1004138810465;,Distintas lecturas de la pregunta como mediación didáctica para la traducción de saberes en la educación superior o acerca de un estado en cuestión


Return back to [Pandas](./Pandas.ipynb#Conclusion)