# Get the article metadata for each article
* Use each article's `doi` with the CrossRef API to retrieve the metadata associated with that article

In [4]:
import pandas as pd
import numpy as np
import os
from os.path import join

import crossref_commons.retrieval as crossref

In [7]:
dataDir = '../data'

In [8]:
articles_df = pd.read_csv(join(dataDir, 'allArticles.csv'))

In [9]:
articles_df.head()

Unnamed: 0,sha,source_x,doi,pubmed_id
0,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,10.1016/0002-9343(73)90176-9,4579077.0
1,212e990b378e8d267042753d5f9d4a64ea5e9869,Elsevier,10.1016/0002-9343(85)90367-5,2861742.0
2,bf5d344243153d58be692ceb26f52c08e2bd2d2f,Elsevier,10.1016/0002-9343(88)90356-7,3052052.0
3,ddd2ecf42ec86ad66072962081e1ce4594431f9c,Elsevier,10.1016/0002-9343(88)90456-1,3048091.0
4,a55cb4e724091ced46b5e55b982a14525eea1c7e,Elsevier,10.1016/0002-9343(92)90608-E,1621745.0


### Test Crossref lookup

In [10]:
test_doi = articles_df.iloc[0]['doi']

In [12]:
crossref.get_publication_as_json(test_doi)

{'DOI': '10.1016/0002-9343(73)90176-9',
 'ISSN': ['0002-9343'],
 'URL': 'http://dx.doi.org/10.1016/0002-9343(73)90176-9',
 'alternative-id': ['0002934373901769'],
 'author': [{'affiliation': [],
   'family': 'Brunner',
   'given': 'Carolyn M.',
   'sequence': 'first'},
  {'affiliation': [],
   'family': 'Horwitz',
   'given': 'David A.',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Shann',
   'given': 'Mary K.',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Sturgill',
   'given': 'Benjamin A.',
   'sequence': 'additional'},
  {'affiliation': [],
   'family': 'Davis',
   'given': 'John S.',
   'sequence': 'additional',
   'suffix': 'IV'}],
 'container-title': ['The American Journal of Medicine'],
 'content-domain': {'crossmark-restriction': False, 'domain': []},
 'created': {'date-parts': [[2004, 4, 20]],
  'date-time': '2004-04-20T04:58:21Z',
  'timestamp': 1082437101000},
 'deposited': {'date-parts': [[2019, 2, 11]],
  'date-time': '2019-02-11T

## Create a function to extract the relevant fields from the CrossRef record for a DOI

In [17]:
def getArticleMeta(doi):
    """Look up a DOI on CrossRef and return a flat dict of selected metadata.

    Parameters
    ----------
    doi : str
        The DOI to look up. Anything that is not a non-empty string (for
        example a NaN read from a CSV) is returned with all-missing fields
        and no lookup is attempted.

    Returns
    -------
    dict
        Always has the keys 'doi', 'title', 'publication', 'year' and 'URL'.
        'doi' echoes the input; each other field is ``np.nan`` when the
        lookup fails or the CrossRef record lacks that field.
    """
    # Start from an all-missing record so every return path has the same keys.
    meta = {
        'doi': doi,
        'title': np.nan,
        'publication': np.nan,
        'year': np.nan,
        'URL': np.nan,
    }

    # A missing / non-string DOI cannot resolve; skip the pointless request.
    if not isinstance(doi, str) or not doi.strip():
        return meta

    try:
        ref = crossref.get_publication_as_json(doi)
    except Exception:
        # Best effort: a failed lookup leaves the defaults in place.
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit still propagate and a long run can be stopped.
        return meta

    # How each output field is read out of the CrossRef record.
    extractors = {
        'title': lambda r: r['title'][0],
        'publication': lambda r: r['container-title'][0],
        'year': lambda r: r['issued']['date-parts'][0][0],
        'URL': lambda r: r['URL'],
    }
    for field, extract in extractors.items():
        try:
            meta[field] = extract(ref)
        except (KeyError, IndexError, TypeError):
            # Field absent, empty list, or null in the record: keep NaN.
            pass

    return meta

In [18]:
getArticleMeta(test_doi)

{'URL': 'http://dx.doi.org/10.1016/0002-9343(73)90176-9',
 'doi': '10.1016/0002-9343(73)90176-9',
 'publication': 'The American Journal of Medicine',
 'title': 'Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus',
 'year': 1973}

## Apply this function to get metadata for each article

In [52]:
def addArticleMeta(row):
    """Return a copy of `row` extended with the CrossRef metadata for its DOI.

    Intended for ``DataFrame.apply(addArticleMeta, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One article record; must contain a 'doi' entry.

    Returns
    -------
    pandas.Series
        The original entries plus every key returned by ``getArticleMeta``
        (an existing 'doi' entry is overwritten with the value it echoes).
    """
    # Work on a copy: pandas documents that a function handed to `apply`
    # should not mutate the object it receives.
    enriched = row.copy()
    for key, value in getArticleMeta(row['doi']).items():
        enriched[key] = value
    return enriched

In [53]:
test_df = articles_df[:5]

In [54]:
test_df.shape

(5, 4)

In [55]:
test_df = test_df.apply(addArticleMeta, axis=1)

In [56]:
test_df

Unnamed: 0,sha,source_x,doi,pubmed_id,title,publication,year,URL
0,aecbc613ebdab36753235197ffb4f35734b5ca63,Elsevier,10.1016/0002-9343(73)90176-9,4579077.0,Clinical and immunologic studies in identical ...,The American Journal of Medicine,1973,http://dx.doi.org/10.1016/0002-9343(73)90176-9
1,212e990b378e8d267042753d5f9d4a64ea5e9869,Elsevier,10.1016/0002-9343(85)90367-5,2861742.0,Infectious diarrhea: Pathogenesis and risk fac...,The American Journal of Medicine,1985,http://dx.doi.org/10.1016/0002-9343(85)90367-5
2,bf5d344243153d58be692ceb26f52c08e2bd2d2f,Elsevier,10.1016/0002-9343(88)90356-7,3052052.0,New perspectives on the pathogenesis of rheuma...,The American Journal of Medicine,1988,http://dx.doi.org/10.1016/0002-9343(88)90356-7
3,ddd2ecf42ec86ad66072962081e1ce4594431f9c,Elsevier,10.1016/0002-9343(88)90456-1,3048091.0,Management of acute and chronic respiratory tr...,The American Journal of Medicine,1988,http://dx.doi.org/10.1016/0002-9343(88)90456-1
4,a55cb4e724091ced46b5e55b982a14525eea1c7e,Elsevier,10.1016/0002-9343(92)90608-E,1621745.0,Acute bronchitis: Results of U.S. and European...,The American Journal of Medicine,1992,http://dx.doi.org/10.1016/0002-9343(92)90608-e


### now do it for real
Heads up: this command makes one CrossRef request per article and takes over an hour to run

In [58]:
# Row-wise apply: addArticleMeta issues one CrossRef lookup per row via getArticleMeta.
# Rows whose lookup fails keep NaN in the metadata columns.
# This rebinds articles_df, so re-running the cell repeats every lookup.
articles_df = articles_df.apply(addArticleMeta, axis=1)

In [60]:
articles_df.tail()

Unnamed: 0,sha,source_x,doi,pubmed_id,title,publication,year,URL
28457,d4f00f66c732c292fcfc28b19f44daa2fa620901,PMC,10.1371/journal.pone.0188325,29149199.0,Epidemiology and clinical profile of pathogens...,PLOS ONE,2017.0,http://dx.doi.org/10.1371/journal.pone.0188325
28458,ec575d33c0d3b34af7644fcfed64af045a75ab63,Elsevier,10.1016/j.jmb.2008.12.029,19121325.0,Functional Analysis of the Transmembrane Domai...,Journal of Molecular Biology,2009.0,http://dx.doi.org/10.1016/j.jmb.2008.12.029
28459,7f8715a818bfd325bf4413d3c07003d7ce7b6f7e,PMC,10.1128/mBio.00898-18,29970463.0,Viral Entry Properties Required for Fitness in...,mBio,2018.0,http://dx.doi.org/10.1128/mbio.00898-18
28460,07e78e218a159c35e9599e3751a99551a271597b,Elsevier,10.1016/j.virol.2011.01.013,21324503.0,Arenavirus reverse genetics: New approaches fo...,Virology,2011.0,http://dx.doi.org/10.1016/j.virol.2011.01.013
28461,04bc03c90437934a75fc6fdc228817234ef84c3a,PMC,10.3389/fimmu.2017.01519,29167674.0,A New Immunosuppressive Molecule Emodin Induce...,Frontiers in Immunology,2017.0,http://dx.doi.org/10.3389/fimmu.2017.01519


In [61]:
# Save the enriched table next to the input file; index=False leaves the row index out of the CSV.
articles_df.to_csv(join(dataDir, 'articlesFormatted.csv'), index=False)