We need to figure out which DOIs have and have not been deposited with the DOAJ.
To do that, we pull a list of all DOIs from Crossref,
check those against the DOAJ API,
and cram everything into some dataframes, then merge those dataframes so we have a list of all DOIs, their deposit dates (we want to upload the latest articles first), and their DOAJ upload status.

DOAJ docs live: https://doaj.org/

In [1]:
import datetime
import json
import os
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
# from pandas import json_normalize 

In [2]:
# Journal being processed.
# `jrn` is the container title used in the Crossref filter and written out
# as the journalTitle of every record.
jrn = 'Network Neuroscience'
# `jrn_code` is the short code used to build output folder and file names.
jrn_code = 'NETN'


In [3]:
def check_deposit_status(row, timeout=30):
    """Look up one DOI in the DOAJ article search API.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the article DOI under the key ``'DOI'``.
    timeout : float, optional
        Seconds to wait for the DOAJ server before giving up.

    Returns
    -------
    pandas.Series
        Two cells, each a one-element list (the caller unwraps them with
        ``x[0]``): ``[doi]`` and ``[hit_count]``. ``hit_count`` is the number
        of entries in the response's ``results`` list, so 0 means the DOI was
        not found. If the lookup fails for any reason the count is ``None``,
        so the row keeps the same shape and is never mistaken for
        "not deposited".
    """
    doi = row['DOI']
    try:
        URL = 'https://doaj.org/api/search/articles/doi%3A{}'.format(doi)
        r = requests.get(url=URL, timeout=timeout)
        # Treat HTTP error pages as failures instead of trying to parse them.
        r.raise_for_status()
        hits = len(r.json()['results'])
    except Exception as e:
        # Best effort: report the problem and carry on with the other DOIs.
        print('DOAJ lookup failed for {}: {}'.format(doi, e))
        hits = None
    return pd.Series([[doi], [hits]])

In [4]:
# Generate list of all DOIs
# Ask Crossref for the journal's works (most recently deposited first) and
# flatten the returned items into a DataFrame, one row per work.
try:
    URL = 'https://api.crossref.org/works?filter=container-title:{}&rows=1000&select=DOI,volume,issue,deposited,URL,abstract,author,title&sort=deposited&order=desc'.format(jrn)
    r = requests.get(url=URL, timeout=60)
    # Fail on HTTP errors rather than trying to parse an error page.
    r.raise_for_status()
    data = r.json()
    response = data['message']['items']
    df = pd.json_normalize(response)
except Exception as e:
    print(e)
    # Re-raise: continuing would leave `df` undefined (or stale from an
    # earlier run of this cell) and every later cell would use bad data.
    raise

# The request asks for at most 1000 rows; warn if the journal has more works
# than were returned so the truncation does not go unnoticed.
total_results = data['message'].get('total-results')
if total_results is not None and total_results > len(response):
    print('Warning: Crossref reports {} works but only {} were fetched'.format(total_results, len(response)))

In [5]:
# Clean and format the date info: parse the deposit timestamp and keep only
# the calendar date as a YYYY-MM-DD string.
df['deposited.date-time'] = pd.to_datetime(df['deposited.date-time']).dt.strftime('%Y-%m-%d')

# The title comes in as a list so we need to grab just the string.
# Records with a missing or empty title get None instead of raising.
df["title"] = df["title"].apply(lambda x: x[0] if isinstance(x, list) and x else None)



In [6]:
# Query the DOAJ API once per row to find out whether each DOI is already there.
status_df = df.apply(check_deposit_status, axis=1)
status_df.columns = ['DOI', 'DOAJ_Deposit_Status']

# Every cell comes back as a one-element list; keep just the element.
status_df["DOI"] = [cell[0] for cell in status_df["DOI"]]
status_df["DOAJ_Deposit_Status"] = [cell[0] for cell in status_df["DOAJ_Deposit_Status"]]


In [7]:
# Now we have a DF that includes the DOI, the date deposited to Crossref,
# and the DOAJ upload status.
result = pd.merge(df, status_df, on="DOI")

# We want to filter this to just include those not uploaded to the DOAJ (DOAJ_Deposit_Status=0).
# .copy() gives an independent frame, so the in-place edits made to `result`
# later on do not operate on a slice of the merged frame.
result = result[result['DOAJ_Deposit_Status'] == 0].copy()


In [8]:
# result.to_csv('{}/result.csv'.format(jrn_code), index=None)

In [9]:
# status_df.to_csv('{}/status.csv'.format(jrn_code), index=None)

In [10]:
# df.to_csv('{}/{}.csv'.format(jrn_code,jrn_code), index=None)

## Convert Crossref JSON into DOAJ XML

### Required
- Title
- Journal title
- Publication date
- Full-text URL
- EISSN

### Optional
- Authors
- Affiliations
- Abstracts
- Publisher
- EISSN
- Volume number
- Issue number
- DOI

In [11]:
# Shape `result` for the DOAJ export: drop the Crossref-only columns, add the
# per-journal constants, and rename columns to the DOAJ element names.
# errors='ignore' keeps this working when Crossref returned no value for one
# of these fields and the column therefore does not exist. drop() and rename()
# return new frames, so nothing is modified in place on a filtered slice.
result = result.drop(
    columns=['author', 'deposited.date-parts', 'DOAJ_Deposit_Status', 'deposited.timestamp'],
    errors='ignore',
)
result['journalTitle'] = jrn
# NOTE(review): this eISSN is hard-coded separately from jrn / jrn_code and
# must be updated whenever the journal changes. Please confirm that
# 2644-2353 really is the eISSN of `jrn`; it may belong to a different title.
result['eissn'] = '2644-2353'
result['language'] = 'eng'
result['publisher'] = 'The MIT Press'
# 'fullTextUrl1' is a temporary name: the element is re-created as
# 'fullTextUrl' at the end of each record once the XML tree exists.
result = result.rename(columns={'DOI': 'doi', 'URL': 'fullTextUrl1', 'deposited.date-time': 'publicationDate'})


In [12]:
# The DOAJ schema won't validate unless the elements appear in this order,
# so fix the column order before the DataFrame is converted to XML.
doaj_column_order = [
    'language',
    'publisher',
    'journalTitle',
    'eissn',
    'publicationDate',
    'volume',
    'issue',
    'doi',
    'title',
    'fullTextUrl1',
]
result = result.loc[:, doaj_column_order]


In [16]:
# Author names per article, keyed by DOI. Filled in by get_author() and read
# later when the <authors> elements are added to the XML.
dict_names = {}


def get_author(row):
    """Record the display names of one article's authors in ``dict_names``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'DOI'`` and ``'author'``; ``'author'`` is expected to
        be a list of dicts with ``'given'`` / ``'family'`` (or ``'name'``)
        keys.

    Returns
    -------
    dict
        The shared module-level ``dict_names`` mapping (DOI -> list of names),
        not a per-row result. The function is called for its side effect.

    Notes
    -----
    Rows with no usable author list (missing key, NaN, ...) are skipped and
    get no entry. An author with only part of a name, or only a ``'name'``
    field, is recorded with whatever is available; previously one such author
    caused every author of the article to be dropped. Affiliations are not
    collected because nothing downstream uses them.
    """
    try:
        doi = row['DOI']
        authors = row['author']
    except KeyError:
        return dict_names
    if not isinstance(authors, (list, tuple)):  # no authors, skipping
        return dict_names

    names_list = []
    for author in authors:
        if not isinstance(author, dict):
            continue
        parts = [author.get('given'), author.get('family')]
        name = ' '.join(part for part in parts if part)
        if not name:
            # Fall back to a single 'name' field when given/family are absent.
            name = author.get('name')
        if name:
            names_list.append(name)
    dict_names[doi] = names_list
    return dict_names
# Run get_author on every row of df. Each call stores that row's author names
# in dict_names under the row's DOI; the Series returned by apply() is not
# assigned to anything.
df.apply(get_author, axis=1)


0      {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
1      {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
2      {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
3      {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
4      {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
                             ...                        
370    {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
371    {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
372    {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
373    {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
374    {'10.1162/netn_a_00319': ['Omid Kardan', 'Andr...
Length: 375, dtype: object

In [17]:
# Serialise the DataFrame using the element names the DOAJ upload expects:
# a <records> root with one <record> child per article.
# The names are set here through to_xml() because renaming afterwards with
# tree.findall("data") cannot work: findall() only searches the children of
# the root, so the root element itself was never renamed.
xml = result.to_xml(index=False, root_name='records', row_name='record')
tree = ET.ElementTree(ET.fromstring(xml))


In [18]:
# Append an <authors> element, with one <author><name>...</name></author> per
# author, to every record whose DOI has names in dict_names.
for record in tree.findall("record"):
    doi_elem = record.find('doi')
    names = dict_names.get(doi_elem.text, []) if doi_elem is not None else []
    if not names:
        # No known authors: leave <authors> out rather than emitting an
        # empty element.
        continue
    authors_elem = ET.SubElement(record, 'authors')
    for full_name in names:
        author_elem = ET.SubElement(authors_elem, 'author')
        ET.SubElement(author_elem, 'name').text = full_name
    


In [19]:
# Move the full-text URL to the end of each record: append a new
# <fullTextUrl> element carrying the same text, then remove the temporary
# <fullTextUrl1> element that came from the DataFrame column.
for rec in tree.findall("record"):
    url_elem = ET.SubElement(rec, 'fullTextUrl')
    placeholder = rec.find('./fullTextUrl1')
    url_elem.text = placeholder.text
    rec.remove(placeholder)


In [20]:
# Write the tree to journals/<jrn_code>/<jrn_code>_<timestamp>.xml.
dtstamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Create the journal's output folder on first use instead of failing.
out_dir = 'journals/{}'.format(jrn_code)
os.makedirs(out_dir, exist_ok=True)

with open('{}/{}_{}.xml'.format(out_dir, jrn_code, dtstamp), 'wb') as f:
    # Emit an explicit XML declaration and UTF-8 bytes so the file states its
    # own encoding.
    tree.write(f, encoding='utf-8', xml_declaration=True)