We need to figure out which HDSR DOIs have and have not been deposited with the DOAJ. 
To do that, we pull a list of all HDSR DOIs from Crossref,
check those against the DOAJ API,
cram everything into some dataframes, and then merge those dataframes so we have a list of all DOIs, their deposit dates (we want to upload the latest articles first), and their DOAJ upload status. 


In [17]:
import json
import pandas as pd
import numpy as np
# from pandas import json_normalize 
import requests
import datetime
import xml.etree.ElementTree as ET

In [18]:
# Short journal code; used as the directory name and file stem of the generated XML file.
jrn_code = 'HDSR'
# Full journal title; used in the Crossref container-title filter and as the journalTitle value.
jrn = 'Harvard Data Science Review'

In [19]:
def check_deposit_status(row):
    """Look up one DOI in the DOAJ article search API.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'DOI' entry.

    Returns:
        pd.Series holding two single-element lists, ``[[doi], [hit_count]]``.
        ``hit_count`` is the length of the response's 'results' list, or None
        when the lookup failed, so an unknown status is never mistaken for
        "not deposited" (0).
    """
    doi = row.get('DOI')
    hit_count = None
    if doi is None:
        print('DOAJ lookup skipped: row has no DOI')
    else:
        try:
            URL = 'https://doaj.org/api/search/articles/doi%3A{}'.format(doi)
            # A timeout keeps one stalled request from hanging the whole apply().
            r = requests.get(url=URL, timeout=30)
            # Treat HTTP error responses as failures rather than parsing their body.
            r.raise_for_status()
            hit_count = len(r.json()['results'])
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            print('DOAJ lookup failed for {}: {}'.format(doi, e))
    # Always return the same shape so DataFrame.apply builds a consistent
    # two-column frame, even for rows whose lookup failed.
    return pd.Series([[doi], [hit_count]])

In [20]:
# START HERE
# Generate list of all DOIs: ask Crossref for the journal's works, most recently deposited first.
# NOTE(review): rows=1000 fetches a single page; a journal with more works would need paging — confirm.
try:
    URL = 'https://api.crossref.org/works?filter=container-title:{}&rows=1000&select=DOI,volume,issue,deposited,URL,abstract,author,title&sort=deposited&order=desc'.format(jrn)
    # A timeout keeps a stalled connection from hanging the notebook.
    r = requests.get(url = URL, timeout=60)
    # Fail on HTTP error responses instead of trying to parse their body.
    r.raise_for_status()
    data = r.json()
    response = data['message']['items']
    # Flatten the nested JSON into columns with dotted names (e.g. deposited.date-time).
    df = pd.json_normalize(response)
except (requests.RequestException, ValueError, KeyError) as e:
    # Report and re-raise: without a fresh df every later cell would either
    # fail or silently reuse a df left over from an earlier run.
    print('Crossref request failed: {}'.format(e))
    raise

In [21]:
# Clean and format the date info: parse the deposit timestamp and keep only YYYY-MM-DD.
df['deposited.date-time'] = pd.to_datetime(df['deposited.date-time']).dt.strftime('%Y-%m-%d')

# The title comes in as a list so we need to grab just the string.
# A missing value or an empty list becomes None instead of raising.
df["title"] = df["title"].apply(lambda x: x[0] if isinstance(x, list) and x else None)



In [22]:
# Ask the DOAJ API, one DOI at a time, whether each article has been uploaded.
# NOTE(review): only the first ten rows are kept and checked — confirm this cap is intended.
df = df.iloc[:10]
status_df = df.apply(check_deposit_status, axis=1)
status_df.columns = ['DOI', 'DOAJ_Deposit_Status']
# Every cell holds a one-element list; unwrap it to the bare value.
for wrapped_col in ("DOI", "DOAJ_Deposit_Status"):
    status_df[wrapped_col] = status_df[wrapped_col].map(lambda cell: cell[0])

# status_df['DOI'] = status_df.apply(janky_cleaning_DOI, axis=1)
# status_df['DOAJ_Deposit_Status'] = status_df.apply(janky_cleaning_Status, axis=1)

In [23]:
# Join the Crossref metadata (DOI, Crossref deposit date, ...) with the DOAJ upload status, matching on DOI.
result = df.merge(status_df, on="DOI")

# Keep only the articles not yet uploaded to the DOAJ (DOAJ_Deposit_Status == 0).
not_in_doaj = result['DOAJ_Deposit_Status'] == 0
result = result.loc[not_in_doaj]


In [24]:
# result.to_csv('{}/result.csv'.format(jrn_code), index=None)

In [25]:
# status_df.to_csv('{}/status.csv'.format(jrn_code), index=None)

In [26]:
# df.to_csv('{}/{}.csv'.format(jrn_code,jrn_code), index=None)

## Convert Crossref JSON into DOAJ XML

### Required
- Title
- Journal title
- Publication date
- Full-text URL
- EISSN

### Optional
- Authors
- Affiliations
- Abstracts
- Publisher
- EISSN
- Volume number
- Issue number
- DOI

In [27]:
# result = result[['DOI', 'volume', 'issue', 'deposited.date-time', 'URL']]
# Build the export frame with non-inplace operations: result is a filtered
# selection of the merged frame, and mutating such a selection in place
# triggers pandas' SettingWithCopyWarning.
result = result.drop(columns=['author', 'deposited.date-parts', 'DOAJ_Deposit_Status', 'deposited.timestamp'])
# Journal-level constants, identical for every record.
result = result.assign(
    journalTitle=jrn,
    eissn='2644-2353',
    language='eng',
    publisher='The MIT Press',
)
# Rename the Crossref column names to the element names used in the XML output.
# NOTE(review): publicationDate is filled from the Crossref *deposit* date — confirm that is the intended date.
result = result.rename(columns={'DOI':'doi', 'URL': 'fullTextUrl', 'deposited.date-time': 'publicationDate'})


In [29]:
# Maps each DOI to the list of its authors' display names; filled in by get_author.
dict_names = {}
def get_author(row):
    """Record the author names of one Crossref record in ``dict_names``.

    Only names are stored; affiliations are intentionally not read.

    Args:
        row: Mapping-like record with a 'DOI' entry and, optionally, an
            'author' list of contributor dicts.

    Returns:
        The shared module-level ``dict_names`` mapping (DOI -> list of names).
    """
    authors = row.get('author')
    if not isinstance(authors, list):
        # Anything that is not a list (e.g. a missing value) means "no authors".
        authors = []
    names_list = []
    for author in authors:
        # Join whichever of given/family is present so that a contributor with
        # only one of them does not raise.
        name = ' '.join(
            part for part in (author.get('given'), author.get('family')) if part
        )
        if not name:
            # presumably organisational contributors carry a single 'name'
            # field instead — verify against the Crossref response
            name = author.get('name')
        if name:
            names_list.append(name)
    dict_names[row['DOI']] = names_list
    return dict_names
    
# Run get_author on every row to fill dict_names (DOI -> author names).
# Each call returns the same shared dict, so the resulting Series is only useful as cell output.
df.apply(get_author, axis=1)


0    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
1    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
2    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
3    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
4    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
5    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
6    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
7    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
8    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
9    {'10.1162/99608f92.68a012eb': ['Xiao-Li Meng']...
dtype: object

In [14]:
# Serialise result straight into the target element names: a <records> root
# wrapping one <record> per row. Naming them at serialisation time matters
# because ElementTree.findall only searches below the root, so a later
# findall("data") can never reach the root element to rename it.
xml = result.to_xml(index=False, root_name="records", row_name="record")
tree = ET.ElementTree(ET.fromstring(xml))


In [15]:
# Append an <authors> element to every record, with one <author><name> child per author.
for record_elem in tree.findall("record"):
    authors_elem = ET.SubElement(record_elem, 'authors')
    # findtext returns None when the record has no <doi> child.
    doi = record_elem.findtext('doi')
    # A DOI with no entry in dict_names yields an empty <authors> element
    # instead of a KeyError.
    for author_name in dict_names.get(doi, []):
        author_elem = ET.SubElement(authors_elem, 'author')
        name_elem = ET.SubElement(author_elem, 'name')
        name_elem.text = author_name
    
    

Andrew Connolly
Joseph Hellerstein
Naomi Alterman
David Beck
Rob Fatland
Ed Lazowska
Vani Mandava
Sarah Stone
Pavle Avramović
Eric Siegel
Marta Stelmaszak
Kelsey Kline
Emily A. Beck
Hannah Tavalire
Jake Searcy
Kavya Mehul Shah
Ammaar Ahmed Saeed
Joseph K. Blitzstein
Thomas B. Berrett
Christl A. Donnelly


In [16]:
# Write the XML tree to <jrn_code>/<jrn_code>.xml.
# Explicit UTF-8 plus an XML declaration: the default ("us-ascii", no
# declaration) writes non-ASCII characters as numeric character references.
with open('{}/{}.xml'.format(jrn_code,jrn_code), 'wb') as f:
    tree.write(f, encoding='utf-8', xml_declaration=True)