In [46]:
# DEPENDENCIES
import json
import pandas as pd

from pathlib import Path
from pandas.io.json import json_normalize

# CONSTANTS
DATA = Path.cwd().parent / 'data'
ZIPFILES = DATA / 'zipfiles'
LITCOVID = DATA / 'litcovid'
RAW = 'Link_Cov_P_database_2020_04_%d__9233.zip'
CLEAN = 'Link_Cov_P_database_2020_04_%d__6381.zip'

# FUNCTIONS
def crossref(data):
    for element in data:
        xref = element['xref']
        if isinstance(xref, str):
            continue

        yield xref['message']

In [47]:
current_raw = pd.read_csv(ZIPFILES / (RAW % 11), compression = 'zip')

with (LITCOVID / 'litcovid.json').open('r') as f:
    xref = json_normalize(crossref(json.load(f)))

# Adding `id_master` missing values

In [49]:
ids = range(1, current_raw['id_master_1'].size + 1)

current_raw['id_master_1'] = ids
current_raw['id_master_2'] = ids

# Processing Crossref data

In [50]:
# Lowering column names
xref.columns = xref.columns.str.lower()

# Desired columns for merging
columns = ['doi', 'publisher', 'link', 'deposited.date-time', 'issn', 'reference']
renaming = ['xpublisher', 'xlink', 'xdeposited.date-time', 'xissn', 'xreference']

# Prefixing Crossref columns with `x`
xref = xref[columns].rename(columns = dict(zip(columns[1:], renaming)))

# Merging

In [51]:
# Getting LitCovid rows
litcovid = current_raw[current_raw['id_lit'].notna()]

In [52]:
merged = pd.merge(litcovid, xref, how = 'inner', on = 'doi')
merged.shape

(1207, 185)

In [53]:
new_raw = pd.merge(current_raw, xref, how = 'left', on = 'doi')
new_raw.shape

(9236, 185)

In [54]:
new_raw['has_xref'] = new_raw['xpublisher'].notna()
new_raw['has_xref'].sum()

1309

# Saving new raw file

In [55]:
new_raw.to_csv(ZIPFILES / (RAW % 12), compression = 'gzip')

# Saving new clean

In [56]:
new_raw[new_raw['current_version']].to_csv(ZIPFILES / (CLEAN % 12), compression = 'gzip')