<a href="https://colab.research.google.com/github/jlee2843/Peer-Review-22-23/blob/Albert-2022/API_Biorvix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import sys
print("version:", sys.version)

version: 3.9.7 (default, Sep 16 2021, 13:09:58) 
[GCC 7.5.0]


In [3]:
#installing of models
!pip install python-doi
!pip install pyarrow
!pip install multipledispatch
!pip install pixiedust

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

**Biorxiv API** information is found [here](https://api.biorxiv.org/details/medrxiv/help).<br>
**Final edit:** December 02, 2022.<br>
**NB:** Using Jenny's notebook  as a template
<br>
tqdm is used as visualization for the processing fo data. (Its manual can be found https://github.com/tqdm/tqdm#manual)
<br>
<br>
**NB: Some issues when using BioRxiv API**
1. there are empty entry e.g. no subject area declaired
    - sometimes these entries can be fixed and sometimes it cannot. For example, one can manual enter the corresponding other but there is no way to find the intended subject area when it is not given.
2. sometimes the publication date is before the preprint date
3. sometimes the date return is not in the format of 'yyyy-mm-dd' when querying for published article detail
4. sometimes the information returned have miscellanous information for example '- Genetic Institute...' in the author field which requires manual reentry of information


# Common functions

In [59]:
from pandas.core.internals.ops import Iterator
#from tqdm.notebook import tqdm, trange
from typing import Union, List
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from threading import current_thread
from pathlib import Path

import math
import pandas as pd
import time
import numpy as np
import requests
import json
import urllib, urllib.request
import pathlib
import tqdm.contrib.concurrent as tq
import doi

# common helper function
def get_total (url) -> int:
    json_info = get_json_data(url)
    return json_info["messages"][0]["total"]

def get_json_data (url):
    try:
        request_API = requests.get(url)
        #print(f"{url} request_API: {request_API}")
        return json.loads(request_API.text)
    except Exception as e:
        time.sleep(120)
        request_API = requests.get(url)
        return json.loads(request_API.text)

def get_data(url:str, loop_range:range, disable:bool):
    #print(f"values: {list(loop_range)}")
    results = []
    #print(args)
    results = tq.thread_map(get_json_data, (f'{url}/{cursor}' for cursor in loop_range), 
                            desc=f'get_json_data {current_thread().name}', total=len(loop_range),
                            disable=disable, leave=False)
    
    return results

#def get_article_detail(url, articles):
#    result = get_data(url,articles)
#
#    return result

def process_data(json_info, keys:List[str], cursor:int, disable:bool) -> List:
    journal_list = [[entry + cursor] + [getValue(journal, key) for key in keys] for entry, journal in enumerate(json_info["collection"])]
    if disable is False:
        time.sleep(0.001 * len(journal_list))
    return journal_list

def query_to_df(results, keys:List[str], col_names:List[str], loop_list:range, disable:bool) -> pd.DataFrame:
    args = zip(results, 
               [keys for _ in loop_list],
               loop_list,
               [disable for _ in loop_list])
    args = list(args)
    total = len(args)
    result_list = []
    result_list = tq.thread_map(lambda p: process_data(*p), args, desc=f'processing data {current_thread().name}', total=total, leave=False, disable=disable)
    
    data = np.array(flatten(result_list))
    #print(data)
    #print(col_names)0
    #print(f'loop_list: {loop_list}')
    return create_df(data, col_names)

# prepublish helper function
def create_prepublish_df(df:pd.DataFrame) -> pd.DataFrame:
    try:
        df['Num_of_Authors'] = df.Authors.apply(lambda x: len(x.split(';')))
        df.DOI = df.DOI.astype('str')
        df.Title = df.Title.astype('str').map(lambda x: x.strip())
        df.Authors = df.Authors.astype('str').map(lambda x: x.strip())
        df.Corresponding_Authors = df.Corresponding_Authors.astype('str').map(lambda x: x.strip())
        df.Institution = df.Institution.map(lambda x: x.strip().upper()).astype('category')
        df.Date = df.Date.map(lambda x: convert_date(x)).astype('datetime64')
        df.Type = df.Type.map(lambda x: x.strip().lower()).astype('category')
        df.Category = df.Category.map(lambda x: x.strip().title()).astype('category')
#        df.Xml = df.Xml.astype('str')
        df.Published = df.Published.astype('str')
    except Exception as e:
        print(f'Error in data format:{e.args}\n')
        print(e.with_traceback)

    return df

# helper functions for published articles
# pub = ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"]
def create_published_df(df:pd.DataFrame) ->pd.DataFrame:
    try:
        df['Num_of_Authors'] = df.Authors.apply(lambda x: len(x.split(';')))
        df.DOI = df.DOI.astype('str')
        df.pub_DOI = df.pub_DOI.astype('str')
        df.Title = df.Title.astype('str').map(lambda x: x.strip())
        df.Authors = df.Authors.astype('str').map(lambda x: x.replace('\'','').strip())
        df.Corresponding_Authors = df.Corresponding_Authors.astype('str').str.strip()
        df.Institution = df.Institution.map(lambda x: x.strip().upper()).astype('category')
        df.Category = df.Category.map(lambda x: x.strip().title()).astype('category')
        df.Journal = df.Journal.astype('str').map(lambda x: x.strip().title()).astype('category')
        #tqdm.pandas(desc="Preprint_Date conversion!")
        df.Preprint_Date = df.Preprint_Date.map(lambda x: convert_date(x)).astype('datetime64')
        #tqdm.pandas(desc="Published_Date conversion!")
        df.Published_Date = df.Published_Date.map(lambda x: convert_date(x)).astype('datetime64')
    except Exception as e:
        print(f'Error in data format:{e.args}\n')
        print(e.with_traceback)
    #    print(f'Preprint_Date: {df.Preprint_Date} Published_Date: {df.Published_Date}')
#
    return df

def convert_date(value:str) -> datetime:
    try:
        return datetime.strptime(value.strip().split(':')[0], '%Y-%m-%d')
    except Exception as e:
        print(e)
        return pd.NaT

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
        #df.Published_Date = datetime.strptime(df.Published_Date.str.strip().str.split(':')[0], '%Y-%m-%d')

def getValue(journal, key):
    result = np.NaN
    try:
        result = journal[key]
    except Exception as e:
        print (f'key: {key} journal: {journal}\n{e}')
        raise e

    finally:
        return result
  
def get_big_data(path:str, url:str, cursor:int, json_keys:List[str], col_names:List[str], step:int, disable:bool):
    result_list = [get_json_data(f'{url}/{cursor}')]
    df = query_to_df(result_list, json_keys, col_names, range(cursor, cursor + step, step), disable)
    df.to_parquet(pathlib.Path(f'{path}/{datetime.utcnow().timestamp()}.parquet'))
    #time.sleep(0.001)  # to visualize the progress

def multithread_processor(path:str, url:str, json_keys:List[str], col_names:List[str], step:int, loop_range:range, disable:bool):
    #print(f"values: {list(loop_range)}")
    results = []
    args = [(path, url, cursor, json_keys, col_names, step, disable) for cursor in loop_range]
    #print(f'args: {len(args)}\n{args}')
    tq.thread_map(lambda p: get_big_data(*p), args, desc='get_big_data', total=len(args))

def process_doi_data(path:str, url:str, doi:pd.Series, meta: List, col_names: List, item:int, disable:bool = False):
    results = get_data(url, doi[item:item+step], disable)
    tmp = list(results)
    df = query_to_df([r for r in tmp], meta, col_names, 
                     range(item, item + (len(tmp) * step), step), disable)
    df.to_parquet(pathlib.Path(f'{path}/{datetime.utcnow().timestamp()}.parquet'))

def checkDOI(x:str):
  if doi.validate_doi(x.strip()) is None:
    raise Exception(f'invalid doi: {x.strip()}')
  else:
    return x.strip()

def getSlurmID():
    import subprocess
    id = None
    try:
        id = subprocess.Popen("squeue -u $USER | tail -1 | awk {'print $1'}", shell=True, stdout=subprocess.PIPE).stdout
    except Exception:
        id = subprocess.Popen("ssh -f arc squeue -u $USER | tail -1 | awk {'print $1'}", shell=True, stdout=subprocess.PIPE).stdout
    return str(id.read().rstrip(), 'utf-8')
    
def getSlurmDir() -> Path:
    return Path('/scratch', getSlurmID())

# returns (SparkContext, SqlContext) objects.
def getSparkSqlContext():
    import os
    import atexit
    import sys
    import re
    import pyspark
    from pyspark.conf import SparkConf
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext

import pixiedust
pixiedust.enableJobMonitor()
conflines=[tuple(a.rstrip().split(" ")) for a in open(os.environ['SPARK_CONFIG_FILE']).readlines()]
    conf=SparkConf()
    conf.setAll(conflines)
    conf.setMaster("spark://%s:%s"% (os.environ['SPARK_MASTER_HOST'],os.environ['SPARK_MASTER_PORT']))
    sc=pyspark.SparkContext(conf=conf)

    sqlCtx=SQLContext(sc)
    
    return sc, sqlCtx

freq_count = lambda x,y: x[y].value_counts()

flatten = lambda y: sorted([sublist for inner in y for sublist in inner],
                           key=lambda x:x[0])

create_df = lambda x, y: pd.DataFrame(data=x[:, 1:], index=x[:, 0], columns=y)


In [60]:
print(f'Slurm id: {getSlurmID()}')
print(f'Does {getSlurmDir()} exist: {getSlurmDir().exists()}')
getSparkSqlContext()

Slurm id: 17482823
Does /scratch/17482823 exist: True


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/18 19:35:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


(<SparkContext master=spark://mc50:8460 appName=pyspark-shell>,
 <pyspark.sql.context.SQLContext at 0x7f90653c8220>)

## Case \#1: Pre-publish Data query for given timeframe

In [8]:
# "global" variables
base_url: str = 'https://api.biorxiv.org'
query_type: str = 'details'
server: str = 'biorxiv'
start_interval: str = '2011-01-01'
end_interval: str = '2020-12-31'
step = 100


### BioRxiv Server

In [9]:
# "local" variables
url: str = f'{base_url}/{query_type}/{server}/{start_interval}/{end_interval}'
path: str = f'data/prepub-{query_type}-{server}-{start_interval}!{end_interval}-{datetime.now()}'
step = 100
journal_df = None

#finding the number of records for a given time period
#tmp = f"{base_url}/{start_interval}/{end_interval}"
#tmp = 'https://api.biorxiv.org/details/biorxiv/2020-08-21/2020-08-28'

#create directory
pathlib.Path(path).mkdir(parents=True, exist_ok=True)

multithread_processor(path, url, 
                      ["doi", "title", "authors", "author_corresponding", "author_corresponding_institution", "date", "version", "type", "category", "jatsxml", "published"],
                      ["DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Date", "Version", "Type", "Category", "Xml", "Published"],
                      step, range(0, get_total(url), step), True)



get_big_data:   0%|          | 0/1481 [00:00<?, ?it/s]

In [10]:
journal_df = pd.read_parquet(pathlib.Path(path))
journal_df = create_prepublish_df(journal_df)

#display(journal_df)

#### Cleaning and fixing missing data

In [11]:
import re
pd.options.mode.use_inf_as_na = True # this option check for empty strings as well

jl_fix_df = pd.DataFrame(columns=journal_df.columns)
for col in journal_df:
    jl_fix_df = pd.concat([jl_fix_df, journal_df[journal_df[col].isna()]])
    jl_fix_df = pd.concat([jl_fix_df, journal_df[journal_df[col] == '']])

pattern = re.compile('^-|$-')
#display(journal_df[journal_df.Authors.str.contains(pattern)])
jl_fix_df = pd.concat([jl_fix_df, journal_df[journal_df.Authors.str.contains(pattern)]])
jl_fix_df.to_csv('data/data_fixing.csv')
display(jl_fix_df)

Unnamed: 0,DOI,Title,Authors,Corresponding_Authors,Institution,Date,Version,Type,Category,Xml,Published,Num_of_Authors
12673,10.1101/104778,Engaging narratives evoke similar neural activ...,"Cohen, S. S.; Henin, S.; Parra, L. C.",,THE CITY COLLEGE OF THE CITY UNIVERSITY OF NEW...,2017-01-31,1,new results,Neuroscience,https://www.biorxiv.org/content/early/2017/01/...,10.1038/s41598-017-04402-4,3
12153,10.1101/092171,Darwin: A Hardware-acceleration Framework for ...,"Turakhia, Y.; Zheng, K. J.; Bejerano, G.; Dall...",,STANFORD UNIVERSITY,2017-01-15,1,new results,Genomics,https://www.biorxiv.org/content/early/2017/01/...,10.1109/MM.2019.2910009,4
12154,10.1101/092171,Darwin: A Hardware-acceleration Framework for ...,"Turakhia, Y.; Zheng, K. J.; Bejerano, G.; Dall...",,STANFORD UNIVERSITY,2017-01-24,2,new results,Genomics,https://www.biorxiv.org/content/early/2017/01/...,10.1109/MM.2019.2910009,4
80783,10.1101/708800,CLEC-2 suppresses calcification in cultured os...,"Kanai, T.; Sawa, Y.; Takara, K.; Kajiwara, K.;...",,OKAYAMA UNIVERSITY GRADUATE SCHOOL OF MEDICINE...,2019-07-19,1,new results,Cell Biology,https://www.biorxiv.org/content/early/2019/07/...,,8
94799,10.1101/843763,STRESS-INDUCED GENETIC CHANGE IN FLAX REVEALS ...,"Li, X.",,CASE WESTERN RESERVE UNIVERSITY,2019-11-15,1,new results,Genomics,https://www.biorxiv.org/content/early/2019/11/...,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9616,10.1101/079533,Chiral vortex dynamics on membranes is an intr...,Diego Ramirez;Daniela A. Garcia-Soriano;Ana Ra...,Petra Schwille,MAX PLANCK INSTITUTE FOR BIOCHEMISTRY,2016-10-07,1,new results,,https://www.biorxiv.org/content/early/2016/10/...,,7
17763,10.1101/146035,The Emergent Connectome in Caenorhabditis eleg...,"- DevoWorm Group, ; Alicea, B. J.",Bradly J. Alicea,ORTHOGONAL RESEARCH,2017-06-05,1,new results,Bioinformatics,https://www.biorxiv.org/content/early/2017/06/...,10.1016/j.biosystems.2018.09.016,2
18592,10.1101/152553,Project MinE: study design and pilot analyses ...,"- Project MinE Consortium, ; Van Rheenen, W.;...",Jan H Veldink,"DEPARTMENT OF NEUROLOGY, BRAIN CENTER RUDOLF M...",2017-06-20,1,new results,Genetics,https://www.biorxiv.org/content/early/2017/06/...,10.1038/s41431-018-0177-4,37
19852,10.1101/143933,The Multiple Sclerosis Genomic Map: Role of pe...,- International Multiple Sclerosis Genetics Co...,Philip L. De Jager,"COLUMBIA UNIVERSITY MEDICAL CENTER, NEW YORK, ...",2017-07-13,1,new results,Genetics,https://www.biorxiv.org/content/early/2017/07/...,10.1126/science.aav7188,39


In [13]:
#fixed_df = pd.read_csv('')
#journal_df.merge(fixed_df, 'left', left_index=True)

In [None]:
display(journal_df[journal_df.Published =='NA'])

### Publication data for Prepublication data for the given time period

In [None]:
#from tqdm import tqdm_notebook, tnrange
from tqdm.notebook import tqdm_notebook
import time
import math

#variables
#https://api.biorxiv.org/pubs/biorvix/10.1101/759530
base_url: str = 'https://api.biorxiv.org'
query_type: str = 'pubs'
server: str ='biorxiv'
url: str = f'{base_url}/{query_type}/{server}'
path: str = f'pub_journal-{query_type}-{server}-{datetime.now()}'
step = 100
#doi = ['10.1101/856302']
doi = journal_df[journal_df.Published != 'NA'].Published

#create directory
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
total = len(doi)
#def process_doi_data(path:str, url:str, doi:pd.Series, meta: List, col_names: List, item:int, loop_list, disable:bool = False):
                     
args = [(path, url, doi, 
         ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"],
         ["DOI", "pub_DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Category", "Journal", "Preprint_Date", "Published_Date"],
         item, True) for item in range(0, total, step)]
print(f'total doi: {total} iter: {total/step}')
#tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', max_workers=5, total=len(args))
result = tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', total=len(args))


In [None]:
filtered_df = pd.read_parquet(pathlib.Path(path))
print([item for item in filtered_df.Published_Date if len(item.strip()) != 10])
filtered_df = create_published_df(filtered_df).reindex()
filtered_df.rename(columns={'pub_DOI':'Published', 'Preprint_Date':'Date'}, inplace=True)


Finding and cleaning missing data

In [None]:
pd.options.mode.use_inf_as_na = True # this option check for empty strings as well
fix_df = pd.DataFrame(columns=filtered_df.columns)
for col in filtered_df:
    fix_df = pd.concat([fix_df, filtered_df[filtered_df[col].isna()]])
    fix_df = pd.concat([fix_df, filtered_df[filtered_df[col] == '']])
    
    
display("Rows with incomplete data:", fix_df)

In [None]:
print(f'\nPrepublished Artices w/ Publication Info that have missing info: {filtered_df.isnull().sum()}\n')
#print('Published data:\n',[(name, journal_df[name].isnull().sum()) for name in journal_df.columns if journal_df[name].isnull().values.any()])
#print('Prepublish data:\n',[(name, filtered_df[name].isnull().sum()) for name in filtered_df.columns if filtered_df[name].isnull().values.any()])
fix_df = filtered_df[filtered_df['Published_Date'].isna()]
display("Rows with NaN:", fix_df)

In [None]:
filtered_df.info()
clean_df = filtered_df.dropna()
merged_df = pd.merge(journal_df, clean_df, how='right', on=['DOI','Published','Title', 'Authors','Corresponding_Authors','Institution','Category','Date','Num_of_Authors'])

In [None]:
pd.options.mode.use_inf_as_na = True
print(f'Prepublished Artices w/ Publication Info: {merged_df.isnull().sum()}')
display(merged_df)

#### General Descriptive Anaylsis

In [34]:
#summary of table
journal_df.describe(include='all', datetime_is_numeric=True)


Unnamed: 0,DOI,Title,Authors,Corresponding_Authors,Institution,Date,Version,Type,Category,Xml,Published,Num_of_Authors,NotPublished
count,148045,148045,148045,148045,148045,148045,148045.0,148045,148045,148045,148045.0,148045.0,148045
unique,107510,118008,114476,69080,34455,,25.0,5,28,146177,72872.0,,2
top,10.1101/617381,A general theory of individuated multicellularity,"Osato, N.",Thierry Mora,STANFORD UNIVERSITY,,1.0,new results,Neuroscience,https://www.biorxiv.org/content/early/2019/09/...,,,False
freq,26,19,27,48,1637,,107529.0,144703,27758,4,45212.0,,102833
mean,,,,,,2019-03-22 06:08:02.496538112,,,,,,7.165983,
min,,,,,,2013-11-07 00:00:00,,,,,,1.0,
25%,,,,,,2018-06-04 00:00:00,,,,,,4.0,
50%,,,,,,2019-07-01 00:00:00,,,,,,6.0,
75%,,,,,,2020-04-29 00:00:00,,,,,,9.0,
max,,,,,,2020-12-31 00:00:00,,,,,,64.0,


In [40]:
# published vs unpublished
def percentage (item1, item2) -> float:
    return round(item1 / (item1 + item2) * 100, 2)

journal_df['NotPublished'] = journal_df['Published'] == 'NA'
tmp_df = journal_df.groupby('DOI', as_index=False).first()
display(tmp_df)
result = freq_count(tmp_df,'NotPublished')

print(f'published: {result.iloc[0]} ({percentage(result.iloc[0],result.iloc[1])}%)')
print(f'not published: {result.iloc[1]} ({percentage(result.iloc[1],result.iloc[0])}%)')      

Unnamed: 0,DOI,Title,Authors,Corresponding_Authors,Institution,Date,Version,Type,Category,Xml,Published,Num_of_Authors,NotPublished
0,10.1101/000026,A Population Genetic Signature of Polygenic Lo...,Jeremy J Berg;Graham Coop;,Graham Coop,"UNIVERSITY OF CALIFORNIA, DAVIS",2013-11-07,1,new results,Genetics,https://www.biorxiv.org/content/early/2013/11/...,10.1371/journal.pgen.1004412,3,False
1,10.1101/000042,Routes for breaching and protecting genetic pr...,Yaniv Erlich;Arvind Narayanan;,Yaniv Erlich,WHITEHEAD INSTITUTE,2013-11-07,1,new results,Genomics,https://www.biorxiv.org/content/early/2013/11/...,10.1038/nrg3723,3,False
2,10.1101/000067,Genetics of single-cell protein abundance vari...,Frank Albert;Sebastian Treusch;Arthur H Shockl...,Leonid Kruglyak,UCLA,2013-11-07,1,new results,Genomics,https://www.biorxiv.org/content/early/2013/11/...,10.1038/nature12904,6,False
3,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,CALTECH,2013-11-07,1,new results,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,10.1109/ACC.2014.6859452,4,False
4,10.1101/000091,Designing Robustness to Temperature in a Feedf...,Shaunak Sen;Jongmin Kim;Richard M. Murray;,Shaunak Sen,INDIAN INSTITUTE OF TECHNOLOGY DELHI,2013-11-07,1,new results,Synthetic Biology,https://www.biorxiv.org/content/early/2013/11/...,,4,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107505,10.1101/872853,Variation in repeat copy number of epithelial ...,"Raposo, C. J.; McElroy, K. A.; Fuchs, S. M.",Stephen M Fuchs,TUFTS UNIVERSITY,2019-12-11,1,new results,Cell Biology,https://www.biorxiv.org/content/early/2019/12/...,10.1093/femsyr/foaa018,3,False
107506,10.1101/872879,Structure of the processive human Pol δ holoen...,"Lancey, C.; Tehseen, M.; Raducanu, V.-S.; Rash...",Alfredo De Biasio,LEICESTER INSTITUTE OF STRUCTURAL & CHEMICAL B...,2019-12-11,1,new results,Biochemistry,https://www.biorxiv.org/content/early/2019/12/...,10.1038/s41467-020-14898-6,12,False
107507,10.1101/872952,A key regulatory protein for flagellum length ...,"Atkins, M.; Tyc, J.; Shafiq, S.; Ahmed, M.; Be...",Sue Vaughan,OXFORD BROOKES UNIVERSITY,2019-12-11,1,new results,Cell Biology,https://www.biorxiv.org/content/early/2019/12/...,,10,True
107508,10.1101/873109,FGF9 and FGF10 use distinct signaling pathways...,"Yin, Y.; Ornitz, D. M.",David M Ornitz,WASHINGTON UNIVERSITY SCHOOL OF MEDICINE,2019-12-11,1,new results,Developmental Biology,https://www.biorxiv.org/content/early/2019/12/...,10.1126/scisignal.aay4353,2,False


published: 72951 (67.86%)
not published: 34559 (32.14%)


In [None]:
#freq count of Instituion
freq_count(journal_df, 'Institution')

In [None]:
#freq count of number of version
freq_count(journal_df, 'Version')

In [41]:
#freq count of Type
freq_count(tmp_df, 'Type')

new results              105228
confirmatory results       1648
contradictory results       632
                              2
withdrawn                     0
Name: Type, dtype: int64

In [None]:
#freq count Category
freq_count(tmp_df, 'Category')

#### Check

In [None]:
assert get_total(url) == journal_df.shape[0]
"Total number of papers submitted and the length of the dataframe match."

The above code runs through all papers that are submitted within selected time frame. Error discussed during the previous meeting has been fixed. 

----

## Case 2: Published Articles query for a given timefame

In [4]:
# "global" variables
#https://api.biorxiv.org/pubs/biorvix/2018-08-21/2018-08-28
base_url: str = 'https://api.biorxiv.org'
query_type: str = 'pubs'
server: str = 'biorxiv'
start_interval: str = '2011-01-01'
end_interval: str = '2020-12-31'


### BioRvix Server

In [5]:
# "local" variables
url: str = f'{base_url}/{query_type}/{server}/{start_interval}/{end_interval}'
path: str = f'data/pub-{query_type}-{server}-{start_interval}!{end_interval}-{datetime.now()}'
step = 100
pubs_df = None

#finding the number of records for a given time period

pathlib.Path(path).mkdir(parents=True, exist_ok=True)
print(url)

multithread_processor(path, url, 
                      ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"],
                      ["DOI", "pub_DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Category", "Journal", "Preprint_Date", "Published_Date"],
                      step, range(0, get_total(url), 100), True)


https://api.biorxiv.org/pubs/biorxiv/2011-01-01/2020-12-31


get_big_data:   0%|          | 0/560 [00:00<?, ?it/s]

key: preprint_author_corresponding journal: {'preprint_doi': '10.1101/026161', 'published_doi': '10.1242/jcs.195982', 'published_journal': 'Journal of Cell Science', 'preprint_platform': 'bioRxiv', 'preprint_title': 'Crucial Roles of the Arp2/3 Complex during Mammalian Corticogenesis', 'preprint_authors': 'Pei-Shan Wang; Fu-Sheng Chou; Fengli Guo; Praveen Suraneni; Sheng Xia; Sree Ramachandran; Rong Li', 'preprint_category': '', 'preprint_date': '2015-09-06', 'published_date': '2016-08-15'}
'preprint_author_corresponding'
key: preprint_author_corresponding_institution journal: {'preprint_doi': '10.1101/026161', 'published_doi': '10.1242/jcs.195982', 'published_journal': 'Journal of Cell Science', 'preprint_platform': 'bioRxiv', 'preprint_title': 'Crucial Roles of the Arp2/3 Complex during Mammalian Corticogenesis', 'preprint_authors': 'Pei-Shan Wang; Fu-Sheng Chou; Fengli Guo; Praveen Suraneni; Sheng Xia; Sree Ramachandran; Rong Li', 'preprint_category': '', 'preprint_date': '2015-09-0

In [6]:
pubs_df = pd.read_parquet(pathlib.Path(path)).sort_index(kind='mergesort', key=lambda x: x.astype(int))

In [35]:
pubs_df.head(1)
#print([(index, value, datetime.strptime(value.split(':')[0], '%Y-%m-%d').date()) for index, value in enumerate(pubs_df.Published_Date) if len(value.strip()) != 10])
print([(index, pubs_df.loc(index, 'DOI'), value) for index, value in enumerate(pubs_df.Published_Date) if len(value.strip()) != 10])



NameError: name 'pubs_df' is not defined

In [8]:

pubs_df = create_published_df(pubs_df)

#display(pubs_df)
#pubs_df.shape


In [19]:
pubs_df.info()
print(pubs_df.Published_Date.isna().values.any(), pubs_df.Preprint_Date.isna().values.any())
print(pubs_df.Published_Date.subtract(pubs_df.Preprint_Date).where(pubs_df.Published_Date >= pubs_df.Preprint_Date))
#for index in range(pubs_df.shape[0]):
#    try:
#        pubs_df.loc(index, 'Published_Date') - pubs_df.loc(index, 'Preprint_Date')
#    except Exception as e:
        #print(e)
        #print (f'index: {index} Published_Date: {')

<class 'pandas.core.frame.DataFrame'>
Index: 55979 entries, 0 to 55978
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DOI                    55979 non-null  object        
 1   pub_DOI                55979 non-null  object        
 2   Title                  55979 non-null  object        
 3   Authors                55979 non-null  object        
 4   Corresponding_Authors  55979 non-null  object        
 5   Institution            55979 non-null  category      
 6   Category               55979 non-null  category      
 7   Journal                55979 non-null  category      
 8   Preprint_Date          55979 non-null  datetime64[ns]
 9   Published_Date         55979 non-null  datetime64[ns]
 10  Num_of_Authors         55979 non-null  int64         
dtypes: category(3), datetime64[ns](2), int64(1), object(5)
memory usage: 4.9+ MB
False False
0        92 days
1        36 days
2

#### General Descriptive Analysis

In [10]:
pubs_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55979 entries, 0 to 55978
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DOI                    55979 non-null  object        
 1   pub_DOI                55979 non-null  object        
 2   Title                  55979 non-null  object        
 3   Authors                55979 non-null  object        
 4   Corresponding_Authors  55979 non-null  object        
 5   Institution            55979 non-null  category      
 6   Category               55979 non-null  category      
 7   Journal                55979 non-null  category      
 8   Preprint_Date          55979 non-null  datetime64[ns]
 9   Published_Date         55979 non-null  datetime64[ns]
 10  Num_of_Authors         55979 non-null  int64         
dtypes: category(3), datetime64[ns](2), int64(1), object(5)
memory usage: 4.9+ MB


In [20]:
#summary of table
pubs_df.describe(include='all', datetime_is_numeric=True)


Unnamed: 0,DOI,pub_DOI,Title,Authors,Corresponding_Authors,Institution,Category,Journal,Preprint_Date,Published_Date,Num_of_Authors
count,55979,55979,55979,55979,55979,55979,55979,55979,55979,55979,55979.0
unique,55974,55906,55961,55396,39467,18942,28,3321,,,
top,10.1101/756668,10.1016/j.neuroimage.2020.1165,Modeling Continuous Admixture,"Chang, W. H.; Lai, A. G.",Jesse D Bloom,STANFORD UNIVERSITY,Neuroscience,Plos One,,,
freq,2,3,2,7,30,644,9941,3408,,,
mean,,,,,,,,,2018-10-20 10:04:06.049411328,2019-05-22 20:02:06.304506880,7.018418
min,,,,,,,,,2013-11-07 00:00:00,2013-12-10 00:00:00,1.0
25%,,,,,,,,,2018-01-25 00:00:00,2018-09-03 00:00:00,4.0
50%,,,,,,,,,2019-01-09 00:00:00,2019-08-29 00:00:00,6.0
75%,,,,,,,,,2019-10-17 00:00:00,2020-05-28 00:00:00,9.0
max,,,,,,,,,2020-12-30 00:00:00,2020-12-31 00:00:00,54.0


In [12]:
#freq count of Num_of_Authors
freq_count(pubs_df,'Num_of_Authors')

4     7458
3     7120
5     6774
6     5914
2     4979
7     4749
8     3810
9     2875
10    2371
11    1778
12    1370
13    1111
14     868
1      828
15     665
16     497
17     451
18     352
19     272
20     226
21     182
22     147
23     128
24     121
25      98
26      97
27      63
28      57
30      56
42      48
29      45
41      42
31      41
34      39
32      37
33      36
43      36
39      33
38      32
36      29
40      28
35      24
46      20
45      20
44      15
37      14
47       9
49       8
48       3
54       1
50       1
52       1
Name: Num_of_Authors, dtype: int64

In [13]:
#freq count "Corresponding_Authors"
freq_count(pubs_df, 'Corresponding_Authors')

Jesse D Bloom              30
Thierry  Mora              29
Antonis  Rokas             26
Dave  Thirumalai           22
Mark  Gerstein             21
                           ..
Takuma  Kobayashi           1
Pauliina  Damdimopoulou     1
Xuequn  Shang               1
Gabi  Kastenmüller          1
Naeha  Subramanian          1
Name: Corresponding_Authors, Length: 39467, dtype: int64

In [14]:
#freq_count Institution
freq_count(pubs_df, 'Institution')

STANFORD UNIVERSITY                                             644
UNIVERSITY OF OXFORD                                            528
UNIVERSITY OF CAMBRIDGE                                         486
UNIVERSITY OF MICHIGAN                                          362
UNIVERSITY OF WASHINGTON                                        344
                                                               ... 
GUIZHOU UNIVERISTY OF CHINA                                       1
GUIZHOU UNIVERSITY                                                1
GUNMA UNIVERSITY GRADUATE SCHOOL OF HEALTH SCIENCES               1
GURDON INSTITUTE - UNIVERSITY OF CAMBRIDGE                        1
†DEPARTMENT OF PSYCHOLOGY, UNIVERSITY OF TURIN, TURIN, ITALY      1
Name: Institution, Length: 18942, dtype: int64

In [15]:
#freq_count Category
freq_count(pubs_df, 'Category')

Neuroscience                              9941
Microbiology                              5128
Bioinformatics                            4987
Genomics                                  3726
Evolutionary Biology                      3630
Genetics                                  3027
Cell Biology                              2794
Biophysics                                2747
Ecology                                   2201
Biochemistry                              2069
Molecular Biology                         1856
Cancer Biology                            1738
Developmental Biology                     1673
Plant Biology                             1653
Immunology                                1522
Systems Biology                           1431
Bioengineering                            1192
Epidemiology                              1006
Animal Behavior And Cognition              857
Physiology                                 720
Pharmacology And Toxicology                533
Synthetic Bio

In [16]:
#freq count Jounral
freq_count(pubs_df, 'Journal')

Plos One                                                                               3408
Elife                                                                                  2566
Scientific Reports                                                                     2479
Nature Communications                                                                  2099
Proceedings Of The National Academy Of Sciences                                        1369
                                                                                       ... 
International Journal Of Breast Cancer                                                    1
International Journal Of Behavioral Nutrition And Physical Activity                       1
International Journal Of Astrobiology                                                     1
International Journal Of Artificial Intelligence And Machine Learning                     1
Современные Проблемы Науки И Образования (Modern Problems Of Science And Educati

In [17]:
#freq count Preprint_Date
freq_count(pubs_df, 'Preprint_Date')

2019-05-24    139
2019-07-02    127
2019-09-05    122
2019-06-21    120
2018-12-13    117
             ... 
2015-05-26      1
2015-05-12      1
2014-07-18      1
2014-01-06      1
2020-11-23      1
Name: Preprint_Date, Length: 2368, dtype: int64

In [21]:
#freq count Published_date
freq_count(pubs_df, 'Published_Date')

2020-11-30    119
2020-11-10    114
2020-09-28    114
2020-05-19    113
2020-10-27    110
             ... 
2016-04-16      1
2016-03-26      1
2016-04-03      1
2016-02-27      1
2019-12-22      1
Name: Published_Date, Length: 2216, dtype: int64

In [34]:
#freq count by category
tmp = pubs_df
tmp['Time_Month'] = pubs_df.Published_Date.subtract(pubs_df.Preprint_Date).divide(np.timedelta64(1, 'M')).where(pubs_df.Published_Date > pubs_df.Preprint_Date)
#tmp.Time_Month = tmp.
tmp.groupby(['Category'], as_index=True).Time_Month.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,29.0,9.804351,5.299783,0.985647,5.125362,8.903674,13.536212,20.370028
Animal Behavior And Cognition,856.0,7.360031,5.203498,0.164274,4.041151,6.078154,9.306146,55.524754
Biochemistry,2060.0,5.388536,4.071484,0.032855,2.718742,4.402554,6.833816,33.446272
Bioengineering,1188.0,6.111285,3.97831,0.032855,3.285488,5.355346,7.893386,23.195548
Bioinformatics,4975.0,7.928177,5.713281,0.032855,4.17257,6.570977,10.217869,58.514549
Biophysics,2728.0,5.852493,4.438419,0.032855,3.022649,4.829668,7.523768,55.29477
Cancer Biology,1732.0,7.3303,4.900762,0.06571,4.065792,6.275283,9.527916,40.608637
Cell Biology,2789.0,6.81333,4.71981,0.032855,3.646892,5.815314,8.706544,47.442453
Clinical Trials,59.0,7.093871,4.517703,0.525678,3.811167,5.256781,9.495062,19.417236
Developmental Biology,1671.0,7.068244,4.952817,0.032855,3.876876,5.946734,8.837964,42.054252


In [None]:
#Time duration by Category - Journal
tmp.groupby(['Category','Journal'], as_index=True).Time_Month.describe().to_csv('data/Category-Journal_duration.csv')

In [33]:
#Time duration by Journal - Category
tmp.groupby(['Journal','Category'], as_index=True).Time_Month.describe().to_csv('data/Journal-Category_duration.csv')

#### Check

In [None]:
'''
                      columns=["pre_DOI", "pub_DOI",
                               "Title", "Authors", "Corresponding_Authors",
                               "Institution",
                               "Category", "Journal", "Preprint_Date", "Published_Date"])
'''
#assert get_total(f'{base_url}{query_type}{server}{start_interval}{end_interval}') == pubs_df.shape[0]
"Total number of papers published and the length of the dataframe does match."

----

### Detailed analysis of published articles for a given timeframe

In [1]:
from multipledispatch import dispatch
@dispatch(pd.DataFrame, int, str)
def get_values(df: pd.DataFrame, row: int, col: str):
    return get_values(df, [row], [col])

@dispatch(pd.DataFrame, int, list)
def get_values(df: pd.DataFrame, row: int, cols: List[str]):
    return get_values(df, [row], cols)

@dispatch(pd.DataFrame, list, str)
def get_values(df: pd.DataFrame, rows: List[int], col: str):
    return get_values(df, rows, [col])

@dispatch(pd.DataFrame, list, list)
def get_values(df: pd.DataFrame, rows: List[int], cols: List[str]) -> List[List[str]]:
    return [[df.at(row, col) for col in cols] for row in rows]

@dispatch(pd.DataFrame, str)
def get_values(df: pd.DataFrame, col: str):
    return df[col]

@dispatch(pd.DataFrame, list)
def get_values(df: pd.DataFrame, cols: List[str]):
    return df[cols]

ModuleNotFoundError: No module named 'multipledispatch'

                                                                                                                                                                          ### BioRvix Service

In [None]:
#from tqdm import tqdm_notebook, tnrange
from tqdm.notebook import tqdm_notebook
import time
import math

#variables
#https://api.biorxiv.org/pubs/biorvix/2018-08-21/2018-08-28
#https://api.biorxiv.org/details/biorxiv/10.1101/759530
#https://api.biorxiv.org/details/biorvix/10.1101/759530
base_url: str = 'https://api.biorxiv.org'
query_type: str = 'details'
server: str ='biorxiv'
url: str = f'{base_url}/{query_type}/{server}'
path: str = f'data/prepub-{query_type}-{server}-{datetime.now()}'
step = 100
#doi = ['10.1101/856302']
doi = pubs_df.DOI
prepub_df = None

'''
for item in tqdm_notebook(range(0, total, step), desc='get_prepublish_data', 
                 total=math.ceil(total/step)):
    #time.sleep(0.005)
    results = get_data(None, 
                       url, 
                       doi[item:item+step])
    tmp = list(results)
    prepub_df = query_to_df([r for r in tmp], 
                            ["doi", "title", "authors", "author_corresponding", "author_corresponding_institution", "date", "version", "type", "category", "published"],
                            range(item, item + (len(tmp) * step), step),
#                            range(item, (item + ((len(tmp[counter]) - 1) * step)) * 10, step),
                            ["DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Date", "Version", "Type", "Category", "pub_DOI"], False)
    #prepub_df = create_prepublish_df(prepub_df)
    prepub_df.to_parquet(pathlib.Path(f'{path}/{datetime.utcnow().timestamp()}.parquet'))
'''

#create directory
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
total = len(doi)
args = [(path, url, doi, item, step, True) for item in range(0, total, step)]
#print(f'args: {len(args)}\n{args}')
print(url, f'total doi: {total} iter: {total/step}')
#tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', max_workers=5, total=len(args))
tq.thread_map(lambda p: process_doi_data(*p), args, desc='process_doi_data', total=len(args))


In [None]:
prepub_df = create_prepublish_df(pd.read_parquet(pathlib.Path(path)))

prepub_df

In [None]:
print('Published data:\n',[(name, pubs_df[name].isnull().sum()) for name in pubs_df.columns if pubs_df[name].isnull().values.any()])
print('Prepublish data:\n',[(name, prepub_df[name].isnull().sum()) for name in prepub_df.columns if prepub_df[name].isnull().values.any()])

In [None]:
combined_df = pd.merge(pubs_df,prepub_df, how="outer", on=['DOI','Title', 'Authors', 'Corresponding_Authors', 'Institution', 'Category', 'Num_of_Authors', 'pub_DOI'])
#combined_df = pd.merge(pubs_df,prepub_df, how="right", on=['DOI','Title', 'Authors', 'Corresponding_Authors', 'Institution', 'Category', 'Num_of_Authors'])
combined_df['Time_month'] = (combined_df['Published_Date'] - combined_df['Preprint_Date']) / np.timedelta64(1, 'M')
combined_df['Keep'] = combined_df.Published_Date > combined_df.Preprint_Date
#ref = combined_df.set_index(['pub_DOI'], inplace=False)
combined_df.head(1)
#display(combined_df.groupby('DOI').count())

In [None]:
print([(index, value) for index, value in enumerate(combined_df.groupby(['DOI'])['Time_month']) ])


In [None]:
# checking for NA
print([(name, combined_df[name].isnull().sum()) for name in combined_df.columns if combined_df[name].isnull().values.any()])
print([(name, combined_df.groupby([name], dropna=False).size) for name in combined_df.columns if (combined_df.groupby([name], dropna=False)[name].count() > 0).any()])
print([(name, combined_df.groupby([name], dropna=True).size) for name in combined_df.columns if (combined_df.groupby([name], dropna=False)[name].count() > 0).any()])

In [None]:
#%%capture cap
display(prepub_df.shape, pubs_df.shape, combined_df.shape, combined_df.groupby(['pub_DOI'], as_index=True).count().shape)
#test = combined_df.groupby(['DOI', 'pub_DOI', 'Category', 'Institution', 'Corresponding_Authors', 'Journal', 'Preprint_Date', 'Published_Date', 'Authors', 'Num_of_Authors', 'Title', 'Date', 'Version'])
test = combined_df.groupby(['DOI'], as_index = True)
#display(test.iloc[test.Title.nunique().ne(1),:])
#display(test.Title.nunique().ne(1).index)
#display(test.Title.nunique().ne(1))
#key = test.Title.nunique().ne(1)
#display(ref.index.isin(key))
#display(ref[ref.index.isin(key)])
#display(test.Title.nunique().ne(1).shape)
#test.Title.indices.
#display(test.Title.indices)
#df.at only get signal value so need to use list comprehension to get multiple columns in a particular row
#print(test.Title.indices['10.1128/IAI.00353-19'])
#display(combined_df .iloc[[1826, 1827]])
#display(combined_df)
combined_df.groupby(['pub_DOI'], as_index=False).Title.unique()

In [None]:
combined_df.groupby(['Category', 'Keep'], as_index=True).Time_month.describe()

**NB:** There are some issues with the above table since there are 29 publications that have more than one enteries.

In [None]:
df = combined_df.loc[combined_df.reset_index().groupby(['pub_DOI'])['Time_month'].idxmax()]


In [None]:
display(df.groupby(['Category']).Time_month.describe())

In [None]:
combined_df.groupby(['Institution'], as_index=True).pub_DOI.describe().to_csv("data/general.csv")

In [None]:
combined_df.groupby(['Institution', 'Category'], as_index=True).Time_month.describe()

In [None]:
reduce_df = combined_df[combined_df.Keep == True]
reduce_df.groupby(['Category', 'Keep'], as_index=True).Time_month.describe()

In [None]:
print(pubs_df.shape)
print(prepub_df.shape)
print(test.count().shape)
print((test.Title.nunique() - 1 ).shape)

In [None]:
test.filter(lambda x: x['Title'].nunique() > 1).to_csv('data/title-change.csv')
prepub_df.to_csv('data/pre&pub-data.csv')
pubs_df.to_csv('data/pub_data.csv')

# Junk Code
Please disregard the codes below.

In [None]:
'''
for item in tqdm_notebook(range(0, total, step), desc='get_prepublish_data', 
                 total=math.ceil(total/step)):
    #time.sleep(0.005)
    results = get_data(None, 
                       url, 
                       doi[item:item+step])
    tmp = list(results)
    prepub_df = query_to_df([r for r in tmp], 
                            ["doi", "title", "authors", "author_corresponding", "author_corresponding_institution", "date", "version", "type", "category", "published"],
                            range(item, item + (len(tmp) * step), step),
#                            range(item, (item + ((len(tmp[counter]) - 1) * step)) * 10, step),
                            ["DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Date", "Version", "Type", "Category", "pub_DOI"], False)
    #prepub_df = create_prepublish_df(prepub_df)
    prepub_df.to_parquet(pathlib.Path(f'{path}/{datetime.utcnow().timestamp()}.parquet'))
'''



In [None]:
# "local" variables
url: str = f'{base_url}/{query_type}/{server}/{start_interval}/{end_interval}'
path: str = f'pub-{query_type}-{server}-{start_interval}!{end_interval}-{datetime.datetime.now()}'
step = 100
pubs_df = None

#finding the number of records for a given time period

pathlib.Path(path).mkdir(parents=True, exist_ok=True)
print(url)
results = get_data(None, 
         url, 
         range(0, get_total(url), 100))

tmp = list(results)
for item in range (0, get_total(url), step):
    result_list = tmp[item:(item+step)]
    pubs_df = query_to_df(result_list, 
                          ["preprint_doi", "published_doi", "preprint_title", "preprint_authors", "preprint_author_corresponding", "preprint_author_corresponding_institution", "preprint_category", "published_journal", "preprint_date", "published_date"],
                          range(item, len(result_list) * step, step),
                          ["DOI", "pub_DOI", "Title", "Authors", "Corresponding_Authors", "Institution", "Category", "Journal", "Preprint_Date", "Published_Date"])
    pubs_df.to_parquet(pathlib.Path(f'{path}/{datetime.datetime.utcnow().timestamp()}.parquet'))
pubs_df = pd.read_parquet(pathlib.Path(path))
pubs_df = create_published_df(pubs_df)

display(pubs_df)

In [None]:
def get_publisher_prefix(doi: pd.Series) -> pd.Series:
    return doi.apply(lambda x: x.split("/")[0])

query_type='/publisher'

def get_publisher_data(url)
get_data(None,f'{base_url}{query_type}'
         get_publisher_prefix(pubs_df.pub_DOI).unique())

**Journal API**<br>
https://www.nature.com/opensearch/<br>
https://www.biorxiv.org/content/10.1101/339747v4<br>
https://www.biorxiv.org/content/10.1101/339747v4.full.pdf<br>
https://api.biorxiv.org/details/biorxiv/10.1101/099697


In [None]:
journal_list = []

# `license`,`abstract`, and `server` are excluded from the metrics. 
for journal in json_info["collection"]:
    journal_list.append([journal["doi"], journal["title"], journal["authors"],
                         journal["author_corresponding"], 
                         journal["author_corresponding_institution"],
                         journal["date"], journal["version"], journal["type"],
                         journal["category"], journal["jatsxml"], journal["published"]])
    

In [None]:
journal_df = pd.DataFrame(data=journal_list,
                         columns=["DOI", "Title", "Authors", "Corresponding Authors",
                                  "Institution",
                                  "Date", "Version", "Type", "Category", "Xml", "Published"])
journal_df.head()