In [32]:
import pandas as pd
import numpy as np
import requests
import json
#During the creation of this database, we heavily relied on the code of John Bohannon, from Science. http://datadryad.org/bitstream/handle/10255/dryad.114711/Sci-Hub.html?sequence=2

In [2]:
months = ("sep2015", "oct2015", "nov2015", "dec2015", "jan2016", "feb2016")

In [3]:
%mkdir "scihub_data_temp"

In [39]:
journal_DOIs = pd.read_csv("publisher_DOI_prefixes.csv", index_col = 0)
#journal_DOIs.head()

In [9]:
def process_data(month):
    
    # load the file as a dataframe
    path = "scihub_data/"
    filename = month + ".tab"
    with open(path + filename, "r") as f:
        data = pd.read_table(f)

    # this is the format of the columns
    data.columns = ["date","doi","IP_code","country","city","coords"]

    # create a few more useful columns
    data[["latitude", "longitude"]] = data.coords.str.split(",", expand = True)
    data["prefix"] = data.dropna(subset = ["doi"]).doi.apply(lambda x: x.split("/")[0])

    # group by DOI prefix and count total downloads for each
    publishers = data.dropna(subset = ["prefix"]).groupby("prefix").count()
    publishers = publishers.sort_values(by = "date", ascending = False).date
    publishers = publishers.reset_index()
    publishers.columns = ["prefix","downloads"]

    # translate those DOI prefixes into publisher names using the CrossRef data
    data_publishers = pd.merge(publishers, journal_DOIs[["Prefix","Name"]],
                               left_on = "prefix", right_on = "Prefix", how = "left")
    data_publishers[["prefix","downloads","Name"]].to_csv("scihub_data_temp/%s_publishers.csv" %(month))

    # calculate the 100 most downloaded DOIs of the month
    top100_doi = data.groupby("doi").count().sort_values(by = "date", ascending = False)[:100].date
    top100_doi.name = "downloads"
    top100_doi.to_csv("scihub_data_temp/%s_top100_doi.csv" %month, header = "downloads")


In [10]:
for month in months:
    print(month)
    process_data(month)

sep2015
oct2015
nov2015
dec2015
jan2016
feb2016


In [11]:
publishers_by_month = [pd.read_csv("scihub_data_temp/" + i + "_publishers.csv") for i in months]
for i in publishers_by_month:
    print(len(i))

1095
1216
1039
1166
1223
1405


In [12]:
### DOI
all_papers = pd.concat([pd.read_csv("scihub_data_temp/" + i + "_top100_doi.csv") for i in months])

In [16]:
all_papers.groupby("doi").count().sort_values(by = "downloads",ascending = False).shape

(477, 1)

In [21]:
all_papers.head()

Unnamed: 0,doi,downloads
0,10.1007/978-1-4419-9716-6_11,7988
1,10.1056/NEJMoa1402121,6063
2,10.1116/1.4904970,2987
3,10.1103/PhysRevB.63.224204,2890
4,10.4028/www.scientific.net/AMM.7-8.159,2266


In [23]:
# Getting the links to use in the CrossRef API. 
query = 'http://api.crossref.org/works?query='
urls = []
for item in all_papers['doi']:
    urls.append(query + item + '&rows=1')

In [36]:
# Getting the data from the CrossRef API. 
article = {}
scihublist = []
for item in urls:
    try:
        resp = requests.get(url=item)
        data = json.loads(resp.text)
        article['title'] = data['message']['items'][0]['title']
        article['url'] = data['message']['items'][0]['URL']
        article['author'] = data['message']['items'][0]['author'][0]['family'] + ' ' + data['message']['items'][0]['author'][0]['given']
        article['journal'] = data['message']['items'][0]['container-title']
        article['date'] = data['message']['items'][0]['published-print']['date-parts']
        article['publisher']=data['message']['items'][0]['publisher']
        article['reference_count']=data['message']['items'][0]['reference-count']
        article['score']=data['message']['items'][0]['score']
        scihublist.append(article)
        article = {}
    except: 
        continue


In [38]:
#The data. 
scihublist

[{'author': u'Buschmann Claas T.',
  'date': [[2012, 9]],
  'journal': [u'Forensic Science, Medicine, and Pathology'],
  'publisher': u'Springer Nature',
  'reference_count': 0,
  'score': 1.506875,
  'title': [u'Alberto M. Marchevsky, Mark R. Wick (eds.): Evidence based pathology and laboratory medicine, 1st edition'],
  'url': u'http://dx.doi.org/10.1007/s12024-011-9285-y'},
 {'author': u'Rosner F.',
  'date': [[1996, 5, 27]],
  'journal': [u'Archives of Internal Medicine'],
  'publisher': u'American Medical Association (AMA)',
  'reference_count': 0,
  'score': 1.0939785,
  'title': [u'Physician-assisted suicide'],
  'url': u'http://dx.doi.org/10.1001/archinte.156.10.1116'},
 {'author': u'Seth Divya',
  'date': [[2015, 10, 23]],
  'journal': [u'Circulation Research'],
  'publisher': u'Ovid Technologies (Wolters Kluwer Health)',
  'reference_count': 0,
  'score': 1.0308855,
  'title': [u'SNOs Differ: Figure.'],
  'url': u'http://dx.doi.org/10.1161/circresaha.115.307551'},
 {'author':

In [40]:
#Dump it to JSON:
with open('data_scihub.txt', 'w') as outfile:
    json.dump(scihublist, outfile)