# Getting Full Papers

In [9]:
import urllib
import requests
import os
import math
import json
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch

import igemutils as igem

email = 'ivalexander13@berkeley.edu'  # contact e-mail address: replace with your own before running

## Create Subsets

In [10]:
# Read the Unpaywall metadata JSON, then partition it on the `is_oa` field:
# entries whose value is in [True] land in `oa_subset`, the rest (the
# complement) in `non_oa_subset`.
rxns_pmid = igem.get_json(
    "./get_metadata_pmidset/unpaywall_metadata_pmids.json"
)
oa_subset, non_oa_subset = igem.generate_subset(
    rxns_pmid,
    'is_oa',
    [True],
    return_complement=True,
)

Count of is_oa being [True]: 46843.
Count of is_oa not matching [True]: 49375.


# 1. Elsevier!!

## Splitting Data

In [11]:
# Number of split files to write (one per parallel worker).
n_splits = 6 # fill this in!!

# Keep only the non-open-access entries whose publisher is one of the two
# Elsevier spellings; index [0] takes the first element of the helper's result.
elsevier_subset = igem.generate_subset(
    non_oa_subset,
    'publisher',
    ['Elsevier', 'Elsevier BV'],
)[0]

# Write the subset out in `n_splits` parts under the given path prefix.
igem.split_dict(
    elsevier_subset,
    n_splits,
    'get_fullpapers/elsevier/elsevier_subset_split',
)

Count of publisher being ['Elsevier', 'Elsevier BV']: 23267.
Saved part 1.
Saved part 2.
Saved part 3.
Saved part 4.
Saved part 5.
Saved part 6.
All done!


## Run this in parallel!!

```python get_elsevier.py <num>```

Where `<num>` is the split number (1 through `n_splits`) produced by the cell above.
    
For convenience: just paste these in (in separate terminals!)

In [12]:
# Emit one launch command per split, each followed by an empty line so the
# commands are easy to copy into separate terminals.
for n in range(1, n_splits + 1):
    print(f"python get_elsevier.py {n}", end="\n\n")
    

python get_elsevier.py 1

python get_elsevier.py 2

python get_elsevier.py 3

python get_elsevier.py 4

python get_elsevier.py 5

python get_elsevier.py 6



# 2. Open Access

## Splitting Data

In [13]:
# Number of split files to write (one per parallel worker).
n_splits = 6 # fill this in!!

# Write the open-access subset out in `n_splits` parts under the given prefix.
igem.split_dict(
    oa_subset,
    n_splits,
    'get_fullpapers/oa/oa_subset_split',
)

Saved part 1.
Saved part 2.
Saved part 3.
Saved part 4.
Saved part 5.
Saved part 6.
All done!


In [14]:
# Load the first open-access split file for inspection
# (presumably written by the split cell above — confirm the naming scheme).
oa_split1 = igem.get_json('get_fullpapers/oa/oa_subset_split1.json')

In [15]:
# Number of entries in the first split.
len(oa_split1)

7808

### Convert PDF to Text

In [16]:
def get_bestlink(pmid: str, oa_subset: dict):
    """Return the best open-access PDF URL recorded for a PMID.

    Args:
        pmid: Key of the paper inside ``oa_subset``.
        oa_subset: Mapping of PMID to a metadata record; the URL is read from
            ``record['best_oa_location']['url_for_pdf']``.

    Returns:
        The stored ``url_for_pdf`` value, or ``False`` when the PMID or any of
        the nested fields is missing (or an intermediate value cannot be
        subscripted, e.g. ``best_oa_location`` is ``None``).
    """
    try:
        return oa_subset[pmid]['best_oa_location']['url_for_pdf']
    except (LookupError, TypeError):
        # Only "data is missing or malformed" is treated as "no link".
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return False

In [59]:
# Look up the stored PDF URL for one example PMID.
oa_split1['12702265']['best_oa_location']['url_for_pdf']

'https://academic.oup.com/femsyr/article-pdf/2/4/481/17936836/2-4-481.pdf'

In [72]:
# Try downloading one example PDF. An explicit timeout is passed because
# requests otherwise waits indefinitely, which would hang the kernel on a
# stalled server; a timeout surfaces as a RequestException and is caught below.
try:
    response = requests.get(
        'https://academic.oup.com/femsyr/article-pdf/2/4/481/17936836/2-4-481.pdf',
        timeout=30,
    )
except (requests.exceptions.RequestException,
        ConnectionResetError) as e:
    # Placeholder marker for a failed download; `response` is not assigned here.
    print("AAAAA")

AAAAA


In [79]:
# Scratch check of in-place progress output: "\r" moves back to the start of
# the line, so "aaa" overwrites the first three characters of "bbbbb".
print("bbbbb", end='')
print("\raaa")

aaabb


In [32]:
# Write the raw bytes of the downloaded response to a temporary PDF file.
with open('get_fullpapers/oa/tmp.pdf', 'wb') as fd:
    fd.write(response.content)

In [29]:
import subprocess
def pdf_to_text(filepath):
    """Extract the text of a PDF by running the ``pdf2txt.py`` command.

    Args:
        filepath: Path of the PDF file handed to ``pdf2txt.py``.

    Returns:
        The command's standard output decoded as UTF-8.

    Raises:
        OSError: If the command exits with a non-zero return code (the message
            carries the return code, stdout and stderr), or if the executable
            cannot be started.
    """
    print('Getting text content for {}...'.format(filepath))
    # Capture stderr on its own pipe. Redirecting it into stdout mixed any
    # diagnostics into the returned text and left the "Error" field of the
    # message below permanently None.
    completed = subprocess.run(
        ['pdf2txt.py', filepath],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    # Only a non-zero exit status is an error; output on stderr alone
    # (e.g. warnings) does not abort the extraction.
    if completed.returncode != 0:
        raise OSError('Executing the command for {} caused an error:\nCode: {}\nOutput: {}\nError: {}'.format(
            filepath,
            completed.returncode,
            completed.stdout,
            completed.stderr.decode('utf-8', errors='replace'),
        ))
    print('Done.')
    return completed.stdout.decode('utf-8')

# Convert the temporary PDF to text.
# NOTE(review): the download cell writes to 'get_fullpapers/oa/tmp.pdf', but
# this reads 'tmp.pdf' from the current directory — confirm the intended path.
fulltext_str = pdf_to_text('tmp.pdf')

Getting text content for tmp.pdf...
Done.


In [31]:
# Show the extracted text with newlines flattened to spaces (display only;
# `fulltext_str` itself is not modified).
fulltext_str.replace('\n', ' ')

'A u  t  h o r    M a n u s c r i p  t  A u  t  h o r    M a n u s c r i p  t  A u  t  h o r    M a n u s c r i p  t  A u t h o r    M a n u s c r i p  t  HHS Public Access Author manuscript J Ind Microbiol Biotechnol. Author manuscript; available in PMC 2020 April 01.  Published in final edited form as: J Ind Microbiol Biotechnol. 2019 October ; 46(9-10): 1365–1370. doi:10.1007/s10295-019-02199-x.  Production of semi-biosynthetic nepetalactone in yeast  John M. Billingsley†, Jose L. Anguiano†, Yi Tang†,‡,* †Department of Chemical and Biomolecular Engineering, University of California, Los Angeles,  CA 90095, United States  ‡Department of Chemistry and Biochemistry, University of California, Los Angeles, CA 90095,  United States  Abstract  Microbial-based production of natural products provides a promising alternative to synthetic  production and isolation from the native producer. The recently discovered NEPS1 cyclase/oxidase  completes the biosynthetic pathway to nepetalactone, a bio

In [33]:
# Delete the temporary PDF.
# NOTE(review): the download cell writes to 'get_fullpapers/oa/tmp.pdf', but
# this removes 'tmp.pdf' from the current directory — confirm the intended path.
os.remove('tmp.pdf')

In [46]:
import PyPDF2
import io

# Alternative extraction that stays in memory: wrap the downloaded bytes in a
# file-like buffer and concatenate the text of every page.
# NOTE(review): `PdfFileReader` / `extractText` are the legacy PyPDF2 names
# (newer releases expose `PdfReader` / `extract_text`) — confirm against the
# installed PyPDF2 version.
outtext = ''
with io.BytesIO(response.content) as f:
    reader = PyPDF2.PdfFileReader(f)
    for page in reader.pages:
        outtext += page.extractText()

### Command for bash
Make sure the working directory is `/brenda_data_collection/`, then paste the following command:

In [80]:
# Build a single shell line that launches splits 1 through 6 as background
# jobs; every command, including the last, is followed by one space.
print(
    *(f"python fetch_oa.py {i} &" for i in range(1, 7)),
    sep=' ',
    end=' ',
)

python fetch_oa.py 1 & python fetch_oa.py 2 & python fetch_oa.py 3 & python fetch_oa.py 4 & python fetch_oa.py 5 & python fetch_oa.py 6 & 

***

## [1.] DO NOT RUN: Elsevier API and Loop
Code below is the rough basis for elsevier.py

Setting up the API

In [None]:
# Load configuration; the context manager closes the file even if parsing fails
with open("elsevier_api/config.json") as con_file:
    config = json.load(con_file)

# Initialize client with the API key from the config file
client = ElsClient(config['apikey'])
client.local_dir = "./get_fullpapers_pmidset/"

In [None]:
def get_paper(doi: str):
    """Fetch the ScienceDirect full-text document for a DOI.

    Builds a ``FullDoc`` for ``doi`` and reads it through the module-level
    ``client``.

    Args:
        doi: DOI of the paper to fetch.

    Returns:
        ``data`` of the document when the read succeeds, otherwise ``False``.
    """
    document = FullDoc(doi=doi)
    fetched = document.read(client)
    return document.data if fetched else False

Main Loop (taken and modified from elsevier_api).

**NOTE: MUST ENABLE BERKELEY VPN FOR THIS TO WORK!**

In [None]:
# Smoke test: fetch one known DOI to check that the client works
# (the trailing `;` keeps the notebook from echoing the returned value).
get_paper('10.1016/0031-9422(73)80493-5');

In [None]:
def loop_elsevier(elsevier_subset, outfile, max_calls=-1):
    """Fetch full texts for a subset of papers and cache them in a JSON file.

    Previously fetched papers are loaded from ``outfile`` (if it exists) and
    skipped; newly fetched papers are added under their PMID. The cache is
    written back to ``outfile`` when the loop ends, including when it is
    interrupted with Ctrl-C or aborted by another exception, so progress is
    not lost.

    Args:
        elsevier_subset: Mapping of PMID to a metadata record that contains a
            ``'doi'`` entry.
        outfile: Path of the JSON file mapping PMID to fetched full text.
        max_calls: Maximum number of entries to process in this run;
            ``-1`` (the default) means no limit.

    Returns:
        None. Results are written to ``outfile`` and statistics are printed.
    """
    json_file = outfile
    if os.path.isfile(json_file):
        with open(json_file, 'r') as fp:
            elsevier_fulltexts = json.load(fp)
    else:
        elsevier_fulltexts = {}

    # Stats
    calls = 0
    fails = 0
    not_elsevier = 0  # never incremented here; kept so the report keeps its shape
    successes_or_found = 0
    queries = 0

    try:
        # Looping through quantify_dataset output json.
        for pmid, metadata in elsevier_subset.items():
            doi = metadata['doi']
            doi_url = f"https://doi.org/{doi}"

            # Don't go over max_calls (-1 never matches, i.e. unlimited).
            if calls == max_calls:
                print("Query limit reached.")
                break
            calls += 1

            # Skip papers that were successfully fetched in an earlier run.
            if pmid in elsevier_fulltexts:
                successes_or_found += 1
                print(f"## Call {calls} found.")
                continue

            fullpaper = get_paper(doi)
            if fullpaper:
                elsevier_fulltexts[pmid] = fullpaper
                successes_or_found += 1
                queries += 1
                print(f"Call {calls} success.")
            else:
                fails += 1
                print (f"##### Call {calls} failed: {fails}. DOI: {doi_url}.")

    except KeyboardInterrupt:
        # A manual interrupt ends the loop early but still saves and reports.
        pass
    finally:
        # Save whatever has been collected, even if an unexpected error is
        # about to propagate, so fetched papers are never thrown away.
        with open(json_file, 'w') as fp:
            json.dump(elsevier_fulltexts, fp)

    # Print Stats
    attempted = calls - not_elsevier
    print("")
    print("###### STATS ######")
    print(f"Total calls: {calls}")
    print(f"Total number of queries: {queries}")
    print(f"Total number of Elsevier papers: {attempted}")
    print(f"Number of Non-Elsevier papers skipped: {not_elsevier}")
    print(f"Number of fetch failures: {fails}")
    print(f"Papers in storage: {len(elsevier_fulltexts)}")
    if attempted:
        print(f"% of success: {successes_or_found / attempted * 100}%")
    else:
        # Nothing was processed (empty subset or max_calls == 0): avoid
        # dividing by zero.
        print("% of success: n/a (no papers processed)")