In [1]:
import subprocess
import pandas as pd
import numpy as np
import re
import os
from pypdf import PdfReader

#!/usr/bin/env python3
import dotenv
dotenv.load_dotenv()

import argparse
import os
from requests import Session
from typing import Generator, Union

import urllib3
urllib3.disable_warnings()


In [5]:
S2_API_KEY = 'XBbGPcpPXB5CH0aLSu6BO5fww06zAusz6dFGXqj0'

def get_paper(session: Session, paper_id: str, fields: str = 'paperId,title', **kwargs) -> dict:
    params = {
        'fields': fields,
        **kwargs,
    }
    headers = {
        'x-api-key': S2_API_KEY,
    }

    with session.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}', params=params, headers=headers) as response:
        response.raise_for_status()
        return response.json()


def download_pdf(session: Session, url: str, path: str, user_agent: str = 'requests/2.0.0'):
    # send a user-agent to avoid server error
    headers = {
        'user-agent': user_agent,
    }

    # stream the response to avoid downloading the entire file into memory
    with session.get(url, headers=headers, stream=True, verify=False) as response:
        # check if the request was successful
        response.raise_for_status()

        if response.headers['content-type'] != 'application/pdf':
            raise Exception('The response is not a pdf')

        with open(path, 'wb') as f:
            # write the response to the file, chunk_size bytes at a time
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)


def download_paper(session: Session, paper_id: str, directory: str = 'papers', user_agent: str = 'requests/2.0.0') -> Union[str, None]:
    paper = get_paper(session, paper_id, fields='paperId,isOpenAccess,openAccessPdf')

    # check if the paper is open access
    if not paper['isOpenAccess']:
        return None

    paperId: str = paper['paperId']
    pdf_url: str = paper['openAccessPdf']['url']
    pdf_path = os.path.join(directory, f'{paperId}.pdf')

    # create the directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    # check if the pdf has already been downloaded
    if not os.path.exists(pdf_path):
        download_pdf(session, pdf_url, pdf_path, user_agent=user_agent)

    return pdf_path


def download_papers(paper_ids: list[str], directory: str = 'papers', user_agent: str = 'requests/2.0.0') -> Generator[tuple[str, Union[str, None, Exception]], None, None]:
    # use a session to reuse the same TCP connection
    with Session() as session:
        for paper_id in paper_ids:
            try:
                yield paper_id, download_paper(session, paper_id, directory=directory, user_agent=user_agent)
            except Exception as e:
                yield paper_id, e


def semanticscholardownload(paper_ids,download_paper_path):
    for paper_id, result in download_papers(paper_ids, directory=download_paper_path, user_agent='requests/2.0.0'):
        if isinstance(result, Exception):
            return (f"Failed to download '{paper_id}': {type(result).__name__}: {result}")
        elif result is None:
            return (f"'{paper_id}' is not open access")
        else:
            return (f"Downloaded '{paper_id}' to '{result}'")



In [6]:
paper_list_csv_path = '/Users/vivianhuang/Desktop/CHG/coastal%20surge_texas_south_model.csv'
paper_list_df = pd.read_csv(paper_list_csv_path)

new_paper_list = paper_list_df.copy()
new_paper_list_len = len(new_paper_list)

In [7]:
download_paper_dir = '/Users/vivianhuang/Desktop/CHG/papers/'
search_in_paper = ['bathymetry gradient','bottom stress','atmospheric pressure gradient','tides','radiation stress']
for this_keyword in search_in_paper:
    new_paper_list[this_keyword]=np.nan

new_paper_list['pdf open access']=np.nan

In [8]:
for i in range(new_paper_list_len):
    print(i)
    output = semanticscholardownload([new_paper_list['paperId'][i]],download_paper_dir)
    #'python simple.py -d papers '+new_paper_list['paperId'][i]#'python simple.py -d papers 649def34f8be52c8b66281af98ae884c09aef38b'
    
    if ' is not open access' in output:
        print('This pdf is not open access.')
        new_paper_list['pdf open access'][i] = False
        continue

    if 'Failed to download' in output:
        print('Failed to download')
        new_paper_list['pdf open access'][i] = False
        continue

    new_paper_list['pdf open access'][i] = True
    print(output)
    # open the pdf file
    download_paper_path = download_paper_dir +new_paper_list['paperId'][i]+'.pdf'
    

    reader = PdfReader(download_paper_path)
    text = ""
    for page in reader.pages:
        text = page.extract_text()   
        res_search = re.search('references\n', text, re.IGNORECASE)
        if  type(res_search).__name__ != 'NoneType':
            text=re.split('references\n',page.extract_text(),flags=re.IGNORECASE)[0]

        for this_keyword in search_in_paper:
            res_search = re.search(this_keyword, text, re.IGNORECASE)
            if type(res_search).__name__ != 'NoneType':
                new_paper_list[this_keyword][i]=True 

        
    
    os.remove(download_paper_path)

0
This pdf is not open access.
1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_paper_list['pdf open access'][i] = False


This pdf is not open access.
2
This pdf is not open access.
3
Failed to download
4
This pdf is not open access.
5
Downloaded 'b4877795b435e69452af8b35fa3bf1086585e7db' to '/Users/vivianhuang/Desktop/CHG/papers/b4877795b435e69452af8b35fa3bf1086585e7db.pdf'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_paper_list[this_keyword][i]=True


6
This pdf is not open access.
7
Downloaded '13b5c916ccbd7b5bf0d061f1b90b9f88e773bd1f' to '/Users/vivianhuang/Desktop/CHG/papers/13b5c916ccbd7b5bf0d061f1b90b9f88e773bd1f.pdf'
8
Downloaded '2ca75c2aa3a7a77f35126103f9b9875abb29ebf9' to '/Users/vivianhuang/Desktop/CHG/papers/2ca75c2aa3a7a77f35126103f9b9875abb29ebf9.pdf'
9
This pdf is not open access.
10
This pdf is not open access.
11
This pdf is not open access.
12
This pdf is not open access.
13
Downloaded '3a050df72cee3db45c0da0079cbe0124a295d08c' to '/Users/vivianhuang/Desktop/CHG/papers/3a050df72cee3db45c0da0079cbe0124a295d08c.pdf'
14
This pdf is not open access.
15
Failed to download
16
Downloaded '64e5521f09fcc6107b0e099397738f92be8c81cd' to '/Users/vivianhuang/Desktop/CHG/papers/64e5521f09fcc6107b0e099397738f92be8c81cd.pdf'
17
This pdf is not open access.
18
This pdf is not open access.
19
This pdf is not open access.
20
This pdf is not open access.
21
Downloaded '3f98dff78c97ddff10996c478f4ab69d8ea16b8e' to '/Users/vivianhuang/De

In [None]:
new_paper_list.to_csv(download_paper_path[:-4]+'.csv', sep=',', index=False,header=True)