In [None]:
import os

import pandas as pd
import requests

In [None]:
# API keys
# Each key is read from an environment variable so that secrets are never saved in
# the notebook; when the variable is unset the value falls back to "" (the previous
# placeholder).
api_springer = os.environ.get("SPRINGER_API_KEY", "")  # full text for open-access articles in XML format (api.springernature.com/openaccess/jats/doi/[DOI]?api_key=[API])
api_elsevier = os.environ.get("ELSEVIER_API_KEY", "")  # full text for any article I have access to in XML format (https://api.elsevier.com/content/article/doi/[DOI]?APIKey=[API]&view=FULL)
api_wiley = os.environ.get("WILEY_TDM_TOKEN", "")  # full text, but PDF download only.
api_gpt = os.environ.get("OPENAI_API_KEY", "")
# BioC API: offers PMC OA and Author Manuscript Collection in XML format via PubMed ID or PMC ID (https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_[format]/[ID]/[encoding])
api_semanticscholar = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "")

In [None]:
# ORCID iD (in full URL form) of the author whose works are fetched with get_publications.
orcid_balskus = "https://orcid.org/0000-0001-5985-5714"

In [None]:
# In the future, I can filter using multiple ORCIDs by using the '|' operator. (Up to 50 values)

def build_author_works_url(orcid, mailto='kl4898@stern.nyu.edu'):
    """Build the OpenAlex `works` query URL for one author's articles.

    Parameters
    ----------
    orcid : str
        Author ORCID; inserted verbatim into the `author.orcid` filter.
    mailto : str, optional
        Contact e-mail sent as the `mailto` query parameter. Defaults to the
        address that used to be hard-coded, so existing calls are unaffected.

    Returns
    -------
    str
        The URL without any paging parameter; callers append `&cursor=...`.
    """
    # specify endpoint
    endpoint = 'works'

    # build the 'filter' parameter (comma-separated filters are combined by the API)
    filters = (
        f'author.orcid:{orcid}',
        'type:article',  # excludes book-chapter, dissertation, book, dataset, paratext, other, reference-entry, report, peer-review, standard, editorial, erratum, grant, letter
        'is_paratext:false',  # excludes paratext
    )
    filter_param = ','.join(filters)

    # put the URL together
    return f'https://api.openalex.org/{endpoint}?filter={filter_param}&mailto={mailto}'


In [None]:
def get_publications(orcid):
    """Download every work returned for `orcid`, following cursor paging.

    Requests the URL from build_author_works_url(orcid) with `&cursor=...`
    appended, starting at cursor '*', and keeps requesting until the response's
    meta.next_cursor is empty.

    Parameters
    ----------
    orcid : str
        Author ORCID, passed straight to build_author_works_url.

    Returns
    -------
    pandas.DataFrame
        One row per entry of each page's 'results' list, in page order. Pages
        are concatenated without re-indexing, so row labels restart on every
        page exactly as before.

    Raises
    ------
    requests.HTTPError
        If a page request comes back with an error status.
    requests.Timeout
        If a page request does not complete within 30 seconds.
    """
    base_url = build_author_works_url(orcid)

    cursor = '*'
    pages = []

    while cursor:
        # Plain concatenation instead of str.format so braces in the URL cannot break it
        response = requests.get(f'{base_url}&cursor={cursor}', timeout=30)
        # Fail loudly on 4xx/5xx instead of hitting a KeyError on 'results' below
        response.raise_for_status()
        payload = response.json()
        pages.append(pd.DataFrame(payload['results']))
        cursor = payload['meta']['next_cursor']

    # One concat at the end avoids re-copying the accumulated frame for every page
    return pd.concat(pages)

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Fetch all works for the ORCID above (network call; get_publications requests one page after another).
balskus = get_publications(orcid_balskus)

In [None]:
# Move the current row labels into a regular column and renumber the rows from 0.
# NOTE(review): this mutates `balskus` in place, so the cell is not idempotent —
# re-running it without re-running the fetch cell adds another index column each time.
balskus.reset_index(inplace=True)

In [None]:
def get_primary_location_name(item):
    """Return item['source']['display_name'], or None when it is not available.

    None is returned when a key is missing (KeyError) or when `item` or
    item['source'] is not subscriptable, e.g. None or NaN (TypeError). Any
    other exception now propagates instead of being silently swallowed.
    """
    try:
        return item['source']['display_name']
    except (KeyError, TypeError):
        return None

In [None]:
def get_primary_location_publisher(item):
    """Return the last entry of item['source']['host_organization_lineage_names'], or None.

    None is returned when a key is missing (KeyError), when `item`, its source
    or the lineage list is not subscriptable, e.g. None or NaN (TypeError), or
    when the lineage list is empty (IndexError). Any other exception now
    propagates instead of being silently swallowed.
    """
    try:
        return item['source']['host_organization_lineage_names'][-1]
    except (KeyError, TypeError, IndexError):
        return None

In [None]:
# Derive two flat columns from each row's `primary_location` value:
# the source's display name and the last name in its host-organization lineage.
# Both helpers return None when the nested value is missing.
balskus['journal'] = balskus.primary_location.apply(get_primary_location_name)
balskus['publisher'] = balskus.primary_location.apply(get_primary_location_publisher)

In [None]:
# Display the works table, now including the derived `journal` and `publisher` columns.
balskus

In [None]:
# List the column names available in the works table.
balskus.columns

In [None]:
# Show only the rows whose derived `publisher` value is exactly 'Wiley'.
balskus[balskus['publisher']=='Wiley']

In [None]:
from openai import OpenAI

In [None]:
# OpenAI client authenticated with the key from the API-keys cell (`api_gpt`).
client = OpenAI(api_key=api_gpt)

In [None]:
# Smoke test of the chat completions endpoint in JSON mode.
# With response_format={"type": "json_object"} the API rejects the request unless
# the word "JSON" appears somewhere in the messages, so the system prompt now
# explicitly asks for a JSON object.
completion = client.chat.completions.create(
  model="gpt-4-1106-preview",
  response_format={"type":"json_object"},
  messages=[
    {"role": "system", "content": "You are supposed to pretend to not understand anything the user says. Respond with a JSON object."},
    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
  ]
)

# Print the text of the first returned choice
print(completion.choices[0].message.content)

Springer API Test

In [None]:
# http://api.springernature.com/metadata/pam/doi/10.1007/s11276-008-0131-4?api_key=yourKeyHere

# http://api.springernature.com/openaccess/jats/doi/10.1038/s41586-019-0894-z?api_key=yourKeyHere

In [None]:
# Request the open-access JATS XML for DOI 10.1038/s41586-022-04444-3 from the
# Springer Nature API. The key is taken from `api_springer` (API-keys cell) and sent
# as the `api_key` query parameter, so no secret is stored in the notebook.
url = "http://api.springernature.com/openaccess/jats/doi/10.1038/s41586-022-04444-3"

response = requests.get(url, params={"api_key": api_springer})

In [None]:
import xml.etree.ElementTree as ET
from lxml import etree

In [None]:
def get_depth(element):
    """Count how many ancestors `element` has.

    Follows getparent() upwards until it returns None; an element without a
    parent therefore has depth 0.
    """
    ancestors = 0
    node = element.getparent()
    while node is not None:
        ancestors += 1
        node = node.getparent()
    return ancestors

In [None]:
# Parse the raw bytes of the most recent `response` with lxml; `root` is the resulting root element.
root = etree.fromstring(response.content)

In [None]:
# Collect every element tagged 'sec' found anywhere under `root` into a list.
secs = list(root.iter('sec'))

In [None]:
# For each <sec> element: print its 1-based position, depth, own leading text and
# attributes, then a flattened dump of the text of everything nested inside it.
for position, sec in enumerate(secs, start=1):
    depth = get_depth(sec)
    print(f"Section {position}: Depth = {depth}, Text = {sec.text}, Attributes = {sec.attrib}")

    # Start with the section's own leading text, then add, for every descendant,
    # its text and its tail followed by a single space.
    pieces = [sec.text or '']
    for child in sec.iterdescendants():
        pieces.append((child.text or '') + (child.tail or '') + ' ')
    concatenated_text = ''.join(pieces)

    print(f"Concatenated Text up to next title of same depth: {concatenated_text}")

In [None]:
# Distinct values of the derived `publisher` column.
balskus.publisher.unique()

In [None]:
# Distinct values of the derived `journal` column.
balskus.journal.unique()

In [None]:
# NOTE(review): `titles` is not defined anywhere in the visible notebook, so this cell
# raises NameError on Restart & Run All. Presumably it was something like
# list(root.iter('title')) from a since-deleted cell — confirm and restore it.
# Index 9 is specific to the document that was parsed at the time.
discussion = titles[9]

In [None]:
# NOTE(review): relies on `titles`, which is not defined anywhere in the visible notebook
# (NameError on a fresh kernel) — confirm where it came from. Index 10 is specific to
# the document that was parsed at the time.
methods = titles[10]

In [None]:
# Element that directly follows `methods` under the same parent (None if there is none).
sibling = methods.getnext()

In [None]:
# Number of ancestors above the `methods` element.
depth = get_depth(methods)

In [None]:
# Walk the elements that follow `discussion` under the same parent, printing each
# one's repr, tag, depth and leading text.
sibling = discussion.getnext()
# Check before use: getnext() returns None when there is no following sibling, and
# the previous `while True` form dereferenced that None on the first pass.
while sibling is not None:
    print(sibling, sibling.tag, get_depth(sibling), sibling.text)
    sibling = sibling.getnext()


In [None]:
# Leading text of the element right after `methods`.
# NOTE(review): raises AttributeError if `methods` has no following sibling (getnext() returns None).
methods.getnext().text

In [None]:
import fitz

In [None]:
# Dump the text of every page of the PDF into one UTF-8 encoded file,
# writing a form-feed byte after each page.
doc = fitz.open("nchembio.1890.pdf")
# `with` guarantees the output file is flushed and closed even if extraction
# fails part-way through; `doc` is left open, as before.
with open("nchembio.1890.txt", "wb") as out:
    for page in doc:
        text = page.get_text().encode("utf8")
        out.write(text)
        out.write(bytes((12,)))  # form feed character


In [None]:
# Set up the URL and the headers
url = 'https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1111/1467-923X.12168'
headers = {'Wiley-TDM-Client-Token': api_wiley}

# Send the GET request
response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    # Write the response headers to a file
    with open('12168-headers.txt', 'w') as file:
        for key, value in response.headers.items():
            file.write(f"{key}: {value}\n")
    
    # Write the response content to a PDF file
    with open('12168.pdf', 'wb') as file:
        file.write(response.content)
else:
    print(f"Failed to retrieve the data: {response.status_code}")


In [None]:
# Set up the URL and the headers
url = 'https://api.semanticscholar.org/datasets/v1/release/2024-01-02/'
headers = {'x-api-key': api_semanticscholar}

# Send the GET request
response = requests.get(url, headers=headers)

In [None]:
# Decode the body of the most recent `response` as JSON and display it.
response.json()