In [2]:
import os
import re
from dotenv import load_dotenv
from pyzotero import zotero
from slugify import slugify

import pyalex
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders


In [3]:
import requests_cache
# create a local SQLite cache named “openalex_cache.sqlite”
# expire_after can be seconds (e.g. 86400 = 1 day) or None for never
requests_cache.install_cache(
    "openalex_cache",
    backend="sqlite",
    expire_after=None,  # or e.g. 86400 for 1 day
)

In [4]:
def sanitize_filename(name):
    # remove characters not safe for filenames
    name = re.sub(r'[\\/*?:"<>|]', '', name)
    # collapse whitespace
    name = re.sub(r'\s+', '_', name).strip('_')
    return name


In [5]:
# 1. Load .env into os.environ
load_dotenv()  

# 2. Grab credentials
USR_ID   = os.environ["ZOTERO_USER_ID"]
LIB_TYPE = os.environ.get("ZOTERO_LIBRARY_TYPE", "user")  # default to 'user'
API_KEY  = os.environ.get("ZOTERO_API_KEY")               # can be None if you only need local access

# 3. Instantiate the client
#    - For Web API access: local=False (default)
#    - For read-only local access via Zotero's built-in HTTP server: local=True
zot = zotero.Zotero(USR_ID, LIB_TYPE, API_KEY, local=True)


In [6]:
# Open Alex API to enrich metadata
OPEN_ALEX_EMAIL = os.environ.get("OPEN_ALEX_EMAIL")


pyalex.config.email = OPEN_ALEX_EMAIL


In [7]:
target_collection_name = "Kolkowitz Optical Clocks"
col = next((c for c in zot.collections() if c["data"]["name"] == target_collection_name), None)

if not col:
    raise ValueError(f"Collection {target_collection_name!r} not found.")
    
collection_key = col["data"]["key"]
collection_name = col["data"]["name"]
print(f"Found key for {collection_name} collection:", collection_key)

Found key for Kolkowitz Optical Clocks collection: ZA6MDTWU


In [8]:

PDF_DIR = f"../rag_research_project_vault/Literature/{slugify(collection_name)}_pdfs"
NOTES_DIR = f"../rag_research_project_vault/Literature/{slugify(collection_name)}_notes"

os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(NOTES_DIR, exist_ok=True)


In [None]:

# zot.collection_items() yields one “page” at a time
pages = zot.collection_items(collection_key)

# zot.everything() will auto-page through
all_items = zot.everything(pages)

for item in all_items:

    item_key = item['key']

    data = item.get("data", {})
    print(f"Entry for '{data.get('title', '<no title>')}'  —  {item_key}")

    print('\tDOI: ', data.get('DOI', '<no DOI>'))
    print('\turl:', data.get('url', '<no url>'))


    # We will now enrich some of our data via OpenAlex to obtain the correct title, first aiuthor, and publication year and use this to rename the file

    open_alex_entry = Works()[f"doi:{data.get('DOI')}"]

    display_name = open_alex_entry.get('display_name', '<no display_name>')
    publication_year = open_alex_entry.get('publication_year', '<no publication_year>')

    authors_entry = open_alex_entry.get('authorships')

    print(publication_year)

    first_author_entry = next((author.get('author') for author in authors_entry if author.get('author_position') == 'first'), None)

    first_author = first_author_entry.get('display_name')

    # -- now handle the children
    children = zot.children(item_key)
    for child in children:
        cd = child["data"]
        typ = cd.get("itemType")
        
        if typ == "attachment" and cd.get("contentType") == "application/pdf":
            # -- it’s a PDF attachment: download it
            # filename = cd.get("filename") or f"{cd['key']}.pdf"
            filename = sanitize_filename(slugify(f"{first_author}_{publication_year}_{display_name}")) + ".pdf"
            out = zot.dump(
                cd["key"],
                filename=filename,
                path=PDF_DIR,
            )
            print(f"\tDownloaded PDF -> {out}")
        
        elif typ == "note":
            # -- it’s a note: save the HTML
            note_html = cd.get("note", "")

            note_file = NOTES_DIR+ '/' +sanitize_filename(slugify(f"{first_author}_{publication_year}_{display_name}")) + ".md" #os.path.join("notes", f"{cd['key']}.html")
            with open(note_file, "w", encoding="utf-8") as f:
                f.write(note_html)
            print(f"\tSaved note -> {note_file}")
        
        else:
            # other child types (e.g. snapshots, links…) you can ignore or handle separately
            continue

    print('' + '-' * 100)

Entry for 'A lab-based test of the gravitational redshift with a miniature clock network | Nature Communications'  —  HTCQUJJI
	DOI:  10.1038/s41467-023-40629-8
	url: https://www.nature.com/articles/s41467-023-40629-8
2023
	Downloaded PDF -> None
----------------------------------------------------------------------------------------------------
Entry for 'Differential clock comparisons with a multiplexed optical lattice clock'  —  978UPPSC
	DOI:  10.1038/s41586-021-04344-y
	url: https://www.nature.com/articles/s41586-021-04344-y
2022
	Saved note -> ../rag_research_project_vault/Literature/kolkowitz-optical-clocks_notes/xin-zheng-2022-differential-clock-comparisons-with-a-multiplexed-optical-lattice-clock.md
	Downloaded PDF -> None
----------------------------------------------------------------------------------------------------
Entry for 'Optical atomic clocks'  —  QMJHC8B8
	DOI:  10.1103/RevModPhys.87.637
	url: https://link.aps.org/doi/10.1103/RevModPhys.87.637
2015
	Downloaded PDF

In [10]:
Works()["doi:10.1038/s41467-023-40629-8"]

{'id': 'https://openalex.org/W4385784013',
 'doi': 'https://doi.org/10.1038/s41467-023-40629-8',
 'title': 'A lab-based test of the gravitational redshift with a miniature clock network',
 'display_name': 'A lab-based test of the gravitational redshift with a miniature clock network',
 'publication_year': 2023,
 'publication_date': '2023-08-12',
 'ids': {'openalex': 'https://openalex.org/W4385784013',
  'doi': 'https://doi.org/10.1038/s41467-023-40629-8',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/37573452'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://doi.org/10.1038/s41467-023-40629-8',
  'pdf_url': 'https://www.nature.com/articles/s41467-023-40629-8.pdf',
  'source': {'id': 'https://openalex.org/S64187185',
   'display_name': 'Nature Communications',
   'issn_l': '2041-1723',
   'issn': ['2041-1723'],
   'is_oa': True,
   'is_in_doaj': True,
   'is_indexed_in_scopus': True,
   'is_core': True,
   'host_organization': 'https://openalex.org/P