In [None]:
import pyterrier as pt
import pandas as pd
# Initialise PyTerrier only when pt.java.started() reports it is not running yet,
# so re-executing this cell does not call pt.init() a second time.
# NOTE(review): recent PyTerrier releases appear to deprecate pt.init() in favour
# of pt.java.init() — confirm against the installed version before changing.
if not pt.java.started():
   pt.init()
import time
import json
import requests
import os
import pickle

from tqdm import tqdm

from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
def build_api_url_call(doi, email="unpaywall_01@example.com"):
    """Build the Unpaywall v2 lookup URL for a single DOI.

    Args:
        doi: DOI to look up. It is inserted into the URL path, so characters
            that would otherwise change the URL structure (``#``, ``?``,
            ``<``, ``>``, spaces, ``%`` ...) are percent-encoded.
        email: Contact address sent as the ``email`` query parameter.

    Returns:
        The request URL as a string. For DOIs made only of path-safe
        characters the result is identical to plain string interpolation.
    """
    from urllib.parse import quote

    # Keep "/" and the RFC 3986 path-safe punctuation readable; everything
    # else (notably "#" and "?", which would cut the path short) is escaped.
    doi_path = quote(str(doi), safe="/:@!$&'()*+,;=")
    # In a query string "+" and "&" are significant, so only "@" is left as-is.
    email_param = quote(str(email), safe="@")
    return f"https://api.unpaywall.org/v2/{doi_path}?email={email_param}"

def extract_pdf_url(res):
    """Return the first non-empty ``url_for_pdf`` in an API response, else ``None``.

    Args:
        res: Response-like object exposing ``json()``; the decoded body is
            expected to be a dict.

    Returns:
        The first non-empty ``url_for_pdf`` value found, or ``None`` when the
        body is not a dict or no such value exists.

    Search order:
        1. Top-level fields whose value is a dict, in response order.
        2. Top-level fields whose value is a list of dicts, in response order.

    A dict whose ``url_for_pdf`` is null/empty no longer ends the search, so a
    later location that does carry a PDF link is still found.
    """
    payload = res.json()
    if not isinstance(payload, dict):
        # Unexpected body shape (e.g. a JSON list): nothing to extract.
        return None

    def _pdf_url_of(candidate):
        """Return candidate's non-empty ``url_for_pdf``, or None."""
        if isinstance(candidate, dict):
            return candidate.get("url_for_pdf") or None
        return None

    # Pass 1: dict-valued fields first, so any response that previously
    # produced a URL still produces the same one.
    for value in payload.values():
        found = _pdf_url_of(value)
        if found:
            return found

    # Pass 2: fall back to dicts nested inside list-valued fields.
    for value in payload.values():
        if isinstance(value, list):
            for item in value:
                found = _pdf_url_of(item)
                if found:
                    return found

    return None

def fetch_pdf_url(missing_doi):
    """Look up the PDF URL for one DOI; never raises on ordinary failures.

    Args:
        missing_doi: DOI to resolve.

    Returns:
        A ``(missing_doi, pdf_url)`` tuple. ``pdf_url`` is ``None`` when the
        DOI is empty, the lookup finds no PDF link, the request times out, or
        any other error occurs (errors are printed, not raised).
    """
    # An empty DOI cannot be looked up; skip the sleep and the request.
    if not missing_doi:
        return missing_doi, None

    try:
        # Throttle: every call pauses for a second before hitting the API,
        # which bounds the request rate per worker thread.
        time.sleep(1)

        # timeout=15 bounds how long requests waits on the server.
        res = requests.get(build_api_url_call(missing_doi), timeout=15)

        # A 404 is treated as "no record for this DOI" rather than an error
        # (presumably how the API reports unknown DOIs — confirm).
        if res.status_code == 404:
            return missing_doi, None
        # Any other 4xx/5xx (e.g. rate limiting) must not be mistaken for
        # "no PDF available": surface it through the generic handler below.
        res.raise_for_status()

        return missing_doi, extract_pdf_url(res) or None
    except requests.exceptions.Timeout:
        print(f"HTTP request timed out for DOI: {missing_doi}")
        return missing_doi, None
    except Exception as e:
        print(f"Error processing DOI {missing_doi}: {e}")
        return missing_doi, None

In [3]:
# Obtain the dataset registered under the identifier 'irds:cord19' from PyTerrier;
# later cells iterate its corpus via dataset.get_corpus_iter().
dataset = pt.get_dataset('irds:cord19')

In [None]:
# Pull the 'doi' field out of every corpus record; duplicates are kept here
# and only collapsed for the distinct count displayed below.
all_dois = list(
    record['doi']
    for record in dataset.get_corpus_iter()
)
len(set(all_dois))

In [None]:
# Load the DOI -> PDF-URL mapping saved by a previous run.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle, which the bare open() never did.
with open("/workspace/next_pdf_urls.pkl", "rb") as cache_file:
    urls = pickle.load(cache_file)
len(urls)

In [None]:
# Distinct corpus DOIs that have no entry in the cached mapping yet.
missing_dois = {doi for doi in all_dois if doi not in urls}
len(missing_dois)

In [None]:
max_workers = 30

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit one lookup per missing DOI and remember which DOI each future belongs to.
    future_to_id = {
        executor.submit(fetch_pdf_url, missing_doi): missing_doi
        for missing_doi in missing_dois
    }

    # as_completed only yields futures that have already finished, so
    # result() returns immediately and can never time out here. The request
    # time limit is enforced inside fetch_pdf_url via requests' own timeout.
    for future in tqdm(as_completed(future_to_id), total=len(future_to_id), desc="Processing DOIs"):
        missing_doi = future_to_id[future]
        try:
            missing_doi, pdf_url = future.result()
        except Exception as e:
            # Safety net: fetch_pdf_url handles its own errors, but keep
            # one bad task from aborting the whole run.
            print(f"Error processing DOI {missing_doi}: {e}")
        else:
            # Only successful lookups are recorded; `urls` is updated in place.
            if pdf_url:
                urls[missing_doi] = pdf_url

In [8]:
# Persist the updated DOI -> PDF-URL mapping. The context manager guarantees
# the file is flushed and closed once the dump finishes.
with open("/workspace/next_pdf_urls.pkl", "wb") as cache_file:
    pickle.dump(urls, cache_file)

In [10]:
# NOTE(review): hard-coded counts, presumably (DOIs with a PDF URL) / (distinct
# corpus DOIs) copied from earlier cell outputs — confirm, and consider
# deriving them from the live variables instead.
87_414 / 135_942

0.6430242309220109