In [1]:
import pyterrier as pt
import pandas as pd

import pickle
import requests
import os

from tqdm import tqdm

from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
def doi_2_path(doi, root_path="/workspace/pdfs"):
    """Build the PDF file path for a DOI under ``root_path``.

    Every "/" in the DOI is replaced with "$" so the DOI can be used as a
    single file name, then ".pdf" is appended.

    Args:
        doi: DOI string, e.g. "10.1000/xyz123".
        root_path: Directory the PDF lives in (no trailing slash expected).

    Returns:
        The string ``"<root_path>/<doi with '/' -> '$'>.pdf"``.
    """
    file_name = doi.replace("/", "$") + ".pdf"
    return "/".join([root_path, file_name])

def download_mising_pdf(doi, url, path_to_save = "/workspace/pdfs", timeout=60):
    """Download the PDF for ``doi`` from ``url`` unless it is already on disk.

    The target path is derived with ``doi_2_path(doi, path_to_save)``. If a
    file already exists there, nothing is downloaded.

    Args:
        doi: DOI used to derive the target file name.
        url: URL to fetch the PDF from.
        path_to_save: Directory in which the PDF is stored.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block a worker thread forever.

    Returns:
        True if the file already existed or was downloaded successfully,
        False if the download or the write failed (the reason is printed).
    """
    full_path = doi_2_path(doi, path_to_save)
    if os.path.exists(full_path):
        return True

    # Write to a side file first and rename at the end, so an interrupted
    # download never leaves a truncated file at the final path (which the
    # existence check above would later mistake for a finished download).
    partial_path = full_path + ".part"
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error responses (404, 500, ...) as failures instead of
        # saving the error page body as a ".pdf".
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            f.write(response.content)
        os.replace(partial_path, full_path)
        return True
    except Exception as exc:
        print(f"{doi} failed: {exc!r}")
        # Best-effort cleanup of a half-written side file.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return False

In [None]:
# Load the DOI -> URL mapping of PDFs that still need to be downloaded.
# NOTE(review): pickle can execute arbitrary code while loading; only use this
# with a file produced by a trusted process.
# The context manager closes the file handle once loading is done.
with open("/workspace/next_pdf_urls.pkl", "rb") as urls_file:
    urls = pickle.load(urls_file)
len(urls)

In [None]:
# Number of concurrent download threads.
max_workers = 15

# Tallies of downloads that returned True (healthy) and False (bad).
healthy_cnt, bad_cnt = 0, 0

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit one download task per (doi, url) pair; map each future to its DOI.
    future_to_id = {
        executor.submit(download_mising_pdf, doi, url): doi
        for doi, url in urls.items()
    }

    # Consume futures as they finish so the progress bar advances per download.
    for future in tqdm(as_completed(future_to_id), total=len(urls)):
        res = future.result()
        if not res:
            bad_cnt += 1
            continue
        healthy_cnt += 1

print(f"healthy_cnt: {healthy_cnt}, bad_cnt: {bad_cnt}")

In [5]:
# Copy a seeded random sample of up to 100 distinct PDFs to /workspace/pdfs_100.
import random
import shutil
random.seed(42)

# Make sure the destination directory exists before copying into it.
os.makedirs("/workspace/pdfs_100", exist_ok=True)

# List the directory once (not once per iteration) and sort it: os.listdir
# returns entries in arbitrary order, so sorting is what makes the seeded
# sample reproducible across runs.
pdf_names = sorted(os.listdir("/workspace/pdfs"))

# random.sample draws without replacement, so every pick is a different file;
# random.choice in a loop could pick the same file several times and end up
# copying fewer than 100 distinct PDFs. The sample size is capped at the number
# of files available.
for random_pdf in random.sample(pdf_names, min(100, len(pdf_names))):
    shutil.copy(f"/workspace/pdfs/{random_pdf}", "/workspace/pdfs_100/")