# In [None]:
from tqdm import tqdm
from glob import iglob, glob
from downloader import api
from downloader.wikidata import get_orcid_to_commons_image, get_orcid_to_wikidata
from functools import partial
import os
import pandas as pd
def gen_record(rec, affil, ty):
    """Build one flat affiliation row from a person record and an affiliation.

    Args:
        rec: Object exposing ``orcid`` and ``name`` attributes.
        affil: Object exposing ``name``, ``role``, ``country``,
            ``department``, ``start`` and ``end`` attributes. ``start`` and
            ``end`` must expose ``.year`` whenever they are truthy.
        ty: Label stored verbatim under the ``"type"`` key.

    Returns:
        A dict with the keys ``orcid``, ``name``, ``type``, ``org_name``,
        ``role``, ``country``, ``department_name``, ``start_year`` and
        ``end_year``. The two year fields are ``None`` when the matching
        date is falsy.
    """
    began, finished = affil.start, affil.end

    # Person-level columns first, then the affiliation-level ones.
    row = {"orcid": rec.orcid, "name": rec.name, "type": ty}
    row.update(
        org_name=affil.name,
        role=affil.role,
        country=affil.country,
        department_name=affil.department,
    )
    row["start_year"] = began.year if began else None
    row["end_year"] = finished.year if finished else None
    return row


def iter_dir(dirname, output_dir="/data/orcid/summ_out"):
    """Flatten every file in ``dirname`` into two parquet tables.

    Each file is parsed with ``api.process_file``. For every parsed record,
    one row per employment, education and invited position is produced via
    ``gen_record``, plus one bookkeeping row with the number of
    affiliations found in that file.

    Two files are written, both prefixed with
    ``{output_dir}/{basename of dirname}``:

    * ``_naffil.parquet`` -- columns ``oid``, ``file``, ``n_rec``.
    * ``_affiliations.parquet`` -- the rows built by ``gen_record``.

    If the ``_affiliations.parquet`` file already exists the directory is
    treated as done and nothing is recomputed.

    Args:
        dirname: Directory whose direct children are the files to parse.
        output_dir: Directory receiving the parquet files. Defaults to the
            location that was previously hard-coded.

    Returns:
        None.
    """
    # normpath drops a trailing slash; without it basename() would be ""
    # and every directory would write to the same output name.
    base = os.path.basename(os.path.normpath(dirname))
    output_name = f"{output_dir}/{base}"
    affiliations_path = f"{output_name}_affiliations.parquet"
    naffil_path = f"{output_name}_naffil.parquet"

    if os.path.exists(affiliations_path):
        print(f"found {output_name}, done")
        return

    files = glob(f"{dirname}/*")
    parse = partial(
        api.process_file,
        orcid_to_wikidata=None,
        orcid_to_wikimedia_commons=None,
    )

    print(f"Globbing {dirname}, {len(files)}, {output_name}")

    affiliations = []
    n_affil = []
    empty = []   # files for which the parser returned a falsy record
    failed = []  # files for which the parser raised AttributeError
    for file in files:
        try:
            record: api.Record | None = parse(file)
        except AttributeError:
            # Best-effort: keep going, but remember the file so the skip
            # is reported below instead of vanishing silently.
            failed.append(file)
            continue
        if not record:
            empty.append(file)
            continue

        groups = (
            ("employment", record.employments),
            ("education", record.educations),
            ("invited", record.invited_positions),
        )
        n_rec = 0
        for ty, items in groups:
            for affil in items:
                affiliations.append(gen_record(record, affil, ty))
                n_rec += 1
        n_affil.append({"oid": record.orcid, "file": file, "n_rec": n_rec})

    if failed or empty:
        print(
            f"{dirname}: skipped {len(failed)} unparseable "
            f"and {len(empty)} empty files"
        )

    # The affiliations file doubles as the "already done" marker checked
    # above, so it is written last: an interruption between the two writes
    # then leads to a clean re-run rather than a missing naffil file.
    pd.DataFrame(n_affil).to_parquet(naffil_path)
    pd.DataFrame(affiliations).to_parquet(affiliations_path)


from concurrent.futures import ThreadPoolExecutor, as_completed

from multiprocess import Pool

# Every entry matched here is handed to one iter_dir() call in a worker
# process.
dirs = glob("/data/orcid/ORCID_2025_10_summaries/*")
max_pool = 12

with Pool(max_pool) as p:
    # Pool.map blocks until all inputs are processed and already returns a
    # list ordered like ``dirs``, so no extra list() wrapper is needed.
    pool_outputs = p.map(iter_dir, dirs)

print(pool_outputs)
# iter_dir returns None, so dict(pool_outputs) raised TypeError on the
# first element. Key each result by the directory it came from instead.
new_dict = dict(zip(dirs, pool_outputs))