In [1]:
import os
import pandas as pd
import pickle

def id_loc(id_check, main_directory="../../../uk_biobank"):
    """
    Locate which 'fields.ukb' files mention each requested ID.

    Only the immediate subdirectories of ``main_directory`` are inspected.
    For every subdirectory that contains a regular file named 'fields.ukb',
    the first whitespace-delimited column of that file is parsed as integers
    and compared against the requested IDs.

    Parameters
    ----------
    id_check : iterable
        IDs to look for. Every element is converted with ``int()``, so
        duplicates (e.g. ``"10"`` and ``10``) collapse into one key.
    main_directory : str
        Directory whose immediate subdirectories are searched.

    Returns
    -------
    dict
        Maps every requested ID (as ``int``) to a list of 'fields.ukb' paths
        whose first column contains that ID. IDs that are found nowhere map
        to an empty list. Paths appear in sorted subdirectory-name order.

    Notes
    -----
    A 'fields.ukb' file that cannot be read or parsed is reported on stdout
    and skipped; it does not abort the search.
    """
    # Ensure all IDs are integers for comparison
    wanted_ids = {int(i) for i in id_check}
    id_to_files = {i: [] for i in wanted_ids}

    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # resulting path lists reproducible across runs and machines.
    for entry in sorted(os.listdir(main_directory)):
        dir_path = os.path.join(main_directory, entry)
        if not os.path.isdir(dir_path):
            continue

        target_file_path = os.path.join(dir_path, "fields.ukb")
        if not os.path.isfile(target_file_path):
            continue

        try:
            # Only the first column is needed, so skip parsing the rest.
            df = pd.read_csv(target_file_path,
                             sep=r'\s+',
                             header=None,
                             usecols=[0],
                             dtype={0: int})
        except Exception as e:
            # Deliberately best-effort: one unreadable/malformed file should
            # not prevent the remaining directories from being searched.
            print(f"Skipping {target_file_path}: could not read ({e})")
            continue

        # tolist() yields plain Python ints; the set intersection also
        # de-duplicates IDs that occur more than once in the file.
        present_ids = wanted_ids.intersection(df[0].tolist())
        for found_id in present_ids:
            id_to_files[found_id].append(target_file_path)

    return id_to_files

# Load the pickled list of cognitive-test column names.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
cognitive_columns_path = '../../../randy/rfb/tidy_data/UKBiobank/dementia/cognitive_tests/cognitive_columns.pkl'
with open(cognitive_columns_path, 'rb') as f:
    cognitive_tests = pickle.load(f)

# Overwrite each entry, in place, with the integer that precedes its first '-'.
for x, column_name in enumerate(cognitive_tests):
    cognitive_tests[x] = int(column_name.partition('-')[0])

# Example usage:
if __name__ == "__main__":
    ids_to_check = cognitive_tests
    # NOTE(review): the search below uses a hard-coded ID list, not
    # ids_to_check -- confirm which one is intended.
    example_ids = [
        21001, 4282, 20023, 20016,
        6348, 6349, 6350, 6351,
        6333, 6325, 20197, 20018, 400,
    ]
    result = id_loc(example_ids)
    print(result)
    print("Search completed.")

{20197: ['../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 21001: ['../../../uk_biobank/project_52887_41230/fields.ukb', '../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 6348: ['../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 6349: ['../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 6350: ['../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 6351: ['../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 20016: ['../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 400: ['../../../uk_biobank/project_52887_42640/fields.ukb', '../../../uk_biobank/project_52887_669338/fields.ukb'], 20018: 

In [4]:
# Does the raw CSV path appear in any ID's list of matching files?
# NOTE(review): id_loc only ever appends '.../fields.ukb' paths, so this cannot
# be True for a `result` produced by id_loc -- confirm what was meant to be checked.
target_csv = '../../../uk_biobank/project_52887_669338/ukb669338.csv'
any(target_csv in matched_files for matched_files in result.values())

False