1. Download PDB files:

In [2]:
import contextlib
import os
import sys
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

from Bio.PDB import PDBList
from tqdm import tqdm

def load_pdb_ids(file_path):
    """
    Load PDB IDs from a text file containing one ID per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped, so a trailing newline or a blank
    separator line never produces an empty ID.

    Args:
        file_path (str): Path to the text file containing PDB IDs.

    Returns:
        list: The PDB IDs in file order (duplicates are preserved).
    """
    with open(file_path, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        return [pdb_id for pdb_id in stripped_lines if pdb_id]

# sys.stdout is process-global. Every thread that enters filter_stdout must
# share one wrapper; if each thread saved and restored its own copy, exits in
# a different order than entries would leave a stale wrapper installed.
_STDOUT_FILTER_LOCK = threading.Lock()


class _FilteredStream:
    """Proxy around a text stream that drops writes containing filtered words."""

    def __init__(self, stream):
        self.stream = stream
        # One tuple of words per filter_stdout context that is currently open.
        self.active = []
        # Per-thread flag: the previous write of this thread was suppressed.
        self._local = threading.local()

    def write(self, message):
        # Snapshot the list: other threads may open/close filters concurrently.
        word_sets = tuple(self.active)
        if any(word in message for words in word_sets for word in words):
            # print() emits its trailing newline as a separate write; remember
            # to drop it too, otherwise every filtered message leaves an empty
            # line behind.
            self._local.skip_newline = not message.endswith("\n")
            return len(message)

        skip_newline = getattr(self._local, "skip_newline", False)
        self._local.skip_newline = False
        if skip_newline and message == "\n":
            return len(message)
        return self.stream.write(message)

    def flush(self):
        self.stream.flush()

    def __getattr__(self, name):
        # Delegate everything else (isatty, encoding, fileno, ...) so code that
        # inspects sys.stdout keeps working while the filter is installed.
        return getattr(self.stream, name)


@contextlib.contextmanager
def filter_stdout(filter_words):
    """
    Context manager to filter specific stdout messages.

    While the context is open, any write to ``sys.stdout`` whose text contains
    one of ``filter_words`` is discarded, together with the bare newline that
    ``print`` writes right after it. Because ``sys.stdout`` is shared by all
    threads, the filter applies process-wide for as long as at least one
    context is open; contexts may be entered and exited from several threads
    in any order and the original stream is restored when the last one exits.

    Args:
        filter_words (list): List of words to filter out from stdout.

    Yields:
        None
    """
    # Materialize once so one-shot iterables are not exhausted by the first write.
    words = tuple(filter_words)

    with _STDOUT_FILTER_LOCK:
        proxy = sys.stdout
        if not isinstance(proxy, _FilteredStream):
            proxy = _FilteredStream(sys.stdout)
            sys.stdout = proxy
        proxy.active.append(words)
    try:
        yield
    finally:
        with _STDOUT_FILTER_LOCK:
            proxy.active.remove(words)
            # Only uninstall when no filter is left and nobody else has
            # replaced sys.stdout in the meantime.
            if not proxy.active and sys.stdout is proxy:
                sys.stdout = proxy.stream

def download_pdb_file(pdb_id, save_dir):
    """
    Download a single PDB file unless it is already present in ``save_dir``.

    This function does not raise on download problems: any exception from the
    retrieval call is caught and reported in the returned message, so one bad
    ID does not abort a batch.

    Args:
        pdb_id (str): The PDB ID of the file to download.
        save_dir (str): The directory to save the downloaded PDB file.

    Returns:
        str: A message indicating the result of the download attempt; one of
            "Downloaded <id>", "Skipped <id>, already exists" or
            "Error downloading <id>: <reason>".
    """
    # Biopython names the saved file with the lower-cased ID ("pdb1abc.ent"),
    # so the existence check must lower-case it as well or upper-case IDs are
    # never recognised as already downloaded.
    file_path = os.path.join(save_dir, f"pdb{pdb_id.lower()}.ent")

    if os.path.exists(file_path):
        return f"Skipped {pdb_id}, already exists"

    pdbl = PDBList()  # using Biopython's PDBList class
    try:
        with filter_stdout(["Downloading PDB structure", "Desired structure doesn't exist"]):
            pdbl.retrieve_pdb_file(pdb_id, pdir=save_dir, file_format='pdb', overwrite=False)
    except Exception as e:
        return f"Error downloading {pdb_id}: {str(e)}"

    # A missing structure is only reported on stdout (the message filtered
    # above), not raised, so confirm that a file was actually written.
    if not os.path.exists(file_path):
        return f"Error downloading {pdb_id}: file was not created"
    return f"Downloaded {pdb_id}"

def download_pdb_files(pdb_ids, save_dir='pdb_files', num_threads=16):
    """
    Download PDB files using multiple threads and a progress bar.

    One download task is submitted per entry of ``pdb_ids``. The progress bar
    advances as tasks finish, and once all of them are done the per-file
    result messages are printed in completion order.

    Args:
        pdb_ids (list): List of PDB IDs to download.
        save_dir (str): The directory to save the downloaded PDB files.
        num_threads (int): The number of threads to use for downloading.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race and also creates any missing
    # parent directories (e.g. 'pdb_files/train').
    os.makedirs(save_dir, exist_ok=True)

    results = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(download_pdb_file, pdb_id, save_dir) for pdb_id in pdb_ids]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading PDB files", unit="file", leave=True):
            results.append(future.result())

    # Print after the pool has finished so messages do not interleave with the
    # progress bar.
    for result in results:
        print(result)

# Read both ID lists first (train, then test) so a missing list file is
# reported before any download starts.
train_pdb_ids, test_pdb_ids = map(load_pdb_ids, ('train_ids.txt', 'test_ids.txt'))

# Fetch each split into its own sub-directory of pdb_files/.
download_pdb_files(pdb_ids=train_pdb_ids, save_dir='pdb_files/train')
download_pdb_files(pdb_ids=test_pdb_ids, save_dir='pdb_files/test')














































Downloading PDB files:   4%|▎         | 4156/115850 [00:00<00:02, 41559.54file/s]


















Downloading PDB files:  11%|█         | 12352/115850 [00:00<00:01, 56749.27file/s]



































Downloading PDB files:  15%|█▌        | 17900/115850 [00:00<00:04, 21140.86file/s]































Downloading PDB files:  18%|█▊        | 21428/115850 [00:01<00:05, 17028.89file/s]




















Downloading PDB files:  21%|██        | 24008/115850 [00:01<00:05, 18365.62file/s]


















Downloading PDB files:  23%|██▎       | 26569/115850 [00:01<00:06, 14687.98file/s]

























Downloading PDB files:  25%|██▍       | 28565/115850 [00:01<00:06, 13725.42file/s]



















Downloading PDB files:  26%|██▌       | 30269/115850 [00:01<00:07, 11692.61file/s]









Downloading PDB files:  28%|██▊       | 32715/115850 [00:01<00:06, 13780.22file/s]











Downloading PDB files:  30%|██▉       | 34426/115850 [00:02<00:05, 13849.51file/s]














Downloading PDB files:  32%|███▏      | 37004/115850 [00:02<00:04, 16337.77file/s]










Downloading PDB files:  34%|███▎      | 38916/115850 [00:02<00:05, 14872.98file/s]















Downloading PDB files:  36%|███▌      | 41339/115850 [00:02<00:04, 16701.89file/s]












Downloading PDB files:  38%|███▊      | 44112/115850 [00:02<00:03, 19337.00file/s]












Downloading PDB files:  40%|███▉      | 46251/115850 [00:02<00:03, 19333.60file/s]















Downloading PDB files:  42%|████▏     | 48770/115850 [00:02<00:03, 20864.08file/s]











Downloading PDB files:  44%|████▍     | 51021/115850 [00:02<00:03, 21310.10file/s]












Downloading PDB files:  47%|████▋     | 54330/115850 [00:02<00:02, 24438.30file/s]












Downloading PDB files:  49%|████▉     | 56855/115850 [00:03<00:02, 21950.72file/s]























Downloading PDB files:  51%|█████     | 59150/115850 [00:03<00:03, 18760.03file/s]














Downloading PDB files:  53%|█████▎    | 61163/115850 [00:03<00:03, 17860.24file/s]













Downloading PDB files:  55%|█████▌    | 64134/115850 [00:03<00:02, 19961.42file/s]












Downloading PDB files:  58%|█████▊    | 66695/115850 [00:03<00:02, 21381.28file/s]














Downloading PDB files:  59%|█████▉    | 68921/115850 [00:03<00:02, 20281.95file/s]











Downloading PDB files:  62%|██████▏   | 71315/115850 [00:03<00:02, 21235.73file/s]






















Downloading PDB files:  66%|██████▌   | 75980/115850 [00:04<00:01, 21377.53file/s]

























Downloading PDB files:  67%|██████▋   | 78166/115850 [00:04<00:01, 20189.30file/s]























Downloading PDB files:  71%|███████   | 82395/115850 [00:04<00:01, 17503.29file/s]























Downloading PDB files:  73%|███████▎  | 84277/115850 [00:04<00:02, 13192.55file/s]

























Downloading PDB files:  76%|███████▌  | 87660/115850 [00:04<00:01, 14522.03file/s]

























Downloading PDB files:  79%|███████▉  | 92076/115850 [00:05<00:01, 18178.78file/s]























Downloading PDB files:  84%|████████▍ | 97839/115850 [00:05<00:00, 21134.76file/s]






















Downloading PDB files:  88%|████████▊ | 102269/115850 [00:05<00:00, 19481.54file/s]























Downloading PDB files:  91%|█████████ | 105252/115850 [00:05<00:00, 22184.17file/s]
























Downloading PDB files:  95%|█████████▍| 109577/115850 [00:05<00:00, 17961.92file/s]

























Downloading PDB files:  96%|█████████▌| 111465/115850 [00:06<00:00, 17899.94file/s]

















Downloading PDB files:  98%|█████████▊| 113319/115850 [00:06<00:00, 16968.67file/s]








Downloading PDB files:  99%|█████████▉| 115062/115850 [00:06<00:00, 14436.57file/s]






























Downloading PDB files: 100%|██████████| 115850/115850 [00:06<00:00, 17407.07file/s]

















Downloading PDB files:   0%|          | 0/6248 [00:00<?, ?file/s]




Downloading PDB files: 100%|█████████▉| 6235/6248 [00:00<00:00, 35239.13file/s]
















Downloading PDB files: 100%|██████████| 6248/6248 [00:00<00:00, 14601.10file/s]







2. (next thing to do):