In [None]:
import os
import platform
from pathlib import Path
import urllib.request
import tarfile
from concurrent.futures import ThreadPoolExecutor

In [None]:
# Use about half of the logical CPUs for the download/extract thread pool.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to 1 before dividing; max() keeps at least one worker.
num_workers = max(1, (os.cpu_count() or 1) // 2)

In [8]:
# Identify the host so machine-specific storage locations can be selected.
machine_name = platform.node()
user = os.getenv("USER") or os.getenv("USERNAME")
os_name = platform.system()
print(f"Machine: {machine_name}, User: {user}, OS: {os_name}")

on_corsair = (machine_name, os_name, user) == ("Corsair", "Linux", "jon")

if on_corsair:
    # Known workstation: everything lives under the mounted drive, with the
    # dataset in its own sub-directory next to checkpoints/papers/models.
    windows_drive = Path("/mnt/b/Xray")
    dataset_root = windows_drive / "dataset"
    aux_root = windows_drive
    # NOTE(review): batch_size is only assigned on this branch.
    batch_size = 64
else:
    # Any other host: keep everything inside a relative "dataset" directory.
    dataset_dir = Path("dataset")
    dataset_root = dataset_dir
    aux_root = dataset_dir

paths = {
    "dataset": dataset_root,
    "tar_images": dataset_root / "images",
    "images": dataset_root / "images" / "images",
    **{name: aux_root / name for name in ("checkpoints", "papers", "models")},
}

# Create every configured directory up front (no error if it already exists).
for key, path in paths.items():
    path.mkdir(parents=True, exist_ok=True)


Machine: DESKTOP-UHDJ875, User: jonal, OS: Linux


In [9]:
# Short aliases for the configured directories.
path_dataset = paths['dataset']
dir_tar_images = paths['tar_images']
path_images = paths['images']
path_models = paths['models']
checkpoint_dir = paths['checkpoints']

# Files located directly under the dataset directory.
path_csv_list = path_dataset / "Data_Entry_2017_v2020.csv"
path_train_val_list = path_dataset / "train_val_list.txt"
path_test_list = path_dataset / "test_list.txt"

In [None]:
def download_file(link, folder, idx):
    """
    Downloads a file from a link to the specified folder.

    The file is stored as ``images_NNN.tar.gz`` where ``NNN`` is ``idx + 1``
    zero-padded to three digits. If that file already exists the download is
    skipped.

    The transfer is written to a temporary ``.part`` file and only renamed to
    the final name once it has completed. An interrupted download therefore
    never leaves a truncated archive that a later run would mistake for a
    finished one.

    Args:
        link: URL to download.
        folder: Destination directory (str or path-like); must already exist.
        idx: Zero-based position of the link, used to build the file name.

    Returns:
        The path of the downloaded (or already present) file as a string, or
        None if the download failed.
    """
    file_name = f'images_{idx+1:03d}.tar.gz'
    file_path = os.path.join(folder, file_name)
    if os.path.exists(file_path):
        #print(f"{file_name} already exists, skipping download.")
        return file_path
    partial_path = file_path + '.part'
    try:
        print(f"Downloading {file_name}...")
        urllib.request.urlretrieve(link, partial_path)
        # Atomic rename: the final name only ever refers to a complete file.
        os.replace(partial_path, file_path)
        print(f"{file_name} downloaded successfully.")
        return file_path
    except Exception as e:
        print(f"Failed to download {file_name}: {e}")
        # Best-effort cleanup of whatever was written before the failure.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return None

In [None]:
def extract_file(file_path, folder):
    """
    Extracts a .tar.gz file to the specified folder.

    A marker file named ``<archive stem>_extracted.flag`` is written next to
    the archive after a successful extraction; when that marker exists the
    archive is skipped. For archives whose name does not end in ``.tar.gz``
    the marker is ``<archive name>_extracted.flag``, so the marker never
    collides with the archive itself.

    Args:
        file_path: Path to the gzip-compressed tar archive (str or path-like).
        folder: Directory to extract into.

    Returns:
        None. Failures are reported with a printed message, not raised.
    """
    # Accept pathlib.Path as well as str; all string operations below need str.
    file_path = os.fspath(file_path)
    suffix = '.tar.gz'
    if file_path.endswith(suffix):
        extracted_flag = file_path[:-len(suffix)] + '_extracted.flag'
    else:
        extracted_flag = file_path + '_extracted.flag'
    if os.path.exists(extracted_flag):
        #print(f"{os.path.basename(file_path)} already extracted, skipping.")
        return
    try:
        print(f"Extracting {os.path.basename(file_path)}...")
        with tarfile.open(file_path, 'r:gz') as tar:
            # The archive comes from the network: where the interpreter
            # supports extraction filters, use the 'data' filter so members
            # cannot escape `folder` (absolute paths, '..', unsafe links).
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(path=folder, filter='data')
            else:
                tar.extractall(path=folder)
        # Only mark as done after extractall finished without raising.
        with open(extracted_flag, 'w') as f:
            f.write('extracted')
        print(f"{os.path.basename(file_path)} extracted successfully.")
    except Exception as e:
        print(f"Failed to extract {os.path.basename(file_path)}: {e}")

In [None]:
def process_link(idx, link):
    """
    Download one archive and, if the download produced a file, unpack it.

    Both steps use the module-level ``dir_tar_images`` directory as the
    download destination and as the extraction target.
    """
    archive_path = download_file(link, dir_tar_images, idx)
    if not archive_path:
        # download_file signalled failure; nothing to extract.
        return
    extract_file(archive_path, dir_tar_images)

## Download data

In [17]:
# Archive URLs. The position in this list determines the local file name
# (index 0 -> images_001.tar.gz, and so on).
links = [
    'https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz',
    'https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz',
    'https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz',
    'https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz',
    'https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz',
    'https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz',
    'https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz',
    'https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz',
    'https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz',
    'https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz',
    'https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz',
    'https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz',
]

# Run one download+extract task per archive. Futures are kept (instead of
# discarding the lazy iterator returned by executor.map) so that an exception
# raised inside a worker is not silently lost.
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [
        executor.submit(process_link, idx, link)
        for idx, link in enumerate(links)
    ]

# Leaving the `with` block waits for all tasks, so every future is done here.
for idx, future in enumerate(futures):
    error = future.exception()
    if error is not None:
        print(f"Unexpected error while processing images_{idx+1:03d}.tar.gz: {error!r}")

print("Download and extraction complete. Please check the extracted files.")

Downloading images_001.tar.gz...
Downloading images_002.tar.gz...
Downloading images_003.tar.gz...
Downloading images_004.tar.gz...
Downloading images_005.tar.gz...
Downloading images_006.tar.gz...
Downloading images_007.tar.gz...
Downloading images_008.tar.gz...
images_001.tar.gz downloaded successfully.
Extracting images_001.tar.gz...
images_004.tar.gz downloaded successfully.
Extracting images_004.tar.gz...
images_008.tar.gz downloaded successfully.
Extracting images_008.tar.gz...
images_007.tar.gz downloaded successfully.
Extracting images_007.tar.gz...
images_002.tar.gz downloaded successfully.
Extracting images_002.tar.gz...
images_006.tar.gz downloaded successfully.
Extracting images_006.tar.gz...
images_005.tar.gz downloaded successfully.
Extracting images_005.tar.gz...
images_003.tar.gz downloaded successfully.
Extracting images_003.tar.gz...
images_001.tar.gz extracted successfully.
Downloading images_009.tar.gz...
images_009.tar.gz downloaded successfully.
Extracting images_