In [1]:
! pip install -qU "python-gdcm" pydicom pylibjpeg "opencv-python-headless" "scikit-image" "ipywidgets" "dicomsdl"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.6/216.6 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.42.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.[0m[31m
[0m

In [2]:
# 1. Handle datasets
import io
import os
import cv2
import imageio
import pydicom
import dicomsdl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from glob import glob
import tifffile as tiff
import SimpleITK as sitk
from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing as mp
from collections import Counter
from joblib import Parallel, delayed
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [3]:
RESIZE_TO = 512
SAVE_DIR = f"/kaggle/working/train_image_processed_jp2000_{RESIZE_TO}"
parent_dir = "/kaggle/input/rsna-breast-cancer-detection"

# create the top-level folder
os.makedirs(SAVE_DIR, exist_ok=True)

# Gather all .dcm paths
all_dcm_files = list(Path(os.path.join(parent_dir, "train_images")).rglob("*.dcm"))
fail_counter = Counter()

def image_resize(image, width = None, height = None, inter = cv2.INTER_LINEAR):
    
    dim = None
    (h, w) = image.shape[:2]
    
    if width is None and height is None:
        return image

    if width is None:
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        r = width / float(w)
        dim = (width, int(h * r))
    
    resized = cv2.resize(image, dim, interpolation=inter)
    return resized

def apply_window(image, window_center, window_width):
    img = image.copy().astype(np.float32)
    min_val = window_center - window_width / 2
    max_val = window_center + window_width / 2
    img = np.clip(img, min_val, max_val)
    img = (img - min_val) / (max_val - min_val)
    return img

def dicom_file_to_array(path):
    dicom = dicomsdl.open(str(path))
    data = dicom.pixelData().astype(np.float32)
    photometric = dicom.getPixelDataInfo()['PhotometricInterpretation']  # Cache this once
    
    if photometric == "MONOCHROME1":
        data = data.max() - data
    
    # ===== Windowing =====
    try:
        center = dicom.getMeta("0028|1050")  # Window Center
        width = dicom.getMeta("0028|1051")   # Window Width
        
        if isinstance(center, list):
            center = float(center[0])
        else:
            center = float(center)
        
        if isinstance(width, list):
            width = float(width[0])
        else:
            width = float(width)
        
        data = apply_window(data, center, width)
        
    except Exception:
        # Fall back to default normalization if window info is missing
        data = (data - data.min()) / (data.max() - data.min())

    # Resize    
    h, w = data.shape
    if w > h:
        data = image_resize(data, width=RESIZE_TO)
    else:
        data = image_resize(data, height=RESIZE_TO)
    
    return (data * 255).astype(np.uint8)

def process(path):
    try:
        parent_folder = path.parent.name
        save_subdir = os.path.join(parent_dir, SAVE_DIR, parent_folder)
        os.makedirs(save_subdir, exist_ok=True)

        processed_img = dicom_file_to_array(path)
        save_path = os.path.join(save_subdir, f"{path.stem}.jp2")
        imageio.imwrite(save_path, processed_img, format='JP2')

    except Exception as e:
        print(f"[ERROR] Failed: {path} — {e}")
        fail_counter["fail"] += 1

# Process with tqdm and joblib
Parallel(n_jobs=16, backend="loky", prefer="threads")(
    delayed(process)(path) for path in tqdm(all_dcm_files, 
                                            total=len(all_dcm_files))
)

print(f"✅ Done! Processed {len(all_dcm_files)} images.")
print(f"❌ Failed: {fail_counter['fail']}")

  0%|          | 0/54706 [00:00<?, ?it/s]

✅ Done! Processed 54706 images.
❌ Failed: 0


In [2]:
def open_jpeg2000_image(image_path):
    # Read the JPEG2000 image using OpenCV
    img = plt.imread(image_path)
    
    plt.imshow(img, cmap="turbo")
    plt.axis('off')
    plt.show()

# Example usage
image_path = os.path.join(SAVE_DIR, "105", "397491913.jp2")
if os.path.exists(image_path):
    open_jpeg2000_image(image_path)
else:
    print("False")


NameError: name 'os' is not defined

In [None]:
# zip up your processed JP2 folder
!zip -r /kaggle/working/processed_jp2.zip /kaggle/working/train_image_processed_jp2000_512


  adding: kaggle/working/train_image_processed_jp2000_512/ (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/65014/ (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/65014/1682583821.jp2 (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/65014/452541249.jp2 (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/65014/799096922.jp2 (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/65014/1035271497.jp2 (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/5188/ (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/5188/287761314.jp2 (stored 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/5188/520900873.jp2 (deflated 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/5188/1528865269.jp2 (deflated 0%)
  adding: kaggle/working/train_image_processed_jp2000_512/5188/533898547.jp2 (deflated 0%)
  adding: kaggle/working/train_image_processed_jp20

In [9]:
!kaggle datasets init -p /kaggle/working/train_image_processed_jp2000_512

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.11/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.11/dist-packages/kaggle/api/kaggle_api_extended.py", line 433, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


In [10]:
!kaggle datasets create -p /kaggle/working/train_image_processed_jp2000_512

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 4, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.11/dist-packages/kaggle/__init__.py", line 6, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.11/dist-packages/kaggle/api/kaggle_api_extended.py", line 433, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/
