In [1]:
#Libraries from Demo
import os
import shutil
from collections import defaultdict

import pandas as pd
import polars as pl
import pydicom as dicom


#Libraries from attempt
import glob
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy.ndimage as ndi
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os, numpy as np, torch

from collections import Counter
from scipy import ndimage

from scipy.ndimage import zoom as ndi_zoom
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm import tqdm
from typing import Tuple, List

from sklearn.preprocessing import StandardScaler

In [2]:
def seed_everything(seed=42):
    """
    Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus the CUDA generators), sets the cuDNN
    ``deterministic`` flag, clears the cuDNN ``benchmark`` flag and exports
    ``PYTHONHASHSEED``.

    Args:
        seed (int): Value handed to every generator (default: 42)
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this export presumably only matters for processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seeding calls, applied in the same order, as a single loop.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: deterministic on, benchmark off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_everything()

In [4]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

# Input label table and output location for generated masks.
TRAIN_CSV = "/kaggle/input/rsna-intracranial-aneurysm-detection/train.csv"
generated_mask_dir = '/kaggle/working/generated_masks'

# Split fractions.
test_frac = 0.2
val_frac = 0.1
# val_frac rescaled by the share of data left once test_frac is removed
# (presumably for a second split applied to the train+val remainder).
val_frac_within_trainval = val_frac / (1 - test_frac)

seed = 42

In [5]:
# Load the label table and report its size and positive rate.
train = pd.read_csv(TRAIN_CSV)

aneurysm_flags = train['Aneurysm Present']
n_series = len(train)
pct_positive = 100 * sum(aneurysm_flags)/n_series
n_positive = int((aneurysm_flags == 1).sum())

print(f"On the original Dataset, the percentage of aneurysms is: {pct_positive}%")
print(f"The original dataset has {n_series} samples.")
print(f"The original dataset has {n_positive} positive samples.")

On the original Dataset, the percentage of aneurysms is: 42.84728610855566%
The original dataset has 4348 samples.
The original dataset has 1863 positive samples.


In [6]:
# Preview the label table read from TRAIN_CSV (rich DataFrame display).
train

Unnamed: 0,SeriesInstanceUID,PatientAge,PatientSex,Modality,Left Infraclinoid Internal Carotid Artery,Right Infraclinoid Internal Carotid Artery,Left Supraclinoid Internal Carotid Artery,Right Supraclinoid Internal Carotid Artery,Left Middle Cerebral Artery,Right Middle Cerebral Artery,Anterior Communicating Artery,Left Anterior Cerebral Artery,Right Anterior Cerebral Artery,Left Posterior Communicating Artery,Right Posterior Communicating Artery,Basilar Tip,Other Posterior Circulation,Aneurysm Present
0,1.2.826.0.1.3680043.8.498.10004044428023505108...,64,Female,MRA,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.2.826.0.1.3680043.8.498.10004684224894397679...,76,Female,MRA,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.2.826.0.1.3680043.8.498.10005158603912009425...,58,Male,CTA,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,1.2.826.0.1.3680043.8.498.10009383108068795488...,71,Male,MRA,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.2.826.0.1.3680043.8.498.10012790035410518400...,48,Female,MRA,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4343,1.2.826.0.1.3680043.8.498.99915610493694667606...,62,Female,MRI T1post,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4344,1.2.826.0.1.3680043.8.498.99920680741054836990...,76,Female,MRA,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4345,1.2.826.0.1.3680043.8.498.99953513260518059135...,44,Female,CTA,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4346,1.2.826.0.1.3680043.8.498.99982144859397209076...,58,Female,MRI T2,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Boolean Series marking the rows whose 'Aneurysm Present' label equals 1.
train['Aneurysm Present']==1

0       False
1       False
2        True
3       False
4       False
        ...  
4343    False
4344    False
4345    False
4346    False
4347     True
Name: Aneurysm Present, Length: 4348, dtype: bool

In [9]:
# Directories holding the pre-computed mask archives, one per quarter.
PROCESSED_DATA_DIRS_MASKS = [
    "/kaggle/input/binary-masks-dataset/masks_quart1_2_5",
    "/kaggle/input/binary-masks-dataset/masks_quart2_2_5",
    "/kaggle/input/binary-masks-dataset/masks_quart3_2_5",
    "/kaggle/input/binary-masks-dataset/masks_quart4_2_5",
]
mask_sizes = []           # filled later with one voxel sum per mask file
all_files = []            # full path of every mask file found
sids_empty_not_lost = []  # file name without extension (presumably the series id)
for mdir in PROCESSED_DATA_DIRS_MASKS:
    # os.listdir returns entries in arbitrary order; sorting makes the file
    # order, and every list derived from it, reproducible across runs.
    for mfile in sorted(os.listdir(mdir)):
        all_files.append(os.path.join(mdir, mfile))
        # Strip the extension by name instead of a fixed 4-character slice.
        sids_empty_not_lost.append(os.path.splitext(mfile)[0])

print(f'After removing masks fully lost in the 160^3 volume we got from {len(train)} to {len(sids_empty_not_lost)}')

After removing masks fully lost in the 160^3 volume we got from 4348 to 4226


In [10]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every size (which would inflate all counts derived from it).
mask_sizes = []
for fpath in tqdm(all_files, desc="Processing masks"):
    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it as soon as the array has been read.
    with np.load(fpath) as archive:
        mask = archive["vol"].astype(np.float32)
    # Sum of the mask values, i.e. the voxel count if the mask is binary.
    mask_sizes.append(float(mask.sum()))

Processing masks: 100%|██████████| 4226/4226 [04:02<00:00, 17.46it/s]


In [11]:
# Largest mask sum observed across all loaded mask files.
max_size = max(mask_sizes)
print(
    "Max mask size using a 1.5mm isotropic resampling with a 50 mm cube size:",
    max_size,
)

Max mask size using a 1.5mm isotropic resampling with a 50 mm cube size: 110592.0


In [12]:
# Histogram of mask sizes: how many masks share each distinct size.
# (Counter is already imported in the notebook's import cell.)
size_counts = Counter(mask_sizes)
# Same counts as a plain dict with keys in ascending size order.
size_counts_sorted = dict(sorted(size_counts.items()))
print(size_counts_sorted)

{0.0: 2485, 936.0: 1, 3456.0: 1, 4032.0: 1, 9216.0: 2, 11520.0: 1, 12168.0: 1, 13776.0: 1, 13824.0: 1, 14976.0: 2, 16128.0: 1, 17280.0: 2, 18432.0: 2, 25200.0: 1, 25344.0: 4, 26112.0: 1, 26928.0: 1, 27648.0: 1, 28704.0: 1, 29304.0: 1, 29376.0: 1, 29952.0: 2, 31104.0: 2, 31992.0: 1, 32256.0: 3, 32640.0: 1, 34560.0: 1, 35712.0: 1, 36288.0: 1, 36864.0: 2, 38016.0: 2, 40320.0: 1, 42624.0: 6, 43776.0: 4, 44928.0: 2, 46080.0: 3, 48384.0: 4, 49536.0: 1, 49680.0: 1, 50688.0: 7, 51840.0: 2, 52992.0: 3, 54144.0: 3, 55296.0: 92, 57600.0: 5, 59904.0: 6, 61824.0: 1, 64512.0: 7, 66816.0: 8, 69120.0: 3, 71424.0: 4, 73728.0: 4, 76032.0: 5, 77760.0: 1, 78336.0: 4, 78432.0: 1, 80640.0: 5, 82944.0: 4, 85248.0: 7, 87552.0: 5, 89856.0: 4, 92160.0: 11, 94464.0: 8, 96768.0: 6, 99072.0: 9, 101376.0: 18, 103680.0: 14, 105984.0: 14, 108288.0: 17, 110592.0: 1398}


In [13]:
# Number of rows in the label table flagged as containing an aneurysm.
is_positive = train['Aneurysm Present'] == 1
original_positive_samples = int(is_positive.sum())
print(f'Originally we had a total number of postivie samples: {original_positive_samples} out of {len(train)}')

Originally we had a total number of postivie samples: 1863 out of 4348


In [14]:
# Total number of mask files that were measured.
total = sum(size_counts_sorted.values())

# Masks whose sum is exactly 0; looked up from the histogram instead of
# hard-coding the count, so the cell stays correct if the data changes.
empty_masks = size_counts_sorted.get(0.0, 0)
non_empty_masks = total - empty_masks
# Positive samples that no longer have a non-empty mask.
lost_positive_masks = original_positive_samples - non_empty_masks

print(f'After normalizing input size to 160 cubic sizes {lost_positive_masks} were lost, having now {non_empty_masks}')
print(f'That leaves us with {100*non_empty_masks/original_positive_samples}%')

After normalizing input size to 160 cubic sizes 122 were lost, having now 1741
That leaves us with 93.45142243692969%


In [15]:
# Reference size that the thresholds are expressed against.
full_mask_voxels = 110592.0
# Percentage label -> fraction of the reference size a mask must exceed.
kept_fractions = {80: 0.8, 75: 0.75, 60: 0.60}

# For each threshold, count the masks whose size is strictly above it.
kept_counts = {
    pct: sum(
        count
        for size, count in size_counts_sorted.items()
        if size > full_mask_voxels * fraction
    )
    for pct, fraction in kept_fractions.items()
}
samples_above_80 = kept_counts[80]
samples_above_75 = kept_counts[75]
samples_above_60 = kept_counts[60]

for pct, n_kept in kept_counts.items():
    print(f'If we want to keep masks that kept {pct}% of its original size after standardizing we get {n_kept} samples.')
    print(f'This leaves us with the {100*n_kept/original_positive_samples}% of data')

If we want to keep masks that kept 80% of its original size after standardizing we get 1499 samples.
This leaves us with the 80.46162104133118% of data
If we want to keep masks that kept 75% of its original size after standardizing we get 1511 samples.
This leaves us with the 81.105743424584% of data
If we want to keep masks that kept 60% of its original size after standardizing we get 1550 samples.
This leaves us with the 83.19914117015567% of data


In [18]:
# Keep a mask when it is empty (sum == 0) or when its sum reaches at least
# 80% of the reference size of 110592.
min_kept_voxels = 110592.0 * 0.8

sids_usefull = []
for fpath in tqdm(all_files, desc="Processing masks"):
    # Close each archive as soon as its array has been read.
    with np.load(fpath) as archive:
        mask = archive["vol"].astype(np.float32)
    size = float(mask.sum())
    if size == 0 or size >= min_kept_voxels:
        # Derive the id from the path itself. The previous slice used the
        # length of a directory variable left over from an earlier loop,
        # which is only right while every directory path has equal length.
        sids_usefull.append(os.path.splitext(os.path.basename(fpath))[0])

Processing masks: 100%|██████████| 4226/4226 [03:51<00:00, 18.28it/s]


In [None]:
# print(len(sids_not_usefull)), 
# np.savez_compressed('/kaggle/working/not_usefull.npz', lst=sids_empty_not_lost)
# #242

In [20]:
# Report how many ids were kept, then persist them under the key 'lst'.
# (A stray trailing comma after print(...) was removed; it only built a
# throwaway one-element tuple.)
print(len(sids_usefull))
np.savez_compressed('/kaggle/working/usefull.npz', lst=sids_usefull)

3984
