In [1]:
# Basic Libraries
import os
import shutil
import random
import numpy as np
import pandas as pd

# Image Processing
import cv2
from PIL import Image, ImageEnhance
from skimage.util import random_noise

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# TensorFlow and Keras for Deep Learning
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Attention, Add, Dense, GlobalAveragePooling2D, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import layers
from tensorflow.keras.models import load_model, Sequential

# Scikit-learn for Model Preparation
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# Additional Libraries for Image Handling and File Operations
import glob
from glob import glob
import matplotlib.image as mpimg
import pydicom

from tensorflow.keras import layers
from tensorflow.keras.models import load_model, Sequential


print("Imports Complete")

Imports Complete


In [2]:
# Load datasets
df_train = pd.read_csv("/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train.csv")
df_train_series_descriptions = pd.read_csv("/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_series_descriptions.csv")
df_label_coord = pd.read_csv("/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_label_coordinates.csv")
#df_train_imagess = pd.read_csv("/kaggle/working/df_png_paths.csv")

# Load datasets
df_test_series_descriptions = pd.read_csv("/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/test_series_descriptions.csv")
#df_train_imagess = pd.read_csv("/kaggle/working/df_png_paths.csv")

# Output Paths
output_path = '/kaggle/working/train_images'
# Define the directory where your augmented images are saved
augmented_images_dir = '/kaggle/working/augmented_images'

# Path to the input and output directories
input_path = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images/'
test_input_path= '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/test_images/'


In [116]:
# Create image paths
df_label_coord['image_path'] = "/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images/" + \
                               df_label_coord['study_id'].astype(str) + "/" + \
                               df_label_coord['series_id'].astype(str) + "/" + \
                               df_label_coord['instance_number'].astype(str) + ".dcm"

# Melt the df_train DataFrame
df_train_melted = df_train.melt(id_vars=['study_id'], var_name='condition_level', value_name='severity')

# Split 'condition_level' to extract 'condition' and 'level'
df_train_melted[['conditions', 'level']] = df_train_melted['condition_level'].str.rsplit('_', n=2, expand=True).iloc[:, 1:]
df_train_melted['condition'] = df_train_melted['condition_level'].apply(lambda x: '_'.join(x.split('_')[:-2])).str.replace("_", " ").str.title()
df_train_melted['level'] = df_train_melted['conditions'].str.upper() + "/" + df_train_melted['level'].str.upper()

# Drop the original 'condition_level' column
df_train_melted = df_train_melted.drop(columns=['condition_level', 'conditions'])

# Merge DataFrames on 'study_id', 'level', and 'condition'
df_final = pd.merge(df_label_coord, df_train_melted, on=['study_id', 'level', 'condition'], how='inner')

# Ensure the 'series_description' column exists before trying to reorder
if 'series_description' in df_train_series_descriptions.columns:
    # Merge df_final with df_train_series_descriptions on 'study_id' and 'series_id'
    df_final_filtered = pd.merge(df_final, df_train_series_descriptions[['study_id', 'series_id', 'series_description']],
                                 on=['study_id', 'series_id'], how='left')

    # Reorder columns to place 'series_description' immediately after 'series_id'
    columns_order = ['study_id', 'series_id', 'series_description', 'instance_number', 'condition', 'level', 'x', 'y', 'image_path', 'severity']
    
    # Ensure that 'series_description' exists in the DataFrame before reordering columns
    if 'series_description' in df_final_filtered.columns:
        df_final_filtered = df_final_filtered[columns_order]
    else:
        print("Warning: 'series_description' column not found after merging.")
else:
    print("Warning: 'series_description' column not found in the input data.")
    
df_final_filtered.sample()

Unnamed: 0,study_id,series_id,series_description,instance_number,condition,level,x,y,image_path,severity
44457,3912497560,354120796,Sagittal T2/STIR,11,Spinal Canal Stenosis,L3/L4,203.203969,205.375064,/kaggle/input/rsna-2024-lumbar-spine-degenerat...,Severe


In [4]:
# Path to the output directory
output_path = '/kaggle/working/train_images/'

# Function to convert DICOM pixel array to PNG
def readdcm_writepng_image(src_dicom_pixelarray, dest_path_png):
    src_dicom_pixelarray = np.array(src_dicom_pixelarray)
    standardized_image_data = ((src_dicom_pixelarray - src_dicom_pixelarray.min()) / 
                               (src_dicom_pixelarray.max() - src_dicom_pixelarray.min() + 1e-10)) * 255
    standardized_image_data = standardized_image_data.astype(np.uint8)
    final_image_to_png = cv2.resize(standardized_image_data, (320, 320), interpolation=cv2.INTER_CUBIC)
    cv2.imwrite(dest_path_png, final_image_to_png)

# Remove previous output directory for fresh writing
if os.path.isdir(output_path):
    shutil.rmtree(output_path)

# Drop duplicates based on 'image_path' to ensure each image is converted only once
unique_images_df = df_final_filtered.drop_duplicates(subset='image_path')

# Create a new DataFrame to store paths to the converted images
df_png_paths = pd.DataFrame(columns=df_final_filtered.columns)

# Convert only unique labeled images
for index, row in tqdm(unique_images_df.iterrows(), total=len(unique_images_df)):
    study_id = row['study_id']
    # Apply the replacement to series_description
    series_description = row['series_description'].replace(' ', '_').replace('/', '_')
    instance_number = row['instance_number']
    
    # Construct the destination path for the PNG file
    dest_path = f'{output_path}/{study_id}/{series_description}/{instance_number}.png'
    
    # Ensure directory exists
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    
    # Read the DICOM image and convert it to PNG
    dicom_image = pydicom.dcmread(row['image_path'])
    readdcm_writepng_image(dicom_image.pixel_array, dest_path)
    
    # Copy the row and update the image path to the new PNG path
    new_row = row.copy()
    new_row['image_path'] = dest_path
    
    # Replace series_description in the new_row DataFrame
    new_row['series_description'] = series_description
    
    # Append the new row to the new DataFrame using pd.concat
    df_png_paths = pd.concat([df_png_paths, pd.DataFrame([new_row])], ignore_index=True)

print("Conversion to PNG completed.")

# Save the new DataFrame to a CSV file (optional)
df_png_paths.to_csv('/kaggle/working/df_png_paths.csv', index=False)


print("Dataframe saved.")

  df_png_paths = pd.concat([df_png_paths, pd.DataFrame([new_row])], ignore_index=True)
100%|██████████| 24546/24546 [10:22<00:00, 39.43it/s]


Conversion to PNG completed.
Dataframe saved.


In [15]:
from concurrent.futures import ThreadPoolExecutor
from albumentations import (
    HorizontalFlip, VerticalFlip, Rotate, RandomBrightnessContrast,
    ColorJitter, GridDistortion, RandomGamma, GaussNoise, Compose,
    CLAHE, Solarize, Posterize, ShiftScaleRotate, ElasticTransform,
    ToGray, HueSaturationValue
)

# Step 1: Initialise Paths
df_converted_data = pd.read_csv("/kaggle/working/df_png_paths.csv")
output_images_dir = '/kaggle/working/augmented_images'
csv_output_path = '/kaggle/working/df_augmented_final.csv'

# Ensure output directory exists
os.makedirs(output_images_dir, exist_ok=True)

# Step 2: Assume df_png_paths is already defined with the necessary data
# You need to format the series_description
df_augmented = df_png_paths.copy()
df_augmented['series_description'] = df_augmented['series_description'].str.replace(r'[ /]', '_', regex=True)

# Step 3: Define color map augmentation functions
def apply_color_map(image, colormap):
    return cv2.applyColorMap(image, colormap)

# Step 4: Define augmentation techniques
albumentations_augmentations = [
    Compose([Rotate(limit=90), HorizontalFlip()]),
    Compose([Rotate(limit=180)]),
    Compose([Rotate(limit=270), HorizontalFlip()]),
    Compose([ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2)]),
    Compose([GaussNoise(), VerticalFlip()]),
    Compose([GridDistortion()]),
    Compose([ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15)]),
    Compose([ElasticTransform(alpha=1, sigma=50, alpha_affine=None)]),  # Updated line
    Compose([CLAHE(), HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20)]),
    Compose([Solarize(threshold=128.0), Posterize(num_bits=4)]),
    Compose([ToGray()])
]

color_map_augmentations = [
    (cv2.COLORMAP_VIRIDIS, 'viridis'),
    (cv2.COLORMAP_PLASMA, 'plasma'),
    (cv2.COLORMAP_INFERNO, 'inferno'),
    (cv2.COLORMAP_MAGMA, 'magma'),
]

# Combine all augmentations into one list
all_augmentations = albumentations_augmentations + color_map_augmentations

# Define how many times to augment each image for Moderate and Severe classes
num_augmentations_per_image_severe = 10 # Augment each 'Severe' image 6 times
num_augmentations_per_image_moderate = 4  # Augment each 'Moderate' image 1 time

def augment_image(row):
    image_path = row['image_path']
    image = cv2.imread(image_path)  # Load the image using OpenCV

    # Check if the image was loaded successfully
    if image is None:
        print(f"Warning: Unable to load image at path: {image_path}")
        return []  # Return an empty list if the image could not be loaded

    coords = [row['x'], row['y']]  # Extract coordinates
    augmented_images = []  # Store augmented images for this row
    image_height, image_width = image.shape[:2]

    # Determine the number of augmentations based on severity
    if row['severity'] == 'Severe':
        num_augmentations = num_augmentations_per_image_severe
    elif row['severity'] == 'Moderate':
        num_augmentations = num_augmentations_per_image_moderate
    else:
        return []  # Skip if severity is not 'Moderate' or 'Severe'

    for _ in range(num_augmentations):
        # Choose an augmentation
        aug_index = np.random.choice(len(all_augmentations))
        aug = all_augmentations[aug_index]

        try:
            if isinstance(aug, tuple):
                # Apply the color map augmentation
                colormap, name = aug
                image_aug = apply_color_map(image, colormap)
                aug_name = name  # Use the color map name directly
            else:
                # Apply the Albumentations augmentation
                augmented = aug(image=image)
                image_aug = augmented['image']

                # Get the augmentation names
                aug_name = '_'.join([type(t).__name__ for t in aug.transforms])

                # Update coordinates based on the applied transformations
                for t in aug.transforms:
                    if isinstance(t, HorizontalFlip):
                        coords[0] = image_width - coords[0]
                    if isinstance(t, VerticalFlip):
                        coords[1] = image_height - coords[1]
                    if isinstance(t, Rotate):
                        angle = t.limit if isinstance(t.limit, (int, float)) else t.limit[1]
                        if angle == 90:
                            coords = [coords[1], image_width - coords[0]]
                        elif angle == 180:
                            coords = [image_width - coords[0], image_height - coords[1]]
                        elif angle == 270:
                            coords = [image_height - coords[1], coords[0]]

            # Create subfolder structure
            study_id = row['study_id']
            series_id = row['series_id']
            series_description = row['series_description'].replace(' ', '_')  # Replace spaces with underscores
            output_subfolder = os.path.join(output_images_dir, str(study_id), series_description)
            os.makedirs(output_subfolder, exist_ok=True)

            # Generate new file name with the augmentation name and instance number
            instance_number = row['instance_number']
            augmented_image_path = os.path.join(output_subfolder, f"{aug_name}_{instance_number}.png")

            # Save the augmented image
            cv2.imwrite(augmented_image_path, image_aug)

            augmented_images.append({
                'study_id': study_id,
                'series_id': series_id,
                'series_description': series_description,
                'instance_number': instance_number,
                'x': coords[0],
                'y': coords[1],
                'condition': row['condition'],
                'level': row['level'],
                'image_path': augmented_image_path,
                'severity': row['severity']
            })
        except Exception as e:
            print(f"Error processing image {image_path}: {e}")

    return augmented_images


# Step 7: Filter only Moderate and Severe classes for augmentation
df_filtered = df_augmented[df_augmented['severity'].isin(['Moderate', 'Severe'])]

# Step 8: Use parallel processing to augment images
augmented_data = []

with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(augment_image, [row for _, row in df_filtered.iterrows()]), total=len(df_filtered)))

# Flatten the results and filter out None values
augmented_data = [item for sublist in results for item in sublist if item is not None]

# Step 9: Collect the results into a DataFrame
df_augmented_final = pd.DataFrame(augmented_data)

# Save the augmented DataFrame to a CSV file
df_augmented_final.to_csv(csv_output_path, index=False)

print(f"Total processed images: {len(augmented_data)}")


100%|██████████| 5418/5418 [01:57<00:00, 46.03it/s]


Total processed images: 31494


In [124]:
final_merged_df = pd.read_csv("/kaggle/working/df_png_paths.csv")
final_merged_df

Unnamed: 0,study_id,series_id,series_description,instance_number,condition,level,x,y,image_path,severity
0,4003253,702807833,Sagittal_T2_STIR,8,Spinal Canal Stenosis,L1/L2,322.831858,227.964602,/kaggle/working/train_images//4003253/Sagittal...,Normal/Mild
1,4003253,1054713880,Sagittal_T1,4,Right Neural Foraminal Narrowing,L4/L5,187.961759,251.839388,/kaggle/working/train_images//4003253/Sagittal...,Moderate
2,4003253,1054713880,Sagittal_T1,5,Right Neural Foraminal Narrowing,L3/L4,187.227533,210.722753,/kaggle/working/train_images//4003253/Sagittal...,Moderate
3,4003253,1054713880,Sagittal_T1,6,Right Neural Foraminal Narrowing,L1/L2,194.569790,127.755258,/kaggle/working/train_images//4003253/Sagittal...,Normal/Mild
4,4003253,1054713880,Sagittal_T1,11,Left Neural Foraminal Narrowing,L1/L2,196.070671,126.021201,/kaggle/working/train_images//4003253/Sagittal...,Normal/Mild
...,...,...,...,...,...,...,...,...,...,...
24541,4290709089,3390218084,Axial_T2,21,Right Subarticular Stenosis,L5/S1,302.875911,364.627811,/kaggle/working/train_images//4290709089/Axial...,Normal/Mild
24542,4290709089,4237840455,Sagittal_T1,4,Right Neural Foraminal Narrowing,L2/L3,208.106799,140.203404,/kaggle/working/train_images//4290709089/Sagit...,Normal/Mild
24543,4290709089,4237840455,Sagittal_T1,5,Right Neural Foraminal Narrowing,L1/L2,219.405706,95.459321,/kaggle/working/train_images//4290709089/Sagit...,Normal/Mild
24544,4290709089,4237840455,Sagittal_T1,11,Left Neural Foraminal Narrowing,L1/L2,219.465940,97.831063,/kaggle/working/train_images//4290709089/Sagit...,Normal/Mild


In [143]:
from glob import glob

df_test_series = pd.read_csv("/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/test_series_descriptions.csv")
test_images_path = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/test_images'
output_path = '/kaggle/working/RSNA_test_images_png/'

# Function to convert DICOM pixel array to PNG
def readdcm_writepng_image(src_dicom_pixelarray, dest_path_png):
    src_dicom_pixelarray = np.array(src_dicom_pixelarray)
    standardized_image_data = ((src_dicom_pixelarray - src_dicom_pixelarray.min()) / 
                               (src_dicom_pixelarray.max() - src_dicom_pixelarray.min() + 1e-10)) * 255
    standardized_image_data = standardized_image_data.astype(np.uint8)
    final_image_to_png = cv2.resize(standardized_image_data, (320, 320), interpolation=cv2.INTER_CUBIC)
    cv2.imwrite(dest_path_png, final_image_to_png)

# Remove previous output directory for fresh writing
if os.path.isdir(output_path):
    shutil.rmtree(output_path)


# Iterate over the test data
for idx, row in tqdm(df_test_series.iterrows(), total=len(df_test_series)):
    study_id = row['study_id']
    series_id = row['series_id']
    series_desc = row['series_description'].replace(' ', '_').replace('/', '_')
    
    # Define the new directory structure for PNGs
    series_output_dir = f'{output_path}/{study_id}/{series_desc}'
    os.makedirs(series_output_dir, exist_ok=True)
    
    # Get all DICOM files in this series
    series_dicom_dir = f'{test_images_path}/{study_id}/{series_id}'
    dicom_files = glob(f'{series_dicom_dir}/*.dcm')
    
    # Convert each DICOM file to PNG
    for dicom_file in dicom_files:
        dicom_image = pydicom.dcmread(dicom_file)
        image_filename = os.path.splitext(os.path.basename(dicom_file))[0]  # Use SOPInstanceUID for naming
        image_dicom_pixelarray = dicom_image.pixel_array
        
        dest_path = f'{series_output_dir}/{image_filename}.png'
        readdcm_writepng_image(image_dicom_pixelarray, dest_path)

100%|██████████| 3/3 [00:02<00:00,  1.30it/s]


In [181]:
# read data
train_path = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/'
train  = pd.read_csv(train_path + 'train.csv')
label = pd.read_csv(train_path + 'train_label_coordinates.csv')
train_desc  = pd.read_csv(train_path + 'train_series_descriptions.csv')
test_desc   = pd.read_csv(train_path + 'test_series_descriptions.csv')
sub         = pd.read_csv(train_path + 'sample_submission.csv')

# Replace spaces and forward slashes in the 'series_description' column with underscores
test_desc['series_description'] = test_desc['series_description'].str.replace(r'[ /]', '_', regex=True)

# Display the updated DataFrame
print(test_desc.head(5))


   study_id   series_id series_description
0  44036939  2828203845        Sagittal_T1
1  44036939  3481971518           Axial_T2
2  44036939  3844393089   Sagittal_T2_STIR


In [182]:
import pandas as pd

# Create the row_id column
final_merged_df['row_id'] = (
    final_merged_df['study_id'].astype(str) + '_' +
    final_merged_df['condition'].str.lower().str.replace(' ', '_') + '_' +
    final_merged_df['level'].str.lower().str.replace('/', '_')
)

# Note: Check image path, since there's 1 instance id, for 1 image, but there's many more images other than the ones labelled in the instance ID. 

# Display the updated dataframe
final_merged_df.head(5)

Unnamed: 0,study_id,series_id,series_description,instance_number,condition,level,x,y,image_path,severity,row_id
0,4003253,702807833,Sagittal_T2_STIR,8,Spinal Canal Stenosis,L1/L2,322.831858,227.964602,/kaggle/working/train_images//4003253/Sagittal...,Normal/Mild,4003253_spinal_canal_stenosis_l1_l2
1,4003253,1054713880,Sagittal_T1,4,Right Neural Foraminal Narrowing,L4/L5,187.961759,251.839388,/kaggle/working/train_images//4003253/Sagittal...,Moderate,4003253_right_neural_foraminal_narrowing_l4_l5
2,4003253,1054713880,Sagittal_T1,5,Right Neural Foraminal Narrowing,L3/L4,187.227533,210.722753,/kaggle/working/train_images//4003253/Sagittal...,Moderate,4003253_right_neural_foraminal_narrowing_l3_l4
3,4003253,1054713880,Sagittal_T1,6,Right Neural Foraminal Narrowing,L1/L2,194.56979,127.755258,/kaggle/working/train_images//4003253/Sagittal...,Normal/Mild,4003253_right_neural_foraminal_narrowing_l1_l2
4,4003253,1054713880,Sagittal_T1,11,Left Neural Foraminal Narrowing,L1/L2,196.070671,126.021201,/kaggle/working/train_images//4003253/Sagittal...,Normal/Mild,4003253_left_neural_foraminal_narrowing_l1_l2


In [192]:
# Define the base path for test images
base_path = '/kaggle/working/RSNA_test_images_png'

# Function to get image paths for a series
def get_image_paths(row):
    series_path = os.path.join(base_path, str(row['study_id']), str(row['series_description']))
    if os.path.exists(series_path):
        return [os.path.join(series_path, f) for f in os.listdir(series_path) if os.path.isfile(os.path.join(series_path, f))]
    return []

# Mapping of series_description to conditions
condition_mapping = {
    'Sagittal_T1': {'left': 'left_neural_foraminal_narrowing', 'right': 'right_neural_foraminal_narrowing'},
    'Axial_T2': {'left': 'left_subarticular_stenosis', 'right': 'right_subarticular_stenosis'},
    'Sagittal_T2_STIR': 'spinal_canal_stenosis'
}

# Create a list to store the expanded rows
expanded_rows = []

# Expand the dataframe by adding new rows for each file path
for index, row in test_desc.iterrows():
    image_paths = get_image_paths(row)
    conditions = condition_mapping.get(row['series_description'], {})
    if isinstance(conditions, str):  # Single condition
        conditions = {'left': conditions, 'right': conditions}
    for side, condition in conditions.items():
        for image_path in image_paths:
            expanded_rows.append({
                'study_id': row['study_id'],
                'series_id': row['series_id'],
                'series_description': row['series_description'],
                'image_path': image_path,
                'condition': condition,
                'row_id': f"{row['study_id']}_{condition}"
            })

# Create a new dataframe from the expanded rows
expanded_test_desc = pd.DataFrame(expanded_rows)

# Display the resulting dataframe
expanded_test_desc.head(5)

Unnamed: 0,study_id,series_id,series_description,image_path,condition,row_id
0,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing
1,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing
2,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing
3,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing
4,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing


In [193]:
# Extracting the instance number from the image_path
expanded_test_desc['instance_number'] = expanded_test_desc['image_path'].apply(
    lambda x: int(os.path.splitext(os.path.basename(x))[0])  # Get the filename without extension
)

# Display the updated DataFrame
expanded_test_desc.head(2)

Unnamed: 0,study_id,series_id,series_description,image_path,condition,row_id,instance_number
0,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing,20
1,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing,15


In [194]:
# Function to generate image paths based on directory structure
def generate_image_paths(df, data_dir):
    image_paths = []
    for study_id, series_id in zip(df['study_id'], df['series_description']):
        study_dir = os.path.join(data_dir, str(study_id))
        series_dir = os.path.join(study_dir, str(series_id))
        images = os.listdir(series_dir)
        image_paths.extend([os.path.join(series_dir, img) for img in images])
    return image_paths

path='/kaggle/working'

# Generate image paths for train and test data
#train_image_paths = generate_image_paths(train_desc, f'{path}/train_images')
test_image_paths = generate_image_paths(test_desc, f'{path}/RSNA_test_images_png')

In [195]:
levels = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']

# Function to update row_id with levels
def update_row_id(row, levels):
    level = levels[row.name % len(levels)]
    return f"{row['study_id']}_{row['condition']}_{level}"

# Update row_id in expanded_test_desc to include levels
expanded_test_desc['row_id'] = expanded_test_desc.apply(lambda row: update_row_id(row, levels), axis=1)


In [210]:
expanded_test_desc

Unnamed: 0,study_id,series_id,series_description,image_path,condition,row_id,instance_number
0,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l1_l2,20
1,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l2_l3,15
2,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l3_l4,23
3,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l4_l5,22
4,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l5_s1,5
...,...,...,...,...,...,...,...
189,44036939,3844393089,Sagittal_T2_STIR,/kaggle/working/RSNA_test_images_png/44036939/...,spinal_canal_stenosis,44036939_spinal_canal_stenosis_l5_s1,2
190,44036939,3844393089,Sagittal_T2_STIR,/kaggle/working/RSNA_test_images_png/44036939/...,spinal_canal_stenosis,44036939_spinal_canal_stenosis_l1_l2,21
191,44036939,3844393089,Sagittal_T2_STIR,/kaggle/working/RSNA_test_images_png/44036939/...,spinal_canal_stenosis,44036939_spinal_canal_stenosis_l2_l3,18
192,44036939,3844393089,Sagittal_T2_STIR,/kaggle/working/RSNA_test_images_png/44036939/...,spinal_canal_stenosis,44036939_spinal_canal_stenosis_l3_l4,3


In [197]:
# Drop rows with severity equal to 0 or NaN
df_final_filtered_cleaned = df_png_paths[(df_png_paths['severity'] != 0) & (df_png_paths['severity'].notna())]
df_augmented_cleaned = df_augmented_final[(df_augmented_final['severity'] != 0) & (df_augmented_final['severity'].notna())]

# Display the resulting DataFrame
print(f"Data after removing rows with severity 0 or NaN: {df_final_filtered_cleaned.shape[0]} samples")
print(f"Data after removing rows with severity 0 or NaN: {df_augmented_cleaned.shape[0]} samples")

# Concatenate the cleaned DataFrames
df_concat = pd.concat([df_final_filtered_cleaned, df_augmented_cleaned], ignore_index=True)

# Check the class distribution after balancing
print(df_concat["severity"].value_counts())

Data after removing rows with severity 0 or NaN: 24526 samples
Data after removing rows with severity 0 or NaN: 31494 samples
severity
Normal/Mild    19108
Moderate       18905
Severe         18007
Name: count, dtype: int64


In [142]:
# List to store paths of corrupted files
corrupted_files = []

# Check each image in the dataset
for index, row in df_concat.iterrows():
    img_path = row['image_path']
    try:
        # Try to open the image file
        img = Image.open(img_path)
        img.verify()  # Verify that it is a valid image
    except (IOError, SyntaxError) as e:
        corrupted_files.append(img_path)

# Remove corrupted files from the DataFrame
df_concat_cleaned = df_concat[~df_concat['image_path'].isin(corrupted_files)]

# Create the final augmented DataFrame with cleaned data
df_dataset = df_concat_cleaned.copy()

# Print the number of corrupted files found and removed
print(f"Number of corrupted files removed: {len(corrupted_files)}")

# Print the number of valid rows in the final DataFrame
print(f"Number of valid rows in the final DataFrame: {df_dataset.shape[0]}")

Number of corrupted files removed: 6
Number of valid rows in the final DataFrame: 56014


In [166]:
num_classes = df_dataset['series_description'].nunique()
num_classes

3

In [167]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define your CNN model
def create_cnn_model(input_shape, num_classes):
    model = models.Sequential()
    
    # First convolutional block
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    
    # Second convolutional block
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    
    # Third convolutional block
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    
    # Fully connected layers
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    
    # Output layer for 3 classes (severity)
    model.add(layers.Dense(3, activation='softmax'))
    
    # Compile the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

# Set parameters
image_size = (256, 256)  # Target size for images
batch_size = 32
epochs = 10
input_shape = (*image_size, 3)  # Assuming RGB images
num_classes = 3  # Severity has 3 categories

# Assuming df_dataset is your DataFrame containing the dataset with 'severity' and 'series_description'

# Get unique series_descriptions from the DataFrame
series_descriptions = df_dataset['series_description'].unique()

# Iterate through each series_description
for series in series_descriptions:
    print(f"\nTraining model for series description: {series}")
    
    # Filter the DataFrame for the current series_description
    series_df = df_dataset[df_dataset['series_description'] == series]
    
    # Ensure we have 3 classes in the 'severity' column
    assert series_df['severity'].nunique() == 3, "The 'severity' column must have exactly 3 classes"
    
    # Split the data into train, validation, and test sets
    train_df, test_df = train_test_split(series_df, test_size=0.3, stratify=series_df['severity'])
    train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['severity'])

    # ImageDataGenerator setup
    datagen = ImageDataGenerator(rescale=1.0/255)

    # Create ImageDataGenerators for train, validation, and test sets
    train_generator = datagen.flow_from_dataframe(
        dataframe=train_df,
        x_col='image_path',
        y_col='severity',
        target_size=image_size,
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=True
    )

    val_generator = datagen.flow_from_dataframe(
        dataframe=val_df,
        x_col='image_path',
        y_col='severity',
        target_size=image_size,
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=False
    )

    test_generator = datagen.flow_from_dataframe(
        dataframe=test_df,
        x_col='image_path',
        y_col='severity',
        target_size=image_size,
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=False
    )

    # Create CNN model for the current series_description
    model = create_cnn_model(input_shape, num_classes)

    # Train the model
    history = model.fit(
        train_generator,
        validation_data=val_generator,
        epochs=epochs,
        verbose=1
    )

    # Save the model after training
    model.save(f"{series}_model.h5")

    # Evaluate the model on the test set
    test_loss, test_acc = model.evaluate(test_generator)
    print(f"Test Accuracy for {series}: {test_acc:.4f}")



Training model for series description: Sagittal_T2_STIR
Found 2230 validated image filenames belonging to 3 classes.
Found 558 validated image filenames belonging to 3 classes.
Found 1195 validated image filenames belonging to 3 classes.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


  self._warn_if_super_not_called()
I0000 00:00:1726963000.054291     857 service.cc:145] XLA service 0x7f56f4004a10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726963000.054359     857 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1726963000.054364     857 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m 2/70[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3s[0m 59ms/step - accuracy: 0.3828 - loss: 2.0181  

I0000 00:00:1726963008.280203     857 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 236ms/step - accuracy: 0.5350 - loss: 1.2093 - val_accuracy: 0.6470 - val_loss: 0.7872
Epoch 2/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 128ms/step - accuracy: 0.6845 - loss: 0.7445 - val_accuracy: 0.7348 - val_loss: 0.6047
Epoch 3/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 121ms/step - accuracy: 0.7886 - loss: 0.4881 - val_accuracy: 0.7849 - val_loss: 0.4821
Epoch 4/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 119ms/step - accuracy: 0.9004 - loss: 0.2630 - val_accuracy: 0.8387 - val_loss: 0.4729
Epoch 5/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 121ms/step - accuracy: 0.9434 - loss: 0.1501 - val_accuracy: 0.8584 - val_loss: 0.5801
Epoch 6/10
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 120ms/step - accuracy: 0.9696 - loss: 0.0885 - val_accuracy: 0.8799 - val_loss: 0.6669
Epoch 7/10
[1m70/70[0m [32m━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


  self._warn_if_super_not_called()


[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 181ms/step - accuracy: 0.5258 - loss: 1.0480 - val_accuracy: 0.5978 - val_loss: 0.8464
Epoch 2/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 130ms/step - accuracy: 0.6188 - loss: 0.8124 - val_accuracy: 0.6644 - val_loss: 0.7339
Epoch 3/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 128ms/step - accuracy: 0.7305 - loss: 0.6093 - val_accuracy: 0.7330 - val_loss: 0.6259
Epoch 4/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 129ms/step - accuracy: 0.8539 - loss: 0.3643 - val_accuracy: 0.7875 - val_loss: 0.6074
Epoch 5/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 130ms/step - accuracy: 0.9417 - loss: 0.1601 - val_accuracy: 0.8276 - val_loss: 0.6251
Epoch 6/10
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 132ms/step - accuracy: 0.9752 - loss: 0.0799 - val_accuracy: 0.8252 - val_loss: 0.8097
Epoch 7/10
[1m259/25

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


  self._warn_if_super_not_called()


[1m613/652[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m4s[0m 123ms/step - accuracy: 0.5065 - loss: 0.9994

2024-09-22 00:06:11.423016: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng4{k11=1} for conv (f32[20,32,127,127]{3,2,1,0}, u8[0]{0}) custom-call(f32[20,64,125,125]{3,2,1,0}, f32[64,32,3,3]{3,2,1,0}), window={size=3x3}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config":{"conv_result_scale":1,"activation_mode":"kNone","side_input_scale":0,"leakyrelu_alpha":0}} is taking a while...
2024-09-22 00:06:11.597616: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 1.17473602s
Trying algorithm eng4{k11=1} for conv (f32[20,32,127,127]{3,2,1,0}, u8[0]{0}) custom-call(f32[20,64,125,125]{3,2,1,0}, f32[64,32,3,3]{3,2,1,0}), window={size=3x3}, dim_labels=bf01_oi01->bf01, custom_call_target="__cudnn$convBackwardInput", backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"cudnn_conv_backend_config"

[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 163ms/step - accuracy: 0.5112 - loss: 0.9898 - val_accuracy: 0.6733 - val_loss: 0.7141
Epoch 2/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 149ms/step - accuracy: 0.7027 - loss: 0.6506 - val_accuracy: 0.7372 - val_loss: 0.5887
Epoch 3/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 143ms/step - accuracy: 0.8171 - loss: 0.4394 - val_accuracy: 0.8051 - val_loss: 0.5138
Epoch 4/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 144ms/step - accuracy: 0.9275 - loss: 0.2047 - val_accuracy: 0.8485 - val_loss: 0.4664
Epoch 5/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 146ms/step - accuracy: 0.9740 - loss: 0.0861 - val_accuracy: 0.8571 - val_loss: 0.5039
Epoch 6/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 150ms/step - accuracy: 0.9874 - loss: 0.0572 - val_accuracy: 0.8667 - val_loss: 0.5777
Epoch 7/10
[1m652/6

In [198]:
import os
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Assuming 'expanded_test_desc' already exists and contains the image paths and row IDs
# Create a directory for the models
models_directory = '/kaggle/working'  # Update with your actual models directory

# Initialize an empty list to collect predictions
submission_data = []

# Get unique series descriptions from the existing DataFrame
series_descriptions = expanded_test_desc['series_description'].unique()
series_df = expanded_test_desc[expanded_test_desc['series_description'] == series]
series_df

Unnamed: 0,study_id,series_id,series_description,image_path,condition,row_id,instance_number
0,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l1_l2,20
1,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l2_l3,15
2,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l3_l4,23
3,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l4_l5,22
4,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l5_s1,5
5,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l1_l2,12
6,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l2_l3,8
7,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l3_l4,13
8,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l4_l5,24
9,44036939,2828203845,Sagittal_T1,/kaggle/working/RSNA_test_images_png/44036939/...,left_neural_foraminal_narrowing,44036939_left_neural_foraminal_narrowing_l5_s1,17


In [211]:
# Loop through each unique series description
for series in series_descriptions:
    print(f"\nProcessing series: {series}")
    
    # Load the corresponding model
    model_path = os.path.join(models_directory, f"{series}_model.h5")
    
    try:
        model = load_model(model_path)
    except Exception as e:
        print(f"Error loading model for {series}: {e}")
        continue
    
    # Filter the DataFrame for the current series
    series_df = expanded_test_desc[expanded_test_desc['series_description'] == series]
    
    # Create a test data generator
    test_datagen = ImageDataGenerator(rescale=1./255)  # Rescale pixel values
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=expanded_test_desc,
        x_col='image_path',
        y_col=None,  # No labels for test data
        class_mode=None,
        target_size=(256, 256),  # Adjust to your model's expected input size
        batch_size=32,
        shuffle=False,
        seed=42
    )
    
    # Make predictions
    predictions = model.predict(test_generator, verbose=1)

    # Convert predictions to probabilities
    for idx, row in enumerate(series_df.itertuples()):
        row_id = row.row_id
        pred = predictions[idx]

        # Assuming the model outputs class scores for normal/mild, moderate, and severe
        normal_mild_prob = pred[0]  # Replace with appropriate index if necessary
        moderate_prob = pred[1]      # Replace with appropriate index if necessary
        severe_prob = pred[2]        # Replace with appropriate index if necessary

        # Append the results to the submission data
        submission_data.append({
            'row_id': row_id,
            'normal_mild': normal_mild_prob,
            'moderate': moderate_prob,
            'severe': severe_prob
        })

# Create a DataFrame for submission
submission_df = pd.DataFrame(submission_data)


# Set the display format for floating-point numbers to show decimals
pd.options.display.float_format = '{:.8f}'.format  # Change the number of decimal places as needed


# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

print("Submission DataFrame created successfully.")



Processing series: Sagittal_T1
Found 194 validated image filenames.


  self._warn_if_super_not_called()


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 84ms/step

Processing series: Axial_T2
Found 194 validated image filenames.
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step

Processing series: Sagittal_T2_STIR
Found 194 validated image filenames.
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 85ms/step
Submission DataFrame created successfully.


In [213]:
expanded_test_desc["row_id"].value_counts()

row_id
44036939_spinal_canal_stenosis_l4_l5               10
44036939_spinal_canal_stenosis_l5_s1               10
44036939_spinal_canal_stenosis_l3_l4               10
44036939_spinal_canal_stenosis_l2_l3               10
44036939_spinal_canal_stenosis_l1_l2               10
44036939_right_subarticular_stenosis_l4_l5         10
44036939_right_subarticular_stenosis_l3_l4         10
44036939_left_subarticular_stenosis_l2_l3          10
44036939_left_subarticular_stenosis_l1_l2          10
44036939_right_subarticular_stenosis_l5_s1          9
44036939_left_subarticular_stenosis_l4_l5           9
44036939_left_subarticular_stenosis_l3_l4           9
44036939_left_subarticular_stenosis_l5_s1           9
44036939_right_subarticular_stenosis_l1_l2          9
44036939_right_subarticular_stenosis_l2_l3          9
44036939_right_neural_foraminal_narrowing_l4_l5     5
44036939_left_neural_foraminal_narrowing_l2_l3      5
44036939_left_neural_foraminal_narrowing_l1_l2      5
44036939_right_neural

In [215]:
# Count occurrences of each unique row_id
row_id_counts = submission_df['row_id'].value_counts()

# Group by 'row_id' and calculate the mean for each group
mean_df = submission_df.groupby('row_id').mean().reset_index()

# Display the mean DataFrame
mean_df

Unnamed: 0,row_id,normal_mild,moderate,severe
0,44036939_left_neural_foraminal_narrowing_l1_l2,0.97693336,0.01920432,0.00386232
1,44036939_left_neural_foraminal_narrowing_l2_l3,0.80089253,9.997e-05,0.19900757
2,44036939_left_neural_foraminal_narrowing_l3_l4,0.71357715,0.08772567,0.19869719
3,44036939_left_neural_foraminal_narrowing_l4_l5,0.81340468,0.00016057,0.18643481
4,44036939_left_neural_foraminal_narrowing_l5_s1,0.98027766,0.00843001,0.01129233
5,44036939_left_subarticular_stenosis_l1_l2,0.6069653,0.06016701,0.33286768
6,44036939_left_subarticular_stenosis_l2_l3,0.60045409,0.06645779,0.33308813
7,44036939_left_subarticular_stenosis_l3_l4,0.27365351,0.31228238,0.41406414
8,44036939_left_subarticular_stenosis_l4_l5,0.36775663,0.60829788,0.02394545
9,44036939_left_subarticular_stenosis_l5_s1,0.44782338,0.43665195,0.11552463
