<a href="https://colab.research.google.com/github/la26092002/API_Blog_express_Mongodb/blob/main/PART4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive
# The mount point /content/drive is the root of the path the extracted
# features are written to at the end of this notebook.
from google.colab import drive
drive.mount('/content/drive')

# Install necessary libraries
# NOTE: the leading "!" is a notebook shell escape, not Python syntax, so this
# line only runs inside an IPython/Colab kernel. It must run before the
# SimpleITK / kagglehub imports below.
!pip install kagglehub SimpleITK

# Imports
import os
import numpy as np
import pandas as pd
import SimpleITK as sitk
from skimage.transform import resize
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.models import Model
import warnings
warnings.filterwarnings('ignore')

# Download dataset from KaggleHub
# `path` is used below as the root directory of the downloaded dataset.
import kagglehub
path = kagglehub.dataset_download("fanbyprinciple/luna-lung-cancer-dataset")
# Directory that is later walked recursively for .mhd volumes (the folder name
# is intentionally repeated in the path).
subset_path = os.path.join(path, 'seg-lungs-LUNA16/seg-lungs-LUNA16/')
# Annotation table; the feature extractor reads its 'seriesuid' and
# 'diameter_mm' columns to derive a label for each scan.
annotations_path = os.path.join(path, 'annotations.csv')
annotations_df = pd.read_csv(annotations_path)

# Load pre-trained ResNet50
# include_top=False drops the ImageNet classification head and pooling='avg'
# applies global average pooling, so the network emits one flat feature vector
# per 224x224x3 input image.
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg', input_shape=(224, 224, 3))
# Wraps base_model's own input and output, i.e. the same network under the
# name `model` that the feature extractor uses.
model = Model(inputs=base_model.input, outputs=base_model.output)

# Helper function: Load all non-zero slices
def load_all_nonzero_slices(filepath, img_size=(224, 224)):
    """Read an image volume and return its non-empty slices as 3-channel images.

    The file is read with SimpleITK and converted to a NumPy array, which is
    then iterated along its first axis. Slices that contain only zeros are
    skipped. Every remaining slice is cast to float32, min-max scaled on its
    own, resized to ``img_size`` and replicated into three identical channels.

    Args:
        filepath: Path of the image file handed to ``sitk.ReadImage``.
        img_size: Target ``(height, width)`` passed to ``skimage`` ``resize``.

    Returns:
        A list with one array per kept slice, in the original slice order.
        The list is empty when every slice is all-zero.
    """
    volume = sitk.GetArrayFromImage(sitk.ReadImage(filepath))

    def _as_rgb(plane):
        # Per-slice min-max scaling; the 1e-5 keeps a constant-valued slice
        # from dividing by zero.
        plane = plane.astype(np.float32)
        lowest = np.min(plane)
        highest = np.max(plane)
        scaled = (plane - lowest) / (highest - lowest + 1e-5)
        shrunk = resize(scaled, img_size, anti_aliasing=True)
        # Repeat the single grey channel three times along a new last axis.
        return np.stack([shrunk, shrunk, shrunk], axis=-1)

    return [_as_rgb(plane) for plane in volume if np.any(plane)]

# Helper function: Process a list of files
def extract_features_from_file_list(file_list, img_size=(224, 224)):
    """Extract one CNN feature vector per non-empty slice of every given volume.

    For each file the non-empty slices are loaded with
    ``load_all_nonzero_slices``, pushed through the module-level ``model`` and
    paired with a per-scan label taken from the module-level
    ``annotations_df``: 1 when any annotation row whose ``seriesuid`` equals
    the file name (without extension) has ``diameter_mm`` > 3, otherwise 0.
    Every slice of a scan receives that same label.

    Processing is best-effort: a file that raises is reported on stdout and
    skipped, and the remaining files are still processed.

    Args:
        file_list: Iterable of paths to image files.
        img_size: Target ``(height, width)`` forwarded to the slice loader.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays with one entry per
        processed slice. Both are empty when nothing could be processed.
    """
    features = []
    labels = []

    for file_path in file_list:
        # Bound before the try so the error message below always names the
        # file that actually failed.
        file = os.path.basename(file_path)
        try:
            file_id = os.path.splitext(file)[0]
            slices = load_all_nonzero_slices(file_path, img_size)

            # A scan may have several annotation rows; it is positive when
            # any of them exceeds 3 mm, not only the first one.
            matching = annotations_df[annotations_df['seriesuid'] == file_id]
            label = int((matching['diameter_mm'] > 3).any())

            if slices:  # np.stack rejects an empty list
                # One predict call per volume instead of one per slice:
                # Model.predict is built for batches and carries a large
                # fixed overhead when called inside a per-image loop.
                batch = np.stack([img_to_array(img) for img in slices])
                # NOTE(review): the slice loader min-max scales to [0, 1],
                # whereas Keras documents ResNet50 preprocess_input for
                # inputs in [0, 255]. Left as-is so these features stay
                # comparable with the other parts - confirm, and regenerate
                # every part together if the scaling is changed.
                batch = preprocess_input(batch)

                features.extend(model.predict(batch, verbose=0))
                labels.extend([label] * len(slices))

            print(f"✔ Processed: {file} | Slices used: {len(slices)}")

        except Exception as e:
            print(f"❌ Error processing {file}: {e}")

    return np.array(features), np.array(labels)

# Collect every .mhd path under subset_path (recursively), sorted so the
# four-way split is reproducible between runs.
mhd_files = sorted(
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(subset_path)
    for name in names
    if name.endswith(".mhd")
)

# Three equal quarters; the fourth also absorbs the remainder when the file
# count is not divisible by 4.
split_size = len(mhd_files) // 4
files_part1, files_part2, files_part3 = (
    mhd_files[k * split_size:(k + 1) * split_size] for k in range(3)
)
files_part4 = mhd_files[3 * split_size:]


Mounted at /content/drive
Collecting SimpleITK
  Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Downloading SimpleITK-2.4.1-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (52.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 52.3/52.3 MB 17.8 MB/s eta 0:00:00
Installing collected packages: SimpleITK
Successfully installed SimpleITK-2.4.1
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
94765736/94765736 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step


In [None]:
# Part 4 - extract features for the last quarter of the file list
features4, labels4 = extract_features_from_file_list(files_part4)

# One row per slice: the feature columns followed by a trailing 'label' column.
df4 = pd.DataFrame(features4).assign(label=labels4)

# Persist to the mounted Drive without the index column.
output_path4 = '/content/drive/MyDrive/lung_features_part4.csv'
df4.to_csv(output_path4, index=False)

print(f"✅ Features Part 4 saved to: {output_path4}")


✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.334022941831199910030220864961.mhd | Slices used: 156
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.334105754605642100456249422350.mhd | Slices used: 225
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.334166493392278943610545989413.mhd | Slices used: 194
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.334184846571549530235084187602.mhd | Slices used: 133
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.334517907433161353885866806005.mhd | Slices used: 260
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.335866409407244673864352309754.mhd | Slices used: 253
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.336102335330125765000317290445.mhd | Slices used: 223
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.336198008634390022174744544656.mhd | Slices used: 117
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.336225579776978874775723463327.mhd | Slices used: 124
✔ Processed: 1.3.6.1.4.1.14519.5.2.1.6279.6001.336894364358709782463716339027.mhd | Slices 