In [11]:
import pandas as pd
# Load the dataset's metadata table from a CSV file in the current working directory.
df = pd.read_csv('Data_Entry_2017.csv')

In [19]:
# Display the loaded table; the rendered output shows only its first and last rows.
df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,
...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168,
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,


### Using only a subset of the image files (15,000 training + 5,000 validation indices)

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
import pandas as pd
import os
import random

# --- Training hyperparameters ---
batch_size = 32            # number of images stacked into one batch
image_size = (224, 224)    # target size every image is resized to
epochs = 10                # number of epochs requested from model.fit

# --- Dataset location ---
data_dir = 'E:/archive'
csv_file = os.path.join(data_dir, 'Data_Entry_2017.csv')

# --- Label vocabulary ---
# The position of a name in this list is its index in the encoded label vector.
disease_list = [
    'Atelectasis',
    'Consolidation',
    'Infiltration',
    'Pneumothorax',
    'Edema',
    'Emphysema',
    'Fibrosis',
    'Effusion',
    'Pneumonia',
    'Pleural_Thickening',
    'Cardiomegaly',
    'Nodule',
    'Mass',
    'Hernia',
    'No Finding',
]

# Read the metadata CSV, then build a lookup from image file name
# ('Image Index' column) to its raw label string ('Finding Labels' column).
df = pd.read_csv(csv_file)
image_label_dict = dict(zip(df['Image Index'], df['Finding Labels']))

# Label preprocessing function
def preprocess_labels(labels_string, disease_list):
    """Encode a '|'-separated label string as a multi-hot vector.

    Args:
        labels_string: Label names joined by '|', e.g. 'Cardiomegaly|Effusion'.
            Whitespace around a label is tolerated, and empty segments (an
            empty string, a leading/trailing '|') are skipped.
        disease_list: Ordered list of known label names; element i of the
            result corresponds to disease_list[i].

    Returns:
        A float numpy array of length len(disease_list) with 1 at the position
        of every label present in labels_string and 0 everywhere else.

    Raises:
        ValueError: If a non-empty label is not found in disease_list.
    """
    one_hot_labels = np.zeros(len(disease_list))
    for raw_label in labels_string.split("|"):
        # Prefer the exact text so every input that used to work still maps to
        # the same index; only fall back to the trimmed text when it does not match.
        label = raw_label if raw_label in disease_list else raw_label.strip()
        if label in disease_list:
            one_hot_labels[disease_list.index(label)] = 1
        elif label:
            raise ValueError(
                f"Unknown label {label!r} in {labels_string!r}; "
                f"expected one of {list(disease_list)!r}"
            )
        # An empty segment carries no label, so it is skipped.
    return one_hot_labels

# Modified Preprocessing and Generator
def _iter_selected_images(data_dir, wanted):
    """Yield (image_path, image_index) for every file named in `wanted` found under data_dir/<folder>/images."""
    # Sorted traversal makes the batch order reproducible; os.listdir order is arbitrary.
    for folder in sorted(os.listdir(data_dir)):
        images_subdir = os.path.join(data_dir, folder, 'images')
        if not os.path.isdir(images_subdir):
            continue
        for image_file in sorted(os.listdir(images_subdir)):
            if image_file in wanted:
                yield os.path.join(images_subdir, image_file), image_file


def custom_generator(data_dir, image_label_dict, image_size, batch_size, image_indices, total_batches_per_epoch):
    """Endlessly yield (images, labels) batches for the selected image files.

    Each pass walks data_dir/<folder>/images/ and uses only files whose name is
    in image_indices. After total_batches_per_epoch batches have been yielded,
    the walk restarts from the first folder, so each group of
    total_batches_per_epoch batches starts from the same image.

    Args:
        data_dir: Root directory containing the per-folder 'images' subdirectories.
        image_label_dict: Mapping of image file name to its '|'-separated label
            string; files missing from the mapping are labelled 'No Finding'.
        image_size: Size passed to tf.image.resize for every image.
        batch_size: Number of images per yielded batch.
        image_indices: Iterable of image file names that belong to this split.
        total_batches_per_epoch: Number of batches after which the directory
            walk restarts from the beginning.

    Yields:
        A tuple (images, labels): `images` stacks batch_size decoded 3-channel
        images resized to image_size and passed through VGG16 preprocess_input;
        `labels` stacks the matching vectors from preprocess_labels.

    Raises:
        ValueError: If a complete pass over data_dir finds none of the
            requested images (the generator would otherwise loop forever
            without yielding).
    """
    # A set gives O(1) membership tests instead of scanning the index list per file.
    wanted = set(image_indices)

    images = []
    labels = []
    batches_generated = 0

    while True:
        found_any = False
        for image_path, image_index in _iter_selected_images(data_dir, wanted):
            found_any = True
            label = image_label_dict.get(image_index, 'No Finding')

            img = tf.io.read_file(image_path)
            img = tf.image.decode_png(img, channels=3)
            img = tf.image.resize(img, image_size)
            img = tf.keras.applications.vgg16.preprocess_input(img)
            images.append(img)
            # NOTE: disease_list is the module-level label vocabulary, not a parameter.
            labels.append(preprocess_labels(label, disease_list))

            if len(images) == batch_size:
                yield tf.stack(images), tf.stack(labels)
                images, labels = [], []
                batches_generated += 1

                if batches_generated == total_batches_per_epoch:
                    # Epoch complete: leave the whole walk (not just one folder)
                    # so the next batch starts again from the first image.
                    batches_generated = 0
                    break

        if not found_any:
            raise ValueError(
                f"No image listed in image_indices was found under {data_dir!r} "
                "(expected files at <folder>/images/<image file>)"
            )
        # A partially filled batch left at the end of a pass is carried into the next pass.

# Reproducibility: seed Python's RNG (used by random.shuffle for the split below)
# and TensorFlow's RNG before anything random happens in this cell.
random_seed = 42
random.seed(random_seed)
tf.random.set_seed(random_seed)

# Subset sizes (MAKE SURE TO WORK WITH A SMALL SUBSET FOR NOW)
num_train_samples = 15000
num_validation_samples = 5000

# Where the trained model is written
model_save_path = r"C:\Users\jdori\Downloads\chest_xray_model.h5"

# Model (VGG16)
base_model = tf.keras.applications.VGG16(include_top=False, weights='imagenet', input_shape=image_size + (3,))

# Freeze pre-trained layers (all but the last four stay fixed during training)
for layer in base_model.layers[:-4]:
    layer.trainable = False

# Head for our classification
x = Flatten()(base_model.output)
x = Dense(1024, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2=0.05), bias_regularizer='l1')(x)  # Regularization to prevent overfitting
x = Dropout(0.4)(x)  # Add a Dropout layer to prevent overfitting
# One sigmoid output per entry of disease_list, so the head always matches the label vectors
predictions = Dense(len(disease_list), activation='sigmoid')(x)

model = tf.keras.Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Data splitting: shuffle all image names, then take disjoint train / validation slices
all_image_indices = list(image_label_dict)
random.shuffle(all_image_indices)

train_indices = all_image_indices[:num_train_samples]
validation_indices = all_image_indices[num_train_samples:num_train_samples + num_validation_samples]

# Calculate batches per epoch
total_training_samples = len(train_indices)
total_validation_samples = len(validation_indices)
total_batches_per_epoch_train = total_training_samples // batch_size
total_batches_per_epoch_val = total_validation_samples // batch_size

# Train with the custom generator
train_data = custom_generator(data_dir, image_label_dict, image_size, batch_size, train_indices, total_batches_per_epoch_train)
validation_data = custom_generator(data_dir, image_label_dict, image_size, batch_size, validation_indices, total_batches_per_epoch_val)

print("Total Training Samples:", total_training_samples)
print("Batch Size:", batch_size)
print("Total Batches per Epoch (Train):", total_batches_per_epoch_train)

# Save the model after every epoch as well as at the end, so an interrupted run
# (the recorded run stopped with KeyboardInterrupt) still leaves a usable model on disk.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path)

# Train the model
model.fit(
    train_data,
    epochs=epochs,
    validation_data=validation_data,
    steps_per_epoch=total_batches_per_epoch_train,
    validation_steps=total_batches_per_epoch_val,
    callbacks=[checkpoint_callback]
)

# Save the model
model.save(model_save_path)



Total Training Samples: 15000
Batch Size: 32
Total Batches per Epoch (Train): 468
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 