Fetching Data and preparing dataframe

In [None]:
from google.colab import files

# Upload your kaggle.json file.
# The return value is captured instead of being left as the cell's last
# expression: files.upload() returns {filename: file_bytes}, and echoing that
# dict would print the Kaggle API key into the saved notebook output.
import os

uploaded = files.upload()
if not uploaded:
    raise RuntimeError('No file was uploaded; kaggle.json is required to download the dataset.')

# Install the credentials where the Kaggle CLI looks for them. The uploaded
# file may have been renamed by the browser/Colab (e.g. "kaggle (2).json"),
# so take the uploaded content rather than relying on the file name.
kaggle_dir = os.path.join(os.path.expanduser('~'), '.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
credentials_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(credentials_path, 'wb') as credentials_file:
    credentials_file.write(next(iter(uploaded.values())))

# The Kaggle CLI warns when the key file is readable by other users.
os.chmod(credentials_path, 0o600)

Saving kaggle.json to kaggle (2).json


{'kaggle (2).json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Download the nih-chest-xrays/data dataset with the Kaggle CLI; per the cell
# output the archive is saved as data.zip and an up-to-date local copy is
# skipped unless --force is passed.
# NOTE(review): presumably requires Kaggle API credentials to be configured first - confirm.
!kaggle datasets download -d nih-chest-xrays/data

Dataset URL: https://www.kaggle.com/datasets/nih-chest-xrays/data
License(s): CC0-1.0
data.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import zipfile
import pandas as pd

# Open the downloaded archive and extract only what this notebook needs.
with zipfile.ZipFile('data.zip', 'r') as zip_ref:

    # List all files in the zip
    all_files = zip_ref.namelist()

    # Keep only the images_002 / images_003 folders and Data_Entry_2017.csv
    files_to_extract = [f for f in all_files if 'images_002' in f or 'Data_Entry_2017.csv' in f or 'images_003' in f]

    # Extract only the relevant files
    for file in files_to_extract:
        zip_ref.extract(file, 'data')  # Extract into 'data' folder

# Load the Data_Entry_2017.csv into a DataFrame
df = pd.read_csv('data/Data_Entry_2017.csv')

# Keep only the metadata rows whose image was actually extracted above.
# The folder name (images_002 / images_003) is NOT part of the image file name,
# so matching '_002' / '_003' against 'Image Index' selects rows by file-name
# suffix instead of by folder and leaves most rows without a file on disk.
# Zip member names always use '/' as the separator, so the last path component
# is the image file name.
extracted_image_names = {
    member.rsplit('/', 1)[-1]
    for member in files_to_extract
    if member.lower().endswith('.png')
}

# .copy() gives later cells an independent frame to add columns to
# (avoids pandas' SettingWithCopyWarning on a filtered view).
df = df[df['Image Index'].isin(extracted_image_names)].copy()

Deep Learning Data Prep

In [None]:
import tensorflow as tf
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import os

In [None]:
all_labels = ['Atelectasis', 'Consolidation', 'Mass', 'Infiltration', 'No Finding', 'Pneumothorax', 'Effusion', 'Nodule','Pleural_Thickening','Emphysema','Edema','Cardiomegaly','Fibrosis','Pneumonia','Hernia']

# Validate the inputs this cell reads *before* using them; a check placed after
# the columns have been created can never fail.
required_columns = ('Image Index', 'Finding Labels')
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise KeyError(f"Error: Missing required columns in the dataframe: {missing_columns}")

# Work on an independent copy so that adding columns never writes into a
# filtered view of another frame, and so re-running this cell is safe.
df = df.copy()

# One 0.0/1.0 column per finding: 1.0 when the label text occurs in the row's
# 'Finding Labels' string (plain substring match, no regex).
for label in all_labels:
    df[label] = df['Finding Labels'].str.contains(label, regex=False, na=False).astype(float)


directory_002 = 'data/images_002/images'
directory_003 = 'data/images_003/images'

# Resolve every image against what is really on disk. The folder an image lives
# in cannot be derived from its file name, so index the extracted directories
# once (file name -> full path) and look each row up in that index.
image_path_by_name = {}
for image_directory in (directory_002, directory_003):
    if not os.path.isdir(image_directory):
        continue
    for image_name in os.listdir(image_directory):
        image_path_by_name.setdefault(image_name, os.path.join(image_directory, image_name))

df['Image Path'] = df['Image Index'].map(image_path_by_name)

# Rows without a file on disk cannot be fed to the generators; drop them here
# explicitly rather than relying on them being skipped later.
df = df.dropna(subset=['Image Path'])

if df.empty:
    raise FileNotFoundError(
        f"No images listed in the dataframe were found under {directory_002!r} or {directory_003!r}"
    )

# Fraction of the dataframe held out for validation. Both generators must use
# the same value so the 'training' and 'validation' subsets are complementary.
VALIDATION_SPLIT = 0.2

# Training pipeline: rescale to [0, 1] plus light augmentation.
datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    validation_split=VALIDATION_SPLIT
)

# Validation pipeline: rescale only. Validation images must not be randomly
# rotated/shifted/flipped, otherwise val_loss is measured on perturbed data and
# changes from epoch to epoch for reasons unrelated to the model.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=VALIDATION_SPLIT
)


def _flow_subset(image_datagen, subset, shuffle):
    """Build a batch iterator over one subset ('training' or 'validation') of df.

    Images are read from the full paths in the 'Image Path' column (hence
    directory=None) and the targets are the raw multi-hot columns in all_labels.
    """
    return image_datagen.flow_from_dataframe(
        dataframe=df,
        directory=None,
        x_col='Image Path',
        y_col=all_labels,
        target_size=(224, 224),
        color_mode='rgb',
        class_mode='raw',
        batch_size=32,
        subset=subset,
        shuffle=shuffle
    )


train_generator = _flow_subset(datagen, 'training', shuffle=True)

# No shuffling for validation so batches come out in a stable order.
validation_generator = _flow_subset(val_datagen, 'validation', shuffle=False)


Found 1306 validated image filenames.
Found 326 validated image filenames.




Model

In [None]:
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Dense
from tensorflow.keras.models import Sequential

# MobileNet convolutional backbone without its classification head and without
# pretrained weights (weights=None), for 224x224 RGB inputs.
base_mobilenet_model = MobileNet(
    weights=None,
    include_top=False,
    input_shape=(224, 224, 3),
)

# Backbone -> global average pooling -> dropout -> 512-unit ReLU layer ->
# dropout -> one sigmoid output per label (multi-label prediction).
model = Sequential([
    base_mobilenet_model,
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(len(all_labels), activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy', 'mae'],
)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

def _describe_loss(loss):
    """Return a JSON-serialisable description of a compiled model's loss.

    Strings (and None) are returned unchanged; lists/tuples/dicts of losses are
    described element-wise; any other object (loss instance or function) is
    reduced to its `name`, `__name__`, or class name.
    """
    if loss is None or isinstance(loss, str):
        return loss
    if isinstance(loss, (list, tuple)):
        return [_describe_loss(item) for item in loss]
    if isinstance(loss, dict):
        return {key: _describe_loss(value) for key, value in loss.items()}
    label = getattr(loss, "name", None) or getattr(loss, "__name__", None)
    return str(label) if label else type(loss).__name__


def train_and_get_info(model, train_generator, validation_generator, epochs=25):
    """Train `model` and return a JSON-friendly summary of the run.

    Args:
        model: compiled model exposing `fit`, `get_weights`, `optimizer` and `loss`.
        train_generator: training data passed to `model.fit`.
        validation_generator: data passed to `model.fit` as `validation_data`.
        epochs: maximum number of epochs; training may stop earlier because of
            early stopping on `val_loss` (patience 3, best weights restored).

    Returns:
        dict with keys:
            "model_used": class name of the model,
            "optimizer_used": class name of the model's optimizer,
            "loss_function_used": the loss as a string (or a list/dict of
                strings), so the dict can always be passed to json.dump,
            "final_weights": all model weights flattened into one list of floats.
    """
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=epochs,
        callbacks=[early_stopping]
    )

    # Flatten every weight tensor and join them into a single 1-D vector.
    # np.concatenate raises on an empty sequence, so a model without weights
    # is reported as an empty list instead.
    weight_arrays = [np.asarray(w).ravel() for w in model.get_weights()]
    final_weights = np.concatenate(weight_arrays) if weight_arrays else np.array([])

    training_info = {
        "model_used": type(model).__name__,
        "optimizer_used": type(model.optimizer).__name__,
        "loss_function_used": _describe_loss(model.loss),
        "final_weights": final_weights.tolist()
    }

    return training_info

# Run a single training epoch with the model and generators defined above and
# keep the resulting summary dict.
training_info = train_and_get_info(
    model=model,
    train_generator=train_generator,
    validation_generator=validation_generator,
    epochs=1,
)

  self._warn_if_super_not_called()


41/41 ━━━━━━━━━━━━━━━━━━━━ 384s 9s/step - binary_accuracy: 0.8887 - loss: 0.3013 - mae: 0.1518 - val_binary_accuracy: 0.9368 - val_loss: 0.3858 - val_mae: 0.3094


In [None]:
import json
# Serialise the training summary and write it to training_info.json.
serialized_info = json.dumps(training_info)
with open('training_info.json', 'w') as out_file:
    out_file.write(serialized_info)