## Import Libraries

## Load Data - Create Train Generator - Train Model

In [1]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import Sequence, to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from PIL import Image

# Locations of the metadata CSV, the image folder and the bounding-box CSV
data_path = r"C:\Users\kelly\Desktop\New folder\Data_Entry_2017_v2020.csv"
image_dir = r"C:\Users\kelly\Desktop\New folder\images\image_com"
bbox_path = r"C:\Users\kelly\Desktop\New folder\BBox_List_2017.csv"

# Read both tables into memory
df, bbox_df = pd.read_csv(data_path), pd.read_csv(bbox_path)

# Collapse the findings into a binary target:
# 0 when the label is exactly 'No Finding', 1 for anything else
df['binary_label'] = df['Finding Labels'].ne('No Finding').astype('int64')

# Hold out 20% of the rows for validation, keeping the class ratio in both
# parts; the fixed seed makes the split reproducible.
# NOTE(review): the split is per row (image). If the CSV contains several rows
# for the same patient, a grouped split may be needed to avoid leakage - confirm.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['binary_label'],
)

# Data Generator with Bounding Box Cropping
class DataGenerator(Sequence):
    """Keras ``Sequence`` yielding ``(images, one_hot_labels)`` batches.

    Every image named in the ``'Image Index'`` column is opened from the image
    folder, cropped to its first bounding box when ``bbox_df`` lists one,
    resized to 224x224, forced to three channels and passed through ResNet50's
    ``preprocess_input``. Labels are read from the ``'binary_label'`` column
    and one-hot encoded over two classes.

    Args:
        dataframe: Rows to serve; needs ``'Image Index'`` and
            ``'binary_label'`` columns.
        bbox_df: Bounding-box table with ``'Image Index'``, ``'Bbox [x'``,
            ``'y'``, ``'w'`` and ``'h]'`` columns. It is indexed once at
            construction time, so later changes to it are not picked up.
        batch_size: Number of rows per batch (the last batch may be smaller).
        image_dir: Folder containing the images. When ``None`` the
            module-level ``image_dir`` is used.
        **kwargs: Forwarded to ``Sequence.__init__``.

    NOTE(review): only images listed in ``bbox_df`` are cropped. If those are
    all positive cases the crop itself may leak the label, and inference code
    that does not crop sees differently framed inputs - confirm this is
    intended.
    """

    def __init__(self, dataframe, bbox_df, batch_size=16, image_dir=None, **kwargs):
        super().__init__(**kwargs)
        self.dataframe = dataframe
        self.bbox_df = bbox_df
        self.batch_size = batch_size
        self.image_dir = image_dir
        # One dict lookup per image instead of scanning bbox_df for every image.
        self._bbox_by_image = self._index_first_boxes(bbox_df)

    @staticmethod
    def _index_first_boxes(bbox_df):
        """Map each image name to the (left, upper, right, lower) box of its first row."""
        columns = ('Image Index', 'Bbox [x', 'y', 'w', 'h]')
        boxes = {}
        for name, x, y, w, h in zip(*(bbox_df[col] for col in columns)):
            # setdefault keeps the first box seen when an image has several rows.
            boxes.setdefault(name, (x, y, x + w, y + h))
        return boxes

    @staticmethod
    def _to_three_channels(img_array):
        """Return ``img_array`` (height, width, channels) with exactly three channels."""
        channels = img_array.shape[-1]
        if channels == 3:
            return img_array
        if channels > 3:
            # Drop everything after the first three channels (e.g. alpha).
            return img_array[:, :, :3]
        # One channel (grayscale) or two (grayscale + alpha): replicate the
        # first channel so every image in the batch stacks to the same shape.
        return np.repeat(img_array[:, :, :1], 3, axis=-1)

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.dataframe) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(images, labels)``.

        ``images`` has shape ``(n, 224, 224, 3)`` and ``labels`` has shape
        ``(n, 2)``, where ``n`` is the number of rows in the batch.
        """
        start = idx * self.batch_size
        batch = self.dataframe.iloc[start:start + self.batch_size]
        # Fall back to the module-level ``image_dir`` when none was supplied.
        base_dir = image_dir if self.image_dir is None else self.image_dir

        imgs = []
        for name in batch['Image Index']:
            img = self.crop_image_to_bbox(os.path.join(base_dir, name), name)
            img_array = image.img_to_array(img.resize((224, 224)))
            img_array = self._to_three_channels(img_array)
            imgs.append(preprocess_input(img_array))  # ResNet50 preprocessing

        imgs = np.stack(imgs)  # Combine into a batch
        labels = to_categorical(batch['binary_label'].values, num_classes=2)
        return imgs, labels

    def crop_image_to_bbox(self, image_path, image_name):
        """Open ``image_path`` and crop it to the first bounding box of ``image_name``.

        Images without an entry in the bounding-box table are returned uncropped.
        """
        img = Image.open(image_path)
        box = self._bbox_by_image.get(image_name)
        return img if box is None else img.crop(box)

# Wrap the training and validation frames in batch generators
train_generator = DataGenerator(train_df, bbox_df)
val_generator = DataGenerator(val_df, bbox_df)

# Backbone: ResNet50 with ImageNet weights and without its classification top
base_model = ResNet50(include_top=False, weights='imagenet')

# Head: global average pooling -> 1024-unit ReLU -> dropout(0.5) -> 2-way softmax
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(2, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Mark every backbone layer as trainable so the whole network is fine-tuned
for layer in base_model.layers:
    layer.trainable = True

model.compile(
    optimizer=SGD(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for five epochs, evaluating on the validation generator after each one
model.fit(train_generator, epochs=5, validation_data=val_generator)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1fa2d43d930>

In [3]:
# Persist the trained weights using the 'tf' save format
model.save_weights(
    filepath="C://Users//kelly//Desktop//New folder//resnet1_bbox",
    save_format='tf',
)

In [4]:
# Persist the same weights a second time, to a file with an .h5 suffix
model.save_weights(
    filepath="C://Users//kelly//Desktop//New folder//resnet1_bbox.h5",
)

## Create Test Generator

In [6]:
class TestDataGenerator(Sequence):
    """Keras ``Sequence`` yielding batches of preprocessed images without labels.

    Each path is loaded as an RGB image resized to 224x224 and passed through
    ResNet50's ``preprocess_input``. No bounding-box cropping is applied.

    Args:
        image_paths: Ordered, sliceable collection of image file paths;
            batches follow this order.
        bbox_df: Not used when building batches; stored only so existing
            callers that pass it keep working.
        image_dir: Not used when building batches (``image_paths`` already
            holds full paths); stored for the same reason.
        batch_size: Number of images per batch (the last batch may be smaller).
        **kwargs: Forwarded to ``Sequence.__init__``.
    """

    def __init__(self, image_paths, bbox_df=None, image_dir=None, batch_size=16, **kwargs):
        super().__init__(**kwargs)
        self.image_paths = image_paths
        self.bbox_df = bbox_df
        self.image_dir = image_dir
        self.batch_size = batch_size

    @staticmethod
    def _load_image(img_path):
        """Load one image as a preprocessed array with three channels."""
        # color_mode='rgb' makes load_img return three channels for every
        # input, so no separate grayscale / alpha fix-up is required.
        img = image.load_img(img_path, color_mode='rgb', target_size=(224, 224))
        return preprocess_input(image.img_to_array(img))

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.image_paths) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch ``idx`` as an array of shape ``(n, 224, 224, 3)``."""
        start = idx * self.batch_size
        batch_paths = self.image_paths[start:start + self.batch_size]
        return np.stack([self._load_image(path) for path in batch_paths])

## Load Test Images

In [7]:
# Folder holding the evaluation X-rays
test_image_dir = r'C:\Users\kelly\Desktop\New folder\eval_xray_im'

# Full path of every directory entry, sorted so that each prediction can be
# matched back to its file name later on
test_image_paths = sorted(
    os.path.join(test_image_dir, entry) for entry in os.listdir(test_image_dir)
)

test_generator = TestDataGenerator(test_image_paths, bbox_df, image_dir)

## Predictions

In [8]:
# Run the model over the test generator and keep the raw per-class scores
predictions = model.predict(test_generator)
# Predicted class = index of the highest score in each row
predicted_classes = np.argmax(predictions, axis=1)

# np.save does not create missing folders, so make sure the target folder
# exists before writing the raw scores.
predictions_path = "C://Users//kelly//Desktop//New folder//resnet1_bbox//predictions_resnet1.npy"
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
np.save(predictions_path, predictions)



In [9]:
# Assemble the submission table: file name of each test image next to its
# predicted class
submission_df = pd.DataFrame(
    {
        'Id': list(map(os.path.basename, test_image_paths)),
        'Label': predicted_classes,
    }
)

In [10]:
# Bare expression: shows the submission table as the notebook cell's output
submission_df

Unnamed: 0,Id,Label
0,00000.jpg,0
1,00001.jpg,1
2,00002.jpg,1
3,00003.jpg,0
4,00004.jpg,1
...,...,...
5995,05995.jpg,1
5996,05996.jpg,1
5997,05997.jpg,0
5998,05998.jpg,1


In [11]:
# Write the submission table to CSV without the index column, then report
# the destination
submission_csv_path = 'submission.csv'
submission_df.to_csv(path_or_buf=submission_csv_path, index=False)
print(f"Submission file saved to {submission_csv_path}")

Submission file saved to submission.csv
