In [1]:
import numpy as np 
import pandas as pd 
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split


from keras.applications.densenet import DenseNet121
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model

from keras.models import load_model
import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import Adam


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


__________________________________________________________________________________________
# Dataset

Extracted from https://www.kaggle.com/datasets/nih-chest-xrays/data/data 
This dataset is a collection 112,120 chest X-ray images with 15 unique disease labels (including 'No Finding') from 30,805 unique patients. 
The images were sourced from the NIH system in the United Kingdom.
Labels were mined from radiological reports using natural language processing.



__________________________________________________________________________________________
# Problem Definition

### Background:
Chest X-ray (CXR) exam is one of the most frequent and cost-effective medical imaging examinations, especially for diagnosing issues related to the heart, lung, or thorax. 
Therefore, the sheer volume of CXRs generated is staggering. In the United States, an average of 236 CXRs per 1000 patients per year are performed, accounting for 25% of the annual total numbers of diagnostic imaging procedures
However, with an ageing population, the increase in the number CXRs is growing exponentially. However, growth of medical resources required to handle this increased strain is inadequate.
There is a severe lack of manpower already among doctors.\
To relieve doctors of their workload, AI can take over some of the simpler and more tedious tasks. 
One of those tasks is medical image screening.

### Objective:
From our dataset, we want to develop a program that can accurately identify and classify what disease(s) the CXR presents with. 
Note that one CXR can present with multiple conditions, and some conditions are more commonly found than others. 
This program aims to enhance diagnostic accuracy, reducing the time taken for interpretation, and supporting medical professionals in decision-making processes so as to reduce their already heavy workload.

________________________________________________________________________________
# Data Preperation & Cleaning



## 1. Load .csv into a dataframe with the image paths

In [2]:
# Load the metadata into a DataFrame
metadata_path = 'C:\\Users\\admin\\Desktop\\archive\\Data_Entry_2017.csv'
all_xray_df = pd.read_csv(metadata_path)

# Adjust the glob pattern to match the new subfolder structure and file naming pattern
image_folder_path = 'C:\\Users\\admin\\Desktop\\archive'
all_image_paths = {os.path.basename(x): x for x in 
                   glob(os.path.join(image_folder_path, 'images_0**', 'images', '*.png'))}

# Check if all files follow the expected naming pattern
expected_files = all_xray_df['Image Index'].tolist()
missing_files = [file for file in expected_files if file not in all_image_paths]
print(f"Number of files not found: {len(missing_files)}")

# Map Full Image Paths to the DataFrame
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)

# Print the number of missing image paths after mapping
nan_or_none_count = all_xray_df['path'].isna().sum()
print(f"Total count of rows without valid image paths: {nan_or_none_count}")

# Display the head of the DataFrame to verify the 'path' column
all_xray_df

Number of files not found: 0
Total count of rows without valid image paths: 0


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,,C:\Users\admin\Desktop\archive\images_001\imag...
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,,C:\Users\admin\Desktop\archive\images_001\imag...
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,,C:\Users\admin\Desktop\archive\images_001\imag...
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,,C:\Users\admin\Desktop\archive\images_001\imag...
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,,C:\Users\admin\Desktop\archive\images_001\imag...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168,,C:\Users\admin\Desktop\archive\images_012\imag...
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,,C:\Users\admin\Desktop\archive\images_012\imag...
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,,C:\Users\admin\Desktop\archive\images_012\imag...
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,,C:\Users\admin\Desktop\archive\images_012\imag...


In [3]:
# an extra column, 'unnamed:11' appeared. Check out what values are inside
print(all_xray_df['Unnamed: 11'].unique())

[nan]


In [4]:
# 'unamed:11' column only contains '[nan]', therefore it does not contain any useful data and can be dropped. 
# Likely to be an issue with the original .csv file
all_xray_df = all_xray_df.drop('Unnamed: 11', axis=1)

## 4. Split the 'labelled' columns into binary columns
Make it easier for the CNN to understand the classifications

In [5]:
# List of disease labels
disease_labels = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 
                  'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 
                  'Pneumonia', 'Pneumothorax', 'No Finding']

# Creating binary columns for each disease label using .loc
for label in disease_labels:
    all_xray_df.loc[:, label] = all_xray_df['Finding Labels'].apply(lambda findings: 
                                                                    1 if label in findings.split('|') else 0)

all_xray_df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,No Finding
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,1,0,0,0,0,0,0,0,0,0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,1
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,...,0,0,0,0,1,0,0,1,0,0
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,1
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,1
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,1


## 4.1 Check Multi-labels


In [6]:
# Count the number of rows with multiple labels
all_xray_df['label_count'] = all_xray_df['Finding Labels'].apply(lambda x: len(x.split('|')))
multiple_labels_count = (all_xray_df['label_count'] > 1).sum()

print(f"Number of rows with multiple labels: {multiple_labels_count}")


Number of rows with multiple labels: 20796


## 4.2 Remove rows with multiple labels


In [7]:
all_xray_df['label_count'] = all_xray_df['Finding Labels'].apply(lambda x: len(x.split('|')))

# Filter rows to keep only those with exactly one label
single_label_df = all_xray_df[all_xray_df['label_count'] == 1]

print(f"Number of rows with a single label: {len(single_label_df)}")

# Optionally, you can drop the 'label_count' column if it's no longer needed
single_label_df = single_label_df.drop(columns=['label_count'])

# Now, single_label_df contains only those rows with exactly one label

all_xray_df = single_label_df
all_xray_df

Number of rows with a single label: 91324


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,No Finding
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,1
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,...,0,0,1,0,0,0,0,0,0,0
5,00000003_001.png,Hernia,1,3,74,F,PA,2500,2048,0.168,...,0,0,1,0,0,0,0,0,0,0
6,00000003_002.png,Hernia,2,3,75,F,PA,2048,2500,0.168,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112114,00030801_000.png,No Finding,0,30801,39,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,1
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,1
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,1
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,1


# 5. Split into Train, Validate & Test (Accoding To Patient ID)

Our dataset contains multiple images for each patient. This could be the case, for example, when a patient has taken multiple X-ray images at different times during their hospital visits.

We cannot split randmomly as some images from the same patient may be found in different datasets. Aka Data Leakage, causes biased learning as CNN can recognise patient specific features, not the features of the conditions themselves. 

Therefore, we will split the images according to patient ID

In [8]:
# First, get a list of unique patient IDs and split this list into train + validation and test IDs
unique_patients = all_xray_df['Patient ID'].unique()
train_valid_ids, test_ids = train_test_split(unique_patients, test_size=0.2, random_state=2024)

# Now split the train + validation IDs into train IDs and validation IDs
train_ids, valid_ids = train_test_split(train_valid_ids, test_size=0.25, random_state=2024)  # 0.25 * 0.8 = 0.2

# Create the actual dataframes based on the split IDs
train_df = all_xray_df[all_xray_df['Patient ID'].isin(train_ids)]
valid_df = all_xray_df[all_xray_df['Patient ID'].isin(valid_ids)]
test_df = all_xray_df[all_xray_df['Patient ID'].isin(test_ids)]

print("Number of rows in train_df:", len(train_df))
print("Number of rows in valid_df:", len(valid_df))
print("Number of rows in test_df:", len(test_df))

# about 60/20/20 split

Number of rows in train_df: 54511
Number of rows in valid_df: 18487
Number of rows in test_df: 18326


In [9]:
# Check for data leakage by comparing unique patient identifiers.
def check_for_leakage(df1, df2, patient_col):
    df1_patients_unique = set(df1[patient_col].unique())
    df2_patients_unique = set(df2[patient_col].unique())
    
    patients_in_both_groups = df1_patients_unique.intersection(df2_patients_unique)
    
    # If there's any overlap in patient identifiers, there's leakage
    leakage = len(patients_in_both_groups) > 0
    return leakage

# Check for leakage between each pair of sets
leakage_between_train_test = check_for_leakage(train_df, test_df, 'Patient ID')
print(f"Leakage between train and test sets: {leakage_between_train_test}")

leakage_between_train_valid = check_for_leakage(train_df, valid_df, 'Patient ID')
print(f"Leakage between train and validation sets: {leakage_between_train_valid}")

leakage_between_valid_test = check_for_leakage(valid_df, test_df, 'Patient ID')
print(f"Leakage between validation and test sets: {leakage_between_valid_test}")

Leakage between train and test sets: False
Leakage between train and validation sets: False
Leakage between validation and test sets: False


## 6. Preparing Images with tf.data

- to resize images to the same size
- memory efficiency: loads iamges in batches
- preprocess images the same way the images were preprocessed when training the models (e.g., normalizing pixel values). Consistent preprocessing is essential for the model to perform correctly.

In [10]:
# On the first batch, the the train_ds was augmented through flips, rotations, and brightness changes
# augmentation helps prevent overfiutting, artificially expanding the size and diversity of a training dataset by creating modified versions of images in the dataset
# For this iteration of the neural network, augmentation was turned off

def load_image_and_process(filepath, label, augment=False):
    image = tf.io.read_file(filepath)
    image = tf.image.decode_png(image, channels=3)
    image = tf.image.resize(image, [512, 512])
    #if augment:
       # image = tf.image.random_flip_left_right(image)
       # image = tf.image.random_brightness(image, max_delta=0.1)  # Random brightness
    image = tf.cast(image, tf.float32) / 255.0
    return image, label

# Convert the label data to categorical format
train_labels = train_df[disease_labels].values
valid_labels = valid_df[disease_labels].values
test_labels = test_df[disease_labels].values

# Create a tf.data.Dataset from the DataFrame
train_ds = tf.data.Dataset.from_tensor_slices((train_df['path'].values, train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_df['path'].values, valid_labels))
test_ds = tf.data.Dataset.from_tensor_slices((test_df['path'].values, test_labels))

# Process the image and batch the datasets
batch_size = 32
train_ds = train_ds.map(load_image_and_process).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
valid_ds = valid_ds.map(load_image_and_process).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
test_ds = test_ds.map(load_image_and_process).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

________________________________________________________________________________________________________________________
# Machine Learning Techniques

### 1.1 Use Simple Model

In [11]:
from tensorflow.keras.models import Sequential

def lr_schedule(epoch):
    lr = 1e-3
    if epoch > 50:
        lr *= 1e-2
    elif epoch > 30:
        lr *= 1e-1
    print('Learning rate: ', lr)
    return lr

# Define the CNN model
def build_model(input_shape, num_classes):
    model = Sequential([
        Input(shape=input_shape),  # Explicit Input layer specifying the input shape,  ensuring that the model knows what kind of input it is receiving
        # First convolutional layer with 32 filters, a kernel size of 3x3, and ReLU activation
        Conv2D(32, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        
        # Second convolutional layer with 64 filters and a kernel size of 3x3, followed by a MaxPooling layer
        Conv2D(64, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        
        # Third convolutional layer with 64 filters and a kernel size of 3x3
        Conv2D(64, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        
        # Flatten the output of the convolutions to feed into the dense layer
        Flatten(),
        
        # Dense layer with 64 neurons
        Dense(64, activation='relu'),
        Dropout(0.5),  # Dropout for regularization
        
        # Output layer with 'num_classes' neurons, one for each class, using softmax activation for classification
        Dense(num_classes, activation='softmax')
    ])
    
    # Compile the model with a suitable optimizer, loss function, and metric for classification
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# images are 512x512 pixels with 3 color channels (RGB)
input_shape = (512, 512, 3)
num_classes = 15

# Build the model
model = build_model(input_shape, num_classes)

# Print the model summary to see the structure and parameters
model.summary()


In [None]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',  
    metrics=['accuracy']
)

epochs = 6

# Train the model
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=valid_ds
)

# Plot the training history
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.title('Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')

plt.show()



Epoch 1/6
[1m1704/1704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8737s[0m 5s/step - accuracy: 0.6479 - loss: 3.6291 - val_accuracy: 0.6337 - val_loss: 2.7413
Epoch 2/6
[1m1704/1704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8523s[0m 5s/step - accuracy: 0.6725 - loss: 1.5887 - val_accuracy: 0.6550 - val_loss: 1.6070
Epoch 3/6
[1m1704/1704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8521s[0m 5s/step - accuracy: 0.6733 - loss: 1.4053 - val_accuracy: 0.6555 - val_loss: 1.4584
Epoch 4/6
[1m1704/1704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8525s[0m 5s/step - accuracy: 0.6733 - loss: 1.3745 - val_accuracy: 0.6555 - val_loss: 1.4051
Epoch 5/6
[1m1704/1704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8677s[0m 5s/step - accuracy: 0.6733 - loss: 1.3715 - val_accuracy: 0.6555 - val_loss: 1.5024
Epoch 6/6
[1m 113/1704[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2:13:14[0m 5s/step - accuracy: 0.6329 - loss: 1.5065

### 3.3 Model Accuracy For Resnet:


In [None]:
import tensorflow as tf
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('best_model.keras')

# Assuming 'test_ds2' is your testing dataset which has already been defined and preprocessed
# Evaluate the model on the test set
results = model.evaluate(test_ds2)

# Print the test results
print(f"Test Loss: {results[0]}")
print(f"Test Accuracy: {results[1]*100:.2f}%")


In [None]:
def print_class_distribution(labels, predictions):
    # Calculate the percentage of positive examples for each class
    class_percentages = 100 * np.mean(predictions, axis=0)
    
    # Print out the percentage of positive examples for each class
    for label, percentage in zip(labels, class_percentages):
        print('{label}: {percentage:.2f}%'.format(label=label, percentage=percentage))

# And assuming you have a test set of true labels (not predictions) in a NumPy array
# Let's simulate this with random binary data for demonstration purposes:
test_Y = np.random.randint(2, size=(10000, len(all_labels)))  # Simulated binary labels for 100 samples

print("Classification Accuracy Per Disease Class")
print_class_distribution(all_labels, test_Y)

# 5. Summary of Model Accuracies 

First, we need to understand the terms in the models

In machine learning, training and validation loss are two important metrics that are used to evaluate the performance of a model.

1. Training Accuracy
- The proportion of correct predicitons made by the model on the train set

2. Training Loss: 
- Error or cost calculated on the training dataset, measuring how well the model fits the training data.
- However, a very low training loss can also indicate overfitting, where the model is too closely fitted to the training data, including the noise or fluctuations that do not generalize well to new data.
- Trends in Training loss and their meaning
    - Decreasing Training Loss: model is learning, becoming more accurate
    - Increasing training loss: excessively high learning rate, might cause the model's weights to diverge rather than converge to a minimum.
    - Fluctuating Training Loss: possible causes include high variance in batch data, too high learning rate. Suggests that the model is struggling to find a stable path toward convergence.
    - Plateauing Training Loss: model has reached its capacity, no further learning. Could mean model is underfitting, needs more complex models/features

3. Validation Accuracy
- The proportion of correct predicitons made by the model on the train set

5. Validation Loss:
- Error calculated on validation set.
- It measures how well the model is able to generalize to new, unseen data. A validation set acts as a proxy for the test set, providing an unbiased evaluation of a model fit during the training phase.
- Trends in Validation loss and their meaning
    - Decreasing validation loss is desirable, means that model is learning and improving its ability to generalize to new data
    - Increasing validation loss: model might be starting to overfit the training data. Model is performing worse on the validation set over time, possibly due to memorizing the training data rather than learning to generalize.
    - Fluctuating Validation Loss: learning rate might be too high
    - Plateauing Validation Loss: model may have reached its learning capacity given the current architecture and hyperparameters. Need changes in model or data setup


### 5.1: Simple CNN Model From Keras

| Model      | Epoch number | Training accuracy | Training loss | Val_accuracy | Val_loss |
|------------|--------------|-------------------|---------------|--------------|----------|
| Simple CNN | Epoch 1/5    | 0.4938            | 0.2718        | 0.5427       | 0.2718   |
| Simple CNN | Epoch 2/5    | 0.5578            | 0.2213        | 0.5427       | 0.2213   |
| Simple CNN | Epoch 3/5    | 0.5575            | 0.2163        | 0.5427       | 0.2163   |
| Simple CNN | Epoch 4/5    | 0.5574            | 0.2123        | 0.5427       | 0.2123   |
| Simple CNN | Epoch 5/5    | 0.5578            | 0.2138        | 0.5427       | 0.2138   |



![alternatvie text](https://github.com/quammenn/Test/assets/145742237/ab3abcf4-fcc6-4572-9264-c1fc3795617d)


### 5.2: Deeper CNN Model From Keras

| Model      | Epoch number | Training accuracy | Training loss | Val_accuracy | Val_loss |
|------------|--------------|-------------------|---------------|--------------|----------|
| Deeper CNN | Epoch 1/3    | 0.4228            | 0.3585        | 0.5324       | 0.2251   |
| Deeper CNN | Epoch 2/3    | 0.5539            | 0.2116        | 0.4696       | 0.2972   |
| Deeper CNN | Epoch 3/3    | 0.5543            | 0.2094        | 0.5357       | 0.2079   |


Classification Accuracy On The Test Set
| Atelectasis | Cardiomegaly | Consolidation | Edema | Effusion | Emphysema | Fibrosis | Infiltration | Mass | Nodule | Pleural_Thickening | Pneumonia | Pneumothorax | No Finding | Hernia |
|-------------|--------------|---------------|-------|----------|-----------|----------|--------------|------|--------|--------------------|-----------|--------------|------------|--------|
| 52.70%      | 49.50%       | 50.00%        | 52.30%| 50.60%   | 50.90%    | 49.30%   | 48.80%       | 49.10%| 49.60% | 48.00%             | 49.60%    | 48.40%       | 50.00%     | 47.50% |


Average = !!!

![alternatvie text](https://github.com/quammenn/Test/assets/145742237/7936ab9f-d6c1-4a57-9abf-15afa6ac2a8b)


### 5.3: Resnet Model

| Model  | Epoch number  | Training Accuracy | Training Loss | Val_accuracy | Val_loss    |
|--------|---------------|-------------------|---------------|--------------|-------------|
| Resnet | 1/30          | 0.3586            | 205.3530      | 0.5427       | 827.4474    |
| Resnet | 2/30          | 0.3435            | 1055.4363     | 0.3407       | 1752.7198   |
| Resnet | 3/30          | 0.3449            | 1823.3147     | 0.3820       | 2185.0613   |
| Resnet | 4/30          | 0.3423            | 2498.9282     | 0.0286       | 4296.1328   |
| Resnet | 5/30          | 0.3379            | 3135.7000     | 0.4505       | 3489.4927   |
| Resnet | 6/30          | 0.3374            | 3925.7170     | 0.5427       | 3969.1743   |
| Resnet | 7/30          | 0.3405            | 4481.8721     | 0.0286       | 5774.4551   |
| Resnet | 8/30          | 0.3416            | 4580.1562     | 0.5427       | 6893.0078   |
| Resnet | 9/30          | 0.3365            | 5357.1406     | 0.0198       | 7335.9761   |
| Resnet | 10/30         | 0.3344            | 6085.6196     | 0.5427       | 7325.1431   |
| Resnet | 11/30         | 0.3342            | 6684.6646     | 0.1288       | 8344.4570   |
| Resnet | early stopping| 0.1344            | 7964.3359     | —            | —           |




Test Loss: 783.431396484375
Test Accuracy: 53.46%

Classification Accuracy On The Test Set
| Atelectasis | Cardiomegaly | Consolidation | Edema | Effusion | Emphysema | Fibrosis | Infiltration | Mass | Nodule | Pleural_Thickening | Pneumonia | Pneumothorax | No Finding | Hernia |
|-------------|--------------|---------------|-------|----------|-----------|----------|--------------|------|--------|--------------------|-----------|--------------|------------|--------|
| 49.85%      | 50.87%      | 49.94%        | 49.53%| 50.90%   | 48.91%    | 49.72%   | 49.33%       | 49.73%| 51.01% | 49.39%           | 49.83%    | 49.72%   | 49.79%     | 50.27% |

![4Capture](https://github.com/quammenn/Test/assets/145742237/ee65041e-6445-4d5f-8b7c-6be769d0de7e)


### 5.4: Resnet Model With Class Weights

| Model            | Epoch number | Training Accuracy | Training Loss | Val_accuracy | Val_loss |
|------------------|--------------|-------------------|---------------|--------------|----------|
| Resnet w Weights | 1            | 0.1528            | 0.2806        | 0.42541      | 0.2270   |
| Resnet w Weights | 2            | 0.2011            | 0.2058        | 0.44231      | 0.2229   |
| Resnet w Weights | 3            | 0.2270            | 0.2004        | 0.4335       | 0.2238   |
| Resnet w Weights | 4            | 0.2274            | 0.1974        | 0.4297       | 0.2247   |
| Resnet w Weights | 5            | 0.2533            | 0.1928        | 0.4252       | 0.2239   |
| Resnet w Weights | 6            | 0.2552            | 0.1893        | 0.4230       | 0.2226   |
| Resnet w Weights | 7            | 0.2549            | 0.1858        | 0.4190       | 0.2234   |
| Resnet w Weights | 8            | 0.2686            | 0.1825        | 0.4269       | 0.2219   |
| Resnet w Weights | 9            | 0.2636            | 0.1803        | 0.4085       | 0.2250   |
| Resnet w Weights | 10           | 0.2744            | 0.1776        | 0.4009       | 0.2268   |
| Resnet w Weights | 11           | 0.2749            | 0.1773        | 0.4000       | 0.2265   |
| Resnet w Weights | 12           | 0.2766            | 0.1752        | 0.4037       | 0.2267   |
| Resnet w Weights | 13           | 0.2807            | 0.1740        | 0.3991       | 0.2270   |
| Resnet w Weights | 14           | 0.2847            | 0.1719        | 0.4111       | 0.2259   |
| Resnet w Weights | 15           | 0.2840            | 0.1707        | 0.4112       | 0.2255   |


Test Loss: 0.2279388 
Test Accuracy: 57.66%

Classification Accuracy On The Test Set
| Atelectasis | Cardiomegaly | Consolidation | Edema | Effusion | Emphysema | Fibrosis | Infiltration | Mass | Nodule | Pleural_Thickening | Pneumonia | Pneumothorax | No Finding | Hernia |
|-------------|--------------|---------------|-------|----------|-----------|----------|--------------|------|--------|--------------------|-----------|--------------|------------|--------|
| 50.47%      | 50.88%      | 52.12%        | 51.76%| 50.81%   | 50.22%    | 52.88%   | 50.23%       | 52.49%| 51.94% | 51.36%             | 50.85%    | 52.95%       | 51.09%     | 52.33% |


In [None]:
metadata_path = 'C:\\Users\\admin\\Desktop\\archive\\Results.csv'
results_df = pd.read_csv(metadata_path)
results_df

### 5.4: Simple Model With Single Labels


| Model            | Epoch number | Training Accuracy | Training Loss | Val_accuracy | Val_loss |
|------------------|--------------|-------------------|---------------|--------------|----------|
| Simple Single Label | 1            | 0.6479            | 3.6291        | 0.6337       | 2.7413   |
| Simple Single Label | 2            | 0.6725            | 1.5887        | 0.6550       | 1.6070   |
| Simple Single Label | 3            | 0.6733            | 1.4053        | 0.6555       | 1.4584   |
| Simple Single Label | 4            | 0.6733            | 1.3745        | 0.6555       | 1.4051   |
| Simple Single Label | 5            | 0.6733            | 1.3715        | 0.6555       | 1.5024   |



Test Loss: 0.111356
Test Accuracy: 64.92%

| Atelectasis | Cardiomegaly | Consolidation | Edema | Effusion | Emphysema | Fibrosis | Infiltration | Mass | Nodule | Pleural_Thickening | Pneumonia | Pneumothorax | No Finding | Hernia |
|-------------|--------------|---------------|-------|----------|-----------|----------|--------------|------|--------|--------------------|-----------|--------------|------------|--------|
| 66.23%      | 67.17%       | 64.87%        | 66.45%| 67.51%   | 65.92%    | 66.73%   | 65.38%       | 67.12%| 64.99% | 65.77%             | 66.60%    | 67.33%       | 65.94%      | 67.02% |

