# BoneawareAI

Authors: Karthik Subramanian, Charles Green, Sai Anurag Pichika, Saarang Prabhuram


## Setup

### Load Extensions

Before getting started we need to run some standard code to set up our environment. You'll need to execute this code again each time you start the notebook.

First, run this cell to load the [autoreload](https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html?highlight=autoreload) extension. This enables us to modify `.py` source files and reintegrate them into the notebook, ensuring a smooth editing and debugging experience.

In [None]:
# Load the IPython autoreload extension so that edits to imported `.py`
# source files are picked up by the notebook without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory

# Report the `total` memory figure from psutil, converted to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# A runtime with at least 20 GB is classified as "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

### Google Colab Setup
Next we need to run a few commands to set up our environment on Google Colab. If you are running this notebook on a local machine you can skip this section.

Run the following cell to mount your Google Drive. Follow the link and sign in to your Google account (the same account you used to store this notebook).

In [None]:
# Mount Google Drive at /content/drive (Colab only; prompts for authorisation).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Name of the project folder inside the Drive's "MyDrive" directory.
PROJECT_PATH = 'BoneawareAI'
GOOGLE_DRIVE_PATH = f'/content/drive/MyDrive/{PROJECT_PATH}'
# Make the project folder the working directory so that relative paths used
# later in the notebook resolve against it.
os.chdir(GOOGLE_DRIVE_PATH)
# Echo the working directory as the cell output.
os.getcwd()

In [None]:
import sys
# Put the Drive project folder on the module search path.
sys.path.append(GOOGLE_DRIVE_PATH) # this is important for the imports in the .py files to work

In [None]:
# Install the notebook's third-party dependencies into the kernel environment.
# NOTE(review): `configparser` ships with the Python 3 standard library --
# confirm the PyPI install is actually required.
%pip install pyyaml==5.4.1
%pip install boto3
%pip install configparser
%pip install torch

### Local Setup OR Google Drive
Run the cell below regardless of whether you are running on Google Colab (with Google Drive) or on a local machine.

In [None]:
# Detect the execution environment: Colab is recognised by the presence of
# the `google.colab` module among the already-imported modules.
import sys

isLocal = 'google.colab' not in sys.modules
if isLocal:
    # Running locally: use the current directory in place of the Drive path.
    GOOGLE_DRIVE_PATH = '.'
    print('Running locally.')
else:
    print(f'Running in google colab. Our path is `{GOOGLE_DRIVE_PATH}`')

### Imports

In [None]:
# Put the project's code folders on Python's module search path. Locally the
# folders are addressed through the parent directory; on Colab they are
# addressed relative to the current working directory.
import sys

if isLocal:
    module_dirs = ('../src', '../datasets', '../notebooks')
    done_message = 'Modules added correctly, locally.'
else:
    module_dirs = ('src', 'datasets', 'notebooks')
    done_message = 'Modules added correctly on colab.'

sys.path.extend(module_dirs)
print(done_message)

In [None]:
from image_utils import set_seed, MURADataset, load_data, confirm_images_and_labels, count_body_parts, count_positive_negative, count_body_parts_with_augmentations, analyze_models

In [None]:
from metrics import plot_confusion_matrix, plot_roc_curve, compute_class_weights, calculate_metrics, calculate_metrics_per_body_part, evaluate_model

In [None]:
import torch

# Pick the compute backend by preference: Apple MPS first, then CUDA, and
# finally the CPU. The result is kept as a plain string.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device = " + device)
if device == 'cpu':
    print("WARNING: Using CPU will cause slower train times")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#### Set Seed

Setting the seed makes the results reproducible. The seeding logic lives in `image_utils.py` (`set_seed`). If you want a different seed on every run, import `random` and pass a randomly generated number to `set_seed` instead of a fixed value.

In [None]:
# Fix the seed at 42 so that runs can be repeated with the same results.
set_seed(42)

In [None]:
import os
import torch
import random
import numpy as np
import pandas as pd
from torch import nn, optim
from torchvision.transforms import functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm

## Data Preprocessing
Download the dataset, then apply data augmentation to produce the final MURA dataset used for training.

In [None]:
# Fetch and unzip the MURA archive unless it is already present in the
# datasets folder (the download and extraction take time).
from data_loader import download_dataset
from constants import DATASETS_FOLDER, MURA_DATASET
from helpers.utils import unzip_file

# Locally the datasets folder lives in the parent of the working directory;
# on Colab it lives under the Drive project path.
if isLocal:
    parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
    datasets_folder = os.path.join(parent_dir, DATASETS_FOLDER)
else:
    datasets_folder = os.path.join(GOOGLE_DRIVE_PATH, DATASETS_FOLDER)

# Full path to the dataset archive inside the datasets folder.
dataset_path = os.path.join(datasets_folder, MURA_DATASET)

# Create the datasets folder if it is missing.
os.makedirs(datasets_folder, exist_ok=True)

# Only download and extract when the archive is not already on disk.
if os.path.exists(dataset_path):
    print(f"{MURA_DATASET} already exists in {DATASETS_FOLDER}. Skipping download.")
else:
    print(f"{MURA_DATASET} not found in {DATASETS_FOLDER}. Downloading and extracting...")
    download_dataset(MURA_DATASET, datasets_folder)
    unzip_file(dataset_path)

In [None]:
from helpers.model_utils import get_hyperparameters

# Retrieve all hyperparameters in one call. The values are used later for the
# optimizer (lr, weight_decay), training length (num_epochs), the schedulers
# (step_size, gamma, factor, patience) and the data loaders (batch_size).
lr, weight_decay, num_epochs, step_size, gamma, batch_size, factor, patience = get_hyperparameters()

In [None]:
# Locate the extracted MURA-v1.1 directory for the current environment.
data_dir = "../datasets/MURA-v1.1" if isLocal else os.path.join(datasets_folder, 'MURA-v1.1')

# Build the training and validation loaders with the configured batch size.
train_loader, valid_loader = load_data(data_dir, batch_size=batch_size)

In [None]:
# Smoke-test both loaders: print the size and labels of the first batch of
# each one, then stop iterating.
for heading, loader in (("Training Data:", train_loader), ("Validation Data:", valid_loader)):
    print(heading)
    for images, labels in loader:
        print(f"Batch size: {len(images)}, Labels: {labels}")
        break

In [None]:
# Keep direct handles to the datasets wrapped by the two DataLoaders; later
# cells pass them to the counting, class-weight and evaluation helpers.
train_dataset = train_loader.dataset
valid_dataset = valid_loader.dataset

# Report how many samples each dataset contains.
print(f"Number of samples in the training dataset: {len(train_dataset)}")
print(f"Number of samples in the validation dataset: {len(valid_dataset)}")

In [None]:
# Optional check (takes about 16 minutes locally). It does not need to be run: the images and labels can also be confirmed by inspecting the dataset directly.
#confirm_images_and_labels(train_dataset, "train")
#confirm_images_and_labels(valid_dataset, "valid")

In [None]:
# Body-part counts for the training and validation splits.
# NOTE(review): the helper's exact output is defined in image_utils -- confirm there.
count_body_parts(train_dataset, "train")
count_body_parts(valid_dataset, "valid")

In [None]:
# Body-part counts with augmentations taken into account. Both splits use
# num_augmentations=3 here; adjust the value as needed.
count_body_parts_with_augmentations(train_dataset, "train", num_augmentations=3)
count_body_parts_with_augmentations(valid_dataset, "valid", num_augmentations=3)

In [None]:
# Count positive/negative cases in the training dataset (num_augmentations=3).
count_positive_negative(train_dataset, "train", num_augmentations=3)

# Count positive/negative cases in the validation dataset (num_augmentations=3).
count_positive_negative(valid_dataset, "valid", num_augmentations=3)

### Other Datasets

## Model

### DenseNet

In [None]:
from helpers.model_utils import get_model

# Select the compute device with the same preference order as the earlier
# device cell (MPS, then CUDA, then CPU). Re-deriving it as CUDA/CPU only
# would silently push Apple-silicon runs onto the CPU. The value is kept as a
# torch.device, as in the original cell.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Instantiate the model architecture chosen by name and place it on `device`.
model_name = "vgg"  # "densenet", "resnet", "vgg", "custom_cnn1"
model = get_model(model_name, device)


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Dynamically compute class weights
def compute_class_weights(dataset):
    """
    Derive balanced class weights from the labels stored in a dataset.

    NOTE(review): this definition rebinds the `compute_class_weights` name
    imported from `metrics` earlier in the notebook -- confirm that the
    override is intended.

    Args:
        dataset (Dataset): Dataset exposing a `label_map` mapping whose
            values are the class labels.

    Returns:
        torch.Tensor: Float tensor holding one weight per distinct label,
            in the order returned by `np.unique`.
    """
    # Every value of label_map is a class label.
    label_values = list(dataset.label_map.values())
    distinct_classes = np.unique(label_values)

    # scikit-learn's 'balanced' mode computes the per-class weights.
    balanced = compute_class_weight(class_weight='balanced', classes=distinct_classes, y=label_values)

    # Hand the weights back as a float tensor for use with PyTorch losses.
    return torch.tensor(balanced, dtype=torch.float)

In [None]:
import torch.optim as optim

# 6. Define Loss, Optimizer, and Scheduler
# Compute class weights from the training dataset and move them to `device`.
weights = compute_class_weights(train_dataset).to(device)
# Binary cross-entropy on logits, with the weight at index 1 as pos_weight.
# NOTE(review): only weights[1] is used and weights[0] is ignored -- confirm
# this matches the intended class balancing.
criterion = nn.BCEWithLogitsLoss(pos_weight=weights[1])
# Adam optimizer using the lr and weight_decay returned by get_hyperparameters().
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
# Use ReduceLROnPlateau (mode='min') for adaptive learning rate adjustment;
# the StepLR alternative is kept commented out below.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases -- confirm
# the installed version still accepts it.
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=factor, patience=patience, verbose=True)


In [None]:
# Show the computed class-weight tensor. The weights are interpolated into the
# message; passing `{weights}` as a second argument would build a one-element
# set and print it wrapped in braces.
print(f"Computed Class Weights: {weights}")

In [None]:
from sklearn.metrics import (
    confusion_matrix, roc_auc_score, classification_report,
    precision_recall_fscore_support, cohen_kappa_score, roc_curve
)

In [None]:
from helpers.trainer import train_model
# 8. Train the Model
# The call returns the model together with a training history object.
model, train_history = train_model(model, criterion, optimizer, scheduler, train_loader, valid_loader, num_epochs=num_epochs, device=device)

In [None]:
model_name = "Custom_VGG13_BN.pth"
# Save the trained weights (state dict only) to the checkpoint file.
torch.save(model.state_dict(), model_name)
# Reload the weights onto the active device. `map_location=device` works
# whether `device` is a string or a torch.device, so no string comparison
# against the device is needed (such a comparison is unreliable when
# `device` is a torch.device object).
model.load_state_dict(torch.load(model_name, map_location=device))
# Switch the model to evaluation mode before running validation.
model.eval()

In [None]:
# Get number of parameters and model size for each listed checkpoint file.
# NOTE(review): the file listed here ("densenet_mura.pth") differs from the
# checkpoint saved earlier in this notebook ("Custom_VGG13_BN.pth") -- confirm
# which file is meant to be analysed.
models = ["densenet_mura.pth"]
results = analyze_models(models)

# `results` maps each file path to a dict with 'num_parameters' and
# 'model_size_mb' entries, which are printed per file.
for model_path, details in results.items():
    print(f"File: {model_path}")
    print(f"  Number of parameters: {details['num_parameters']}")
    print(f"  Model size: {details['model_size_mb']:.2f} MB")

In [25]:
# Evaluate on validation set using the same loss criterion as training, then
# render the returned metrics as the cell output.
metrics = evaluate_model(model, valid_loader, dataset=valid_dataset, criterion=criterion)
display(metrics)

## Visualizations

In [None]:
from visualizer import find_last_conv_layer, run_gradcam, GradCAM, run_gradcam_filtered, run_gradcam_for_path_person_or_bodypart

In [None]:
# Look up the model's last convolutional layer (its name and the layer
# itself); `target_layer` is passed to the Grad-CAM calls below.
layer_name, target_layer = find_last_conv_layer(model)
print(layer_name)
print(target_layer)


In [None]:
# Run Grad-CAM on the validation loader with num_images=2. Per the original
# note, the number of images generated is the smaller of num_images and the
# batch size, taken from the first batch.
run_gradcam(model, valid_loader, target_layer, class_names=["Normal", "Abnormal"], device=device, num_images=2)

In [None]:
# Body-part identifiers iterated over by the per-body-part Grad-CAM cell.
body_parts = ["XR_ELBOW", "XR_FINGER", "XR_FOREARM", "XR_HAND", "XR_HUMERUS", "XR_SHOULDER", "XR_WRIST"]

In [None]:
# Run filtered Grad-CAM once per body part, with n=1 for each call
# (raise `n` to request more per body part).
for x in body_parts:
    run_gradcam_filtered(
        model=model,
        dataloader=valid_loader,
        target_layer=target_layer,
        class_names=["Normal", "Abnormal"],
        body_part=x,
        n=1,
        device=device
    )

In [None]:
# Grad-CAM for one specific image path. The path must be written with
# forward slashes (/).
run_gradcam_for_path_person_or_bodypart(
    model=model,
    dataloader=valid_loader,
    target_layer=target_layer,
    class_names=["Normal", "Abnormal"],
    image_path="MURA-v1.1/valid/XR_ELBOW/patient11186/study1_positive/image1.png",
    device=device
)

In [None]:
# Grad-CAM for a specific body part of a specific person, selected by
# `person_id` together with `body_part`.
run_gradcam_for_path_person_or_bodypart(
    model=model,
    dataloader=valid_loader,
    target_layer=target_layer,
    class_names=["Normal", "Abnormal"],
    person_id="patient11185",
    body_part="XR_WRIST",
    device=device
)


In [None]:
# Grad-CAM for a specific person, selected by `person_id` only.
run_gradcam_for_path_person_or_bodypart(
    model=model,
    dataloader=valid_loader,
    target_layer=target_layer,
    class_names=["Normal", "Abnormal"],
    person_id="patient11188",
    device=device
)
