# Extract and label Images

In [1]:
import os
import shutil
from PIL import Image, UnidentifiedImageError
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models, transforms, datasets

# Function to extract and label images by modality
def prepare_data(source_dir, target_dir, modalities):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for root, dirs, files in os.walk(source_dir):
        for folder in dirs:
            if folder in modalities:
                modality_folder = os.path.join(root, folder)
                target_modality_dir = os.path.join(target_dir, folder)
                os.makedirs(target_modality_dir, exist_ok=True)

                for file in os.listdir(modality_folder):
                    folder_path = os.path.join(modality_folder, file)
                    if os.path.isdir(folder_path):
                        for image in os.listdir(folder_path):
                            file_path = os.path.join(folder_path, image)
                            target_path = os.path.join(target_modality_dir, file)
                            shutil.copy(file_path, target_modality_dir)
                            

# Source and target directories
#source_dir = '/mloscratch/homes/tagemoua/scrap_radiopaedia/radiopaedia'
#target_dir = '/mloscratch/homes/tagemoua/MultiMeditron/processed_data'
#modalities = 'Ultrasound'

# Prepare the dataset
#prepare_data(source_dir, target_dir, modalities)

import json
import os
import shutil

# prepare data chexpert

def prepare_data_chexpert(source_dir, target_dir, modality):
    with open(source_dir, 'r') as f:
        lines = f.readlines()
        for line in lines:
            line_data = json.loads(line.strip())  # Remove whitespace and parse JSON
            jsonl_image_path = line_data['modalities'][0]['value']
            image_path = os.path.join('/mloscratch/homes/multimeditron/dataset/image/PMC-OA', jsonl_image_path)
            #jsonl_image_path = jsonl_image_path.split('/')
            
            target_path = os.path.join(target_dir, modality)
            # Extracting the study ID (assuming it's part of the image path or metadata)
            #study_id = jsonl_image_path[-2] # Replace with actual field if needed
            
            # Create a target directory based on modality
            target_path = os.path.join(target_dir, modality)
            
            # Get the base image filename and create a new filename
            base_name = os.path.basename(image_path)
            new_image_name = f"{study_id}_{base_name}"
            new_image_path = os.path.join(target_path, new_image_name)
            os.makedirs(target_path, exist_ok=True)
            

            print(f"Copying {image_path} to {new_image_path}")
            try:
                shutil.copy(image_path, new_image_path)
                print("Image copied successfully")
            except Exception as e:
                print(f"An error occurred: {e}")
                if image_path.lower().endswith(('.png')):
                    try:
                        new_image_path = image_path.replace('.jjpg')
                        shutil.copy(new_image_path, target_path)
                        print("Image copied successfully")
                    except Exception as e:
                        print(f"An error occurred: {e}")

#source_dir = '/mloscratch/homes/multimeditron/dataset/image/PMC-OA.jsonl'

#target_dir = '/mloscratch/homes/tagemoua/MultiMeditron/processed_data'
#modality = 'General Medecine'
#prepare_data(source_dir, target_dir, modality)



  from .autonotebook import tqdm as notebook_tqdm


# Create subset of the data

In [2]:
import os
import shutil

# Function to create a subset of image files

def create_subset(input_dirs, output_dir, subset_size=100):
    """
    Create a subset of image files from multiple directories.

    :param input_dirs: List of input directories containing image files.
    :param output_dir: Target directory to save the subset.
    :param subset_size: Number of images to take from each input directory.
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists

    for input_dir in input_dirs:
        if not os.path.exists(input_dir):
            print(f"Directory {input_dir} does not exist. Skipping.")
            continue
        
        # List all files in the current directory
        files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
        
        # Limit to the subset size
        subset_files = files[:subset_size]

        # Create a subfolder in the output directory for this class (same name as input folder)
        class_name = os.path.basename(input_dir)
        class_output_dir = os.path.join(output_dir, class_name)
        os.makedirs(class_output_dir, exist_ok=True)

        # Copy the subset files to the output directory
        for file in subset_files:
            src_path = os.path.join(input_dir, file)
            dst_path = os.path.join(class_output_dir, file)
            shutil.copyfile(src_path, dst_path)

        print(f"Copied {len(subset_files)} files from {input_dir} to {class_output_dir}")

         



# Train a model

In [18]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models, transforms, datasets
from PIL import Image, UnidentifiedImageError

target_dir = '/mloscratch/homes/tagemoua/MultiMeditron/processed_data'

import shutil




new_data = '/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset'
# Dataset preparation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

data = datasets.ImageFolder(root=new_data, transform=transform)


# Split the dataset into training and validation sets
train_size = int(0.8 * len(data))
val_size = len(data) - train_size
train_data, val_data = torch.utils.data.random_split(data, [train_size, val_size])

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

# Model preparation
model = models.resnet50(pretrained=True)
num_classes = len(data.classes)
model.fc = nn.Linear(model.fc.in_features, num_classes)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model with early stopping based on loss and accuracy
num_epochs = 10
best_accuracy = 0.0
prev_loss = float('inf')  # Initialize with a high value

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Compute epoch loss and accuracy
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

    # Early stopping condition
    if epoch > 0 and epoch_loss > prev_loss and epoch_accuracy <= best_accuracy:
        print(f"Stopping early at epoch {epoch+1} due to no improvement in accuracy and an increase in loss.")
        break

    # Update best accuracy and previous loss for comparison
    best_accuracy = max(best_accuracy, epoch_accuracy)
    prev_loss = epoch_loss


    # Validation step
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_accuracy = 100 * correct / total
    print(f"Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.2f}%")

# Save the trained model
torch.save(model.state_dict(), 'modality_classifier.pth')






  from .autonotebook import tqdm as notebook_tqdm
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/runai-home/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 322MB/s]


Epoch [1/10], Loss: 0.1634, Accuracy: 94.80%
Validation Loss: 0.0917, Accuracy: 97.51%
Epoch [2/10], Loss: 0.0625, Accuracy: 98.06%
Validation Loss: 0.0900, Accuracy: 97.23%
Epoch [3/10], Loss: 0.0479, Accuracy: 98.48%
Validation Loss: 0.1174, Accuracy: 96.36%
Epoch [4/10], Loss: 0.0394, Accuracy: 98.77%
Validation Loss: 0.0819, Accuracy: 97.30%
Epoch [5/10], Loss: 0.0462, Accuracy: 98.64%
Stopping early at epoch 5 due to no improvement in accuracy and an increase in loss.


# Predict the expert to use for the image

In [4]:
import torch
from torchvision import transforms, models
from PIL import Image
import cv2
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models, transforms, datasets
from PIL import Image, UnidentifiedImageError

new_data = '/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_500'
# Dataset preparation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

data = datasets.ImageFolder(root=new_data, transform=transform)

# Load the trained model
model = models.resnet50(pretrained=False)  # Don't load pretrained weights, we'll load our own
num_classes = len(data.classes)  # Number of classes in the dataset
model.fc = nn.Linear(model.fc.in_features, num_classes)  # Adjust final layer to match the number of classes

# Load the saved model weights
model.load_state_dict(torch.load('modality_classifier_500.pth'))
model.eval()  # Set the model to evaluation mode

# Determine the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # Move the model to the correct device



# Function to predict a class for a random image
def predict_image(image_path):
    # Open the image using OpenCV (or PIL if preferred)
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image from {image_path}")
        return

    # Convert the image to RGB (OpenCV loads images in BGR by default)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Convert the image to PIL format
    image = Image.fromarray(image)

    # Apply transformations
    image = transform(image).unsqueeze(0)  # Add batch dimension

    # Move the image tensor to the same device as the model
    image = image.to(device)

    # Make prediction
    with torch.no_grad():
        outputs = model(image)
        _, predicted = torch.max(outputs, 1)

    # Get the predicted class label
    predicted_class = data.classes[predicted.item()]

    print(f"Predicted class: {predicted_class}")


# Test the model with a random image
#image_path = '/mloscratch/homes/tagemoua/scrap_radiopaedia/radiopeadia/Ultrasound/abdominal-ectopic-pregnancy-in-the-second-trimester-1/images/0bb8915dd09a1254260d005cf4cae02d82e2d232453adbfefa956065a9cd973e_thumb.jpeg'
#predict_image(image_path)



In [46]:
import torch
import torch.nn as nn
import numpy as np
import cv2
from PIL import Image
from scipy.stats import entropy  # To calculate entropy
from torchvision import models, transforms, datasets

# Dataset directory and transformations
new_data = '/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_500'

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

data = datasets.ImageFolder(root=new_data, transform=transform)

# Load the trained model
model = models.resnet50(pretrained=False)  # Don't load pretrained weights
num_classes = len(data.classes)  # Number of classes in the dataset
model.fc = nn.Linear(model.fc.in_features, num_classes)  # Adjust the final layer

# Load the saved model weights
model.load_state_dict(torch.load('modality_classifier_3000.pth', map_location='cpu'))
model.eval()  # Set the model to evaluation mode

# Determine the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # Move the model to the correct device

# Define entropy threshold
ENTROPY_THRESHOLD = 0.2

def calculate_entropy(probabilities):
    """Calculate the entropy of a probability distribution."""
    return entropy(probabilities, base=2)

def predict_image(image_path):
    # Open the image using OpenCV
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Unable to load image from {image_path}")
        return

    # Convert the image to RGB (OpenCV loads images in BGR by default)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Convert the image to PIL format and apply transformations
    image = Image.fromarray(image)
    image = transform(image).unsqueeze(0)  # Add batch dimension
    image = image.to(device)  # Move image tensor to the same device as the model

    # Perform the prediction
    with torch.no_grad():
        outputs = model(image)  # Model output probabilities
        probabilities = torch.nn.functional.softmax(outputs, dim=1).cpu().numpy().squeeze()

    # Calculate entropy of the current prediction
    entropy_value = calculate_entropy(probabilities)

    # Check entropy and decide the predicted class
    if entropy_value > ENTROPY_THRESHOLD:
        print(f"High entropy ({entropy_value:.4f}). Defaulting to 'General Medicine'.")
        predicted_class = "General Medicine"
    else:
        # Get the predicted class with the highest probability
        predicted_idx = np.argmax(probabilities)
        predicted_class = data.classes[predicted_idx]

    print(f"Predicted class: {predicted_class} (Entropy: {entropy_value:.4f})")

# Test the model with a random image
image_path='/mloscratch/homes/tagemoua/MultiMeditron/image.png'
predict_image(image_path)





Predicted class: General (Entropy: 0.0012)


# Train for multiple data length

## Create data folders for multiple numbers of data points

In [26]:
import matplotlib.pyplot as plt 
import numpy as np

import torch
from torchvision import transforms, models
from PIL import Image
import cv2
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import models, transforms, datasets
from PIL import Image, UnidentifiedImageError


In [16]:


# Number of data points
num_data_points = [200, 500, 1000, 3000]

input_dirs = [
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/General',
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/General Medecine',
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/Mri',
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/Ultrasound',
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/Chest_X-ray',
]
for n in num_data_points:
    output_dir = f'/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_{n}'
    create_subset(input_dirs, output_dir, subset_size=n)


    

Copied 200 files from /mloscratch/homes/tagemoua/MultiMeditron/processed_data/General to /mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_200/General
Copied 200 files from /mloscratch/homes/tagemoua/MultiMeditron/processed_data/General Medecine to /mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_200/General Medecine
Copied 200 files from /mloscratch/homes/tagemoua/MultiMeditron/processed_data/Mri to /mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_200/Mri
Copied 200 files from /mloscratch/homes/tagemoua/MultiMeditron/processed_data/Ultrasound to /mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_200/Ultrasound
Copied 200 files from /mloscratch/homes/tagemoua/MultiMeditron/processed_data/Chest_X-ray to /mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_200/Chest_X-ray
Copied 500 files from /mloscratch/homes/tagemoua/MultiMeditron/processed_data/General to /mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_500/

## Train the models and compare the accuracies

In [27]:

# Number of data points
num_data_points = [200, 500, 1000, 3000]

input_dirs = [
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/General',
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/Mri',
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/Ultrasound',
    '/mloscratch/homes/tagemoua/MultiMeditron/processed_data/Chest_X-ray',
]

best_accuracy_array = []

for n in num_data_points:
    output_dir = f'/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_{n}'

    # Dataset preparation
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    print(new_data)

    data = datasets.ImageFolder(root=output_dir, transform=transform)
    # Split the dataset into training and validation sets
    train_size = int(0.8 * len(data))
    val_size = len(data) - train_size
    train_data, val_data = torch.utils.data.random_split(data, [train_size, val_size])

    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

    # Model preparation
    model = models.resnet50(pretrained=True)
    num_classes = len(data.classes)
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training the model with early stopping based on loss and accuracy
    num_epochs = 10
    best_accuracy = 0.0
    prev_loss = float('inf')  # Initialize with a high value

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Calculate accuracy for this batch
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Compute epoch loss and accuracy
        epoch_loss = running_loss / len(train_loader)
        epoch_accuracy = 100 * correct / total

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

        

        # Validation step
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)

                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()


        val_accuracy = 100 * correct / total
        print(f"Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_accuracy:.2f}%")

        if epoch > 0 and val_loss > prev_loss and val_accuracy <= best_accuracy:
            print(f"Stopping early at epoch {epoch+1} due to no improvement in accuracy and an increase in loss.")
            break

        best_accuracy = max(best_accuracy, val_accuracy)
        prev_loss = val_loss

    # Save the trained model
    torch.save(model.state_dict(), f'modality_classifier_{n}.pth')


    best_accuracy_array.append(best_accuracy)
    print(best_accuracy_array)



/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_500


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/runai-home/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 349MB/s]


Epoch [1/10], Loss: 0.3789, Accuracy: 87.32%
Validation Loss: 108.3030, Accuracy: 55.00%
Epoch [2/10], Loss: 0.1732, Accuracy: 94.52%
Validation Loss: 1.0523, Accuracy: 73.12%
Epoch [3/10], Loss: 0.0590, Accuracy: 97.81%
Validation Loss: 0.6282, Accuracy: 85.00%
Epoch [4/10], Loss: 0.1038, Accuracy: 97.18%
Validation Loss: 1.7421, Accuracy: 83.12%
Stopping early at epoch 4 due to no improvement in accuracy and an increase in loss.
[85.0]
/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_500
Epoch [1/10], Loss: 0.3251, Accuracy: 91.00%
Validation Loss: 0.3361, Accuracy: 88.29%
Epoch [2/10], Loss: 0.0924, Accuracy: 97.00%
Validation Loss: 0.0214, Accuracy: 99.14%
Epoch [3/10], Loss: 0.0795, Accuracy: 97.64%
Validation Loss: 3.0120, Accuracy: 76.00%
Stopping early at epoch 3 due to no improvement in accuracy and an increase in loss.
[85.0, 99.14285714285714]
/mloscratch/homes/tagemoua/MultiMeditron/processed_data_subset_500
Epoch [1/10], Loss: 0.1824, Accuracy: 93.88%
Validat