In [1]:
from torchvision import models, transforms

import os
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from sklearn.metrics import pairwise_distances


In [2]:
# Load a pre-trained model (e.g., ResNet18)
model = models.resnet18(pretrained=True)
model = torch.nn.Sequential(*list(model.children())[:-1])  # Remove the final classification layer
model.eval()



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con

In [3]:
class ImageDataset(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform
    
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image


In [4]:
# Transform for input images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


In [5]:
def calculate_diversity(features, selected_indices):
    """
    Calculate the diversity of the selected subset of features.

    Parameters:
    features (numpy.ndarray): Feature vectors of all images.
    selected_indices (list of int): Indices of the selected images.

    Returns:
    float: Average pairwise distance of the selected subset.
    """
    # Extract features of the selected images
    selected_features = features[selected_indices]
    
    # Compute pairwise distances between selected features
    distances = pairwise_distances(selected_features)
    
    # Calculate average pairwise distance (excluding the diagonal)
    avg_distance = np.mean(distances[np.triu_indices(len(distances), k=1)])
    
    return avg_distance


In [6]:
def K_greedy(model,folder_path, seed):
    image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
    image_files = [file for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
    dataset = ImageDataset(image_paths, transform=transform)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
    
        # Extract features
    features = []
    with torch.no_grad():
        for images in dataloader:
            feature = model(images).squeeze()
            features.append(feature.cpu().numpy())
    features = np.vstack(features)
    #K-Greedy
    np.random.seed(seed) 
    
    selected_indices = [np.random.choice(len(features))]
    distances = pairwise_distances(features, features[selected_indices])

    for _ in range(199):  # We already have 1 selected, so we need 199 more
        min_distances = np.min(distances, axis=1)
        next_index = np.argmax(min_distances)
        selected_indices.append(next_index)
        distances = np.minimum(distances, pairwise_distances(features, features[[next_index]]))

    # The selected_indices now contains the indices of the 200 most informative images
    selected_images = [image_files[i] for i in selected_indices]
    avg_dist=calculate_diversity(features, selected_indices)
    
  
    return avg_dist, selected_indices, selected_images

    

In [15]:
def Random_select(model,folder_path, seed):
    image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
    image_files = [file for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
    dataset = ImageDataset(image_paths, transform=transform)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
    
        # Extract features
    features = []
    with torch.no_grad():
        for images in dataloader:
            feature = model(images).squeeze()
            features.append(feature.cpu().numpy())
    features = np.vstack(features)
    
    np.random.seed(seed) 
    num_samples=200
    selected_indices = np.random.choice(np.arange(len(features)), size=num_samples, replace=False)    
    # The selected_indices now contains the indices of the 200 most informative images
    
    avg_dist=calculate_diversity(features, selected_indices)
      
    return avg_dist


In [16]:

# Define the folder path all normal
folder_path_all_norm = '/users/PCS0218/nonlinearity114/CS7200_SP2024_Project_G01/Notebooks/Data/All/NORMAL'

#normal_all=K_greedy(model, folder_path_all_norm,42)



In [11]:
normal_all[0]

18.793316

In [17]:
dist_K=[]
dist_rand=[]
for i in range(5):
    print('round', i)
    d=K_greedy(model,folder_path_all_norm, i)
    r=Random_select(model,folder_path_all_norm, i )
    
    dist_K.append(d[0])
    dist_rand.append(r)

round 0
round 1
round 2
round 3
round 4


In [19]:
dist_K

[18.784498, 18.798502, 18.873312, 18.797476, 18.806515]

In [20]:
dist_rand

[15.780817, 15.855055, 16.060263, 15.734768, 15.721038]

In [22]:
from scipy import stats


In [23]:
t_statistic, p_value = stats.ttest_ind(dist_K, dist_rand)

print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

T-statistic: 46.580831897414555
P-value: 4.986617015625793e-11


# Select for Pneumonia Class

In [14]:
folder_path = '/users/PCS0218/nonlinearity114/CS7200_SP2024_Project_G01/Notebooks/Data/All/PNEUMONIA'

# Get a list of all image file paths
image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
image_files = [file for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
dataset = ImageDataset(image_paths, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

# Extract features
features = []
with torch.no_grad():
    for images in dataloader:
        feature = model(images).squeeze()
        features.append(feature.cpu().numpy())
features = np.vstack(features)

In [15]:
# K-Center Greedy
np.random.seed(42) 
selected_indices = [np.random.choice(len(features))]
distances = pairwise_distances(features, features[selected_indices])

for _ in range(199):  # We already have 1 selected, so we need 199 more
    min_distances = np.min(distances, axis=1)
    next_index = np.argmax(min_distances)
    selected_indices.append(next_index)
    distances = np.minimum(distances, pairwise_distances(features, features[[next_index]]))

# The selected_indices now contains the indices of the 200 most informative images
selected_images_pneum = [image_files[i] for i in selected_indices]


In [16]:
features_pneum=features
selected_indices_pneum=selected_indices

In [17]:
import csv
# Write the filenames to a CSV file
with open('selected_images_pneum.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for filename in selected_images_pneum:
        writer.writerow([filename])

In [18]:
calculate_diversity(features_pneum,selected_indices)

19.582682

# When selected Randomly

## NORMAL  class

In [19]:
folder_path = '/users/PCS0218/nonlinearity114/CS7200_SP2024_Project_G01/Notebooks/Data/Original/NORMAL/NORMAL'

# Get a list of all image file paths
image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
image_files = [file for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
dataset = ImageDataset(image_paths, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

In [20]:
# Extract features
features = []
with torch.no_grad():
    for images in dataloader:
        feature = model(images).squeeze()
        features.append(feature.cpu().numpy())
features = np.vstack(features)


In [23]:
features_norm_random=features

In [24]:
calculate_diversity(features_norm_random,np.arange(200))

15.817717

In [25]:
folder_path = '/users/PCS0218/nonlinearity114/CS7200_SP2024_Project_G01/Notebooks/Data/Original/PNEUMONIA/PNEUMONIA'

# Get a list of all image file paths
image_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
image_files = [file for file in os.listdir(folder_path) if file.endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'))]
dataset = ImageDataset(image_paths, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

In [26]:
# Extract features
features = []
with torch.no_grad():
    for images in dataloader:
        feature = model(images).squeeze()
        features.append(feature.cpu().numpy())
features = np.vstack(features)

In [27]:
features_pneum_random=features

In [28]:
calculate_diversity(features_pneum_random,np.arange(200))

16.757645