In [29]:
import os 
import pandas as pd

from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torchvision.transforms as transforms
import torch.nn as nn
import torch.functional as F
import random

from PIL import Image

csv_path = 'E:\Our_project\sample_labels.csv'
image_dir = 'E:\Our_project\images'

random.seed(42)

class MyDataset(Dataset):
    def __init__(self, csv_file, image_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.transform = transform
        unique_labels = set(label for labels in self.data["Finding Labels"] for label in labels.split('|'))
        all_labels = sorted(unique_labels)
        self.label_to_idx = {label: idx for idx, label in enumerate(all_labels)}  # Encode labels

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        image_path = os.path.join(self.image_dir, self.data.iloc[idx, 0])
        image = Image.open(image_path).convert('L')
        
        labels = self.data.iloc[idx]["Finding Labels"].split('|')
        label_tensor = torch.zeros(len(self.label_to_idx), dtype=torch.float)
        for label in labels:
            label_tensor[self.label_to_idx[label]] = 1.0 

        if self.transform:
            image = self.transform(image)

        return image, label_tensor
    

In [30]:
torch.manual_seed(42)
random.seed(42)

transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])  # (X-mean)/std
])

data_set = MyDataset(csv_path, image_dir, transform=transform)

train_size = int(0.8 * len(data_set))
val_size = len(data_set) - train_size

train_dataset, val_dataset = random_split(data_set, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [32]:
random.seed(42)
from torchvision import models
num_classes = len(train_dataset[0][1])

dense_model = models.densenet121(pretrained=True)
dense_model.features[0] = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)

dense_model.classifier = nn.Linear(dense_model.classifier.in_features, num_classes)
# for param in dense_model.parameters():
#     param.requires_grad = False

# for param in dense_model.classifier.parameters():
#     param.requires_grad = True  



In [33]:
dense_model.load_state_dict(torch.load("dence_model_3.pth"))

  dense_model.load_state_dict(torch.load("dence_model_3.pth"))


<All keys matched successfully>

In [34]:
import numpy as np
total_sum = torch.zeros(num_classes)
for i in range(len(train_dataset)):
    total_sum += np.array(train_dataset[i][1])

  total_sum += np.array(train_dataset[i][1])


In [9]:
samples = pd.read_csv("E:\Our_project\sample_labels.csv")

total_samples = len(samples)
class_weights = torch.tensor(total_samples / (num_classes * total_sum), dtype=torch.float32)
loss_fn = nn.BCEWithLogitsLoss (pos_weight=class_weights)  
optimizer = torch.optim.Adam(dense_model.parameters(), lr=0.01) 

  class_weights = torch.tensor(total_samples / (num_classes * total_sum), dtype=torch.float32)


In [10]:
# from sklearn.metrics import f1_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            # labels = F.one_hot(labels, num_classes=num_classes).float()

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            preds = (torch.sigmoid(outputs) > 0.3).float()

            correct += (preds == labels).all(dim=1).sum().item()

        train_acc = correct / len(train_loader.dataset)
        train_loss = running_loss / len(train_loader)

        model.eval()
        val_loss = 0.0
        val_correct = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, preds = torch.max(outputs, 1)

                val_correct += (preds == labels.argmax(dim=1)).sum().item()

        val_acc = val_correct / len(val_loader.dataset)
        val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs}: "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return model

In [12]:
train_model(dense_model, train_loader, val_loader, loss_fn, optimizer, epochs=2)

Epoch 1/2: Train Loss: 0.3649, Train Acc: 0.2101, Val Loss: 4.0414, Val Acc: 0.0205
Epoch 2/2: Train Loss: 0.3315, Train Acc: 0.2360, Val Loss: 0.3028, Val Acc: 0.0125


DenseNet(
  (features): Sequential(
    (conv0): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [13]:
torch.save(dense_model.state_dict(), "dence_model_3.pth")

In [49]:
prob = dense_model(train_dataset[939][0].unsqueeze(0)).sigmoid()
preds = (torch.sigmoid(prob) > 0.3).float()
preds

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [51]:
correct = (preds == train_dataset[939][1])
correct

tensor([[False, False, False, False, False, False,  True, False, False,  True,
         False, False, False, False, False]])

In [10]:
dense_model.eval()
val_loss = 0.0
correct = 0
with torch.no_grad():
    for images, labels in val_loader:
        outputs = dense_model(images)
        loss = loss_fn(outputs, labels)
        val_loss += loss.item()
        _, preds = torch.max(outputs, 1)

        correct += (preds == labels.argmax(dim=1)).sum().item()

val_acc = correct / len(val_loader.dataset)
val_loss = val_loss / len(val_loader)

In [30]:
train_dataset.dataset.label_to_idx

{'Atelectasis': 0,
 'Cardiomegaly': 1,
 'Consolidation': 2,
 'Edema': 3,
 'Effusion': 4,
 'Emphysema': 5,
 'Fibrosis': 6,
 'Hernia': 7,
 'Infiltration': 8,
 'Mass': 9,
 'No Finding': 10,
 'Nodule': 11,
 'Pleural_Thickening': 12,
 'Pneumonia': 13,
 'Pneumothorax': 14}

In [35]:
total_sum

tensor([ 410.,  116.,  176.,   86.,  527.,  101.,   66.,   12.,  775.,  233.,
        2408.,  255.,  139.,   47.,  222.])

In [90]:
import os
import random
import pandas as pd

# Paths
data_dir = "E:\Our_project\images" 
csv_path = "E:\Our_project\sample_labels.csv" 
output_csv = "filtered_data.csv" 

df = pd.read_csv(csv_path)
df = df[['Image Index', 'Finding Labels']] 

reduction_factors = {
    "No Finding": 0.3,
    "Infiltration": 0.5,
    "Effusion": 0.6,
    "Atelectasis": 0.7, 
    "Cardiomegaly": 0.8, 
}

class_images = {label: [] for label in reduction_factors.keys()}
remaining_images = set()

for _, row in df.iterrows():
    image_name, labels = row['Image Index'], row['Finding Labels']
    for label in labels.split('|'):
        if label in reduction_factors:
            class_images[label].append((image_name, labels)) 

for label, images in class_images.items():
    keep_count = int(len(images) * reduction_factors.get(label, 1.0)) 
    images_to_keep = random.sample(images, keep_count)
    remaining_images.update(images_to_keep)

all_images = set(df['Image Index'])  # Convert to set
images_to_delete = all_images - {img[0] for img in remaining_images}  # Compute difference

# Perform deletion
for img_name in images_to_delete:
    img_path = os.path.join(data_dir, img_name[0])
    if os.path.exists(img_path):
        os.remove(img_path)  # ✅ Only deletes excess images

# Save the remaining images with labels as CSV
filtered_df = pd.DataFrame(list(remaining_images), columns=["Image Index", "Finding Labels"])
filtered_df.to_csv(output_csv, index=False)

print(f"Dataset reduced. Remaining images with labels saved in {output_csv}.")

Dataset reduced. Remaining images with labels saved in filtered_data.csv.


In [77]:
len(set(df['Image Index']) - {img[0] for img in remaining_images})

3558

In [84]:
df['Image Index']

0       00000013_005.png
1       00000013_026.png
2       00000017_001.png
3       00000030_001.png
4       00000032_001.png
              ...       
5601    00030712_000.png
5602    00030786_005.png
5603    00030789_000.png
5604    00030792_000.png
5605    00030797_000.png
Name: Image Index, Length: 5606, dtype: object