In [1]:
import os
import pandas as pd

# Define base directory
base_dir = r"D:\8th sem\healthcare\dataset\Segmented"

# Define leukemia subtypes (folder names are the labels)
categories = ["Benign", "Early", "Pre", "Pro"]

# List to store data
data = []

# Iterate through each category folder
for category in categories:
    category_path = os.path.join(base_dir, category)
    
    # Check if folder exists
    if os.path.exists(category_path):
        for file_name in os.listdir(category_path):
            file_path = os.path.join(category_path, file_name)
            
            # Ensure it's an image file
            if os.path.isfile(file_path) and file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                data.append([file_name, category, file_path])  # Store filename, label, and full path

# Create a DataFrame
df = pd.DataFrame(data, columns=["File Name", "Label", "File Path"])

# Save as CSV (optional)
csv_path = os.path.join(base_dir, "leukemia_dataset.csv")
df.to_csv(csv_path, index=False)

# Display DataFrame
print(df.head())


            File Name   Label  \
0  WBC-Benign-001.jpg  Benign   
1  WBC-Benign-002.jpg  Benign   
2  WBC-Benign-003.jpg  Benign   
3  WBC-Benign-004.jpg  Benign   
4  WBC-Benign-005.jpg  Benign   

                                           File Path  
0  D:\8th sem\healthcare\dataset\Segmented\Benign...  
1  D:\8th sem\healthcare\dataset\Segmented\Benign...  
2  D:\8th sem\healthcare\dataset\Segmented\Benign...  
3  D:\8th sem\healthcare\dataset\Segmented\Benign...  
4  D:\8th sem\healthcare\dataset\Segmented\Benign...  


In [3]:
import pandas as pd
df = pd.read_csv("leukemia_dataset.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.head(5))

                     File Name   Label  \
0    WBC-Malignant-Pre-863.jpg     Pre   
1           WBC-Benign-135.jpg  Benign   
2    WBC-Malignant-Pre-810.jpg     Pre   
3    WBC-Malignant-Pre-122.jpg     Pre   
4  WBC-Malignant-Early-487.jpg   Early   

                                           File Path  
0  D:\8th sem\healthcare\dataset\Segmented\Pre\WB...  
1  D:\8th sem\healthcare\dataset\Segmented\Benign...  
2  D:\8th sem\healthcare\dataset\Segmented\Pre\WB...  
3  D:\8th sem\healthcare\dataset\Segmented\Pre\WB...  
4  D:\8th sem\healthcare\dataset\Segmented\Early\...  


In [4]:
print(df.shape)

(3256, 3)


In [5]:
import cv2

# Iterate through the DataFrame and print image sizes
for index, row in df.iterrows():
    file_path = row["File Path"]
    
    # Read image using OpenCV
    img = cv2.imread(file_path)
    
    if img is not None:
        height, width, channels = img.shape
        print(f"Image: {row['File Name']} | Width: {width}, Height: {height}, Channels: {channels}")
    else:
        print(f"Could not read image: {row['File Name']}")


Image: WBC-Malignant-Pre-863.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Benign-135.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Pre-810.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Pre-122.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Early-487.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Early-670.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Early-703.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Pre-381.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Pro-146.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Pro-216.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Early-605.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Early-503.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Early-911.jpg | Width: 224, Height: 224, Channels: 3
Image: WBC-Malignant-Early-394.jpg | Width: 224, Height: 22

In [6]:
import cv2

# Set the expected image size
expected_width, expected_height = 224, 224
all_same = True  # Flag to check if all images are the same size

# Iterate through the DataFrame and check image sizes
for index, row in df.iterrows():
    file_path = row["File Path"]
    
    # Read the image
    img = cv2.imread(file_path)
    
    if img is not None:
        height, width, channels = img.shape
        
        # Check if size differs from 224x224
        if width != expected_width or height != expected_height:
            print(f"Different size found: {row['File Name']} | Width: {width}, Height: {height}, Channels: {channels}")
            all_same = False

# If all images are the same size, print confirmation
if all_same:
    print("All images are of size 224x224")


All images are of size 224x224


In [7]:
import pandas as pd
import cv2
import os
import albumentations as A
import numpy as np

# Load the dataset
df = pd.read_csv("leukemia_dataset.csv")

# Define augmentation pipeline
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.Rotate(limit=30, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5),
])

# Create directory for augmented images
augmented_folder = "Augmented_Images"
os.makedirs(augmented_folder, exist_ok=True)

# Create a list to store new augmented image data
augmented_data = []

# Loop through each row in the dataset
for index, row in df.iterrows():
    file_path = row["File Path"]
    label = row["Label"]
    
    # Read the image
    image = cv2.imread(file_path)
    
    if image is None:
        print(f"Skipping {file_path} (not found)")
        continue  # Skip if the image is missing
    
    # Convert BGR to RGB (Albumentations expects RGB format)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Apply augmentations (create 3 augmented versions per image)
    for i in range(3):  
        augmented = transform(image=image)["image"]
        
        # Convert back to BGR before saving
        augmented_bgr = cv2.cvtColor(augmented, cv2.COLOR_RGB2BGR)
        
        # Generate new filename
        base_name = os.path.basename(file_path)
        new_file_name = f"aug_{i}_{base_name}"
        new_file_path = os.path.join(augmented_folder, new_file_name)  # Save to separate folder
        
        # Save augmented image
        cv2.imwrite(new_file_path, augmented_bgr)
        
        # Append new data to list
        augmented_data.append([new_file_name, label, new_file_path])

# Convert to DataFrame
augmented_df = pd.DataFrame(augmented_data, columns=["File Name", "Label", "File Path"])

# Concatenate original and augmented data
final_df = pd.concat([df, augmented_df], ignore_index=True)

# Save the new dataset as CSV
final_df.to_csv("augmented_leukemia_dataset.csv", index=False)

print(f"Augmentation complete! Augmented images saved in '{augmented_folder}/'. New dataset saved as 'augmented_leukemia_dataset.csv'.")

  original_init(self, **validated_kwargs)


Augmentation complete! Augmented images saved in 'Augmented_Images/'. New dataset saved as 'augmented_leukemia_dataset.csv'.


In [8]:
import pandas as pd
df = pd.read_csv("augmented_leukemia_dataset.csv")
print(df.head(5))

            File Name   Label  \
0  WBC-Benign-001.jpg  Benign   
1  WBC-Benign-002.jpg  Benign   
2  WBC-Benign-003.jpg  Benign   
3  WBC-Benign-004.jpg  Benign   
4  WBC-Benign-005.jpg  Benign   

                                           File Path  
0  D:\8th sem\healthcare\dataset\Segmented\Benign...  
1  D:\8th sem\healthcare\dataset\Segmented\Benign...  
2  D:\8th sem\healthcare\dataset\Segmented\Benign...  
3  D:\8th sem\healthcare\dataset\Segmented\Benign...  
4  D:\8th sem\healthcare\dataset\Segmented\Benign...  


In [9]:
import cv2
import os
import pandas as pd

# Define the new directory to save normalized images
normalized_dir = "D:/8th sem/healthcare/dataset/Segmented/Normalized"
os.makedirs(normalized_dir, exist_ok=True)  # Create if doesn't exist

# Function to normalize and save image
def normalize_and_save(image_path):
    img = cv2.imread(image_path)  # Read image
    if img is not None:
        img = img / 255.0  # Normalize to [0,1]
        new_filename = os.path.join(normalized_dir, os.path.basename(image_path))  # New path
        cv2.imwrite(new_filename, (img * 255).astype('uint8'))  # Save normalized image
        return new_filename  # Return new path
    return None

# Apply function to each row and create new column
df["Normalized_Path"] = df["File Path"].apply(normalize_and_save)

# Save updated dataframe
df.to_csv("normalized_leukemia.csv", index=False)

print("Normalization complete! Paths saved in 'Normalized_Path' column.")


Normalization complete! Paths saved in 'Normalized_Path' column.


In [1]:
import pandas as pd
df = pd.read_csv("normalized_leukemia.csv")
print(df.head(5))

            File Name   Label  \
0  WBC-Benign-001.jpg  Benign   
1  WBC-Benign-002.jpg  Benign   
2  WBC-Benign-003.jpg  Benign   
3  WBC-Benign-004.jpg  Benign   
4  WBC-Benign-005.jpg  Benign   

                                           File Path  \
0  D:\8th sem\healthcare\dataset\Segmented\Benign...   
1  D:\8th sem\healthcare\dataset\Segmented\Benign...   
2  D:\8th sem\healthcare\dataset\Segmented\Benign...   
3  D:\8th sem\healthcare\dataset\Segmented\Benign...   
4  D:\8th sem\healthcare\dataset\Segmented\Benign...   

                                     Normalized_Path  
0  D:/8th sem/healthcare/dataset/Segmented/Normal...  
1  D:/8th sem/healthcare/dataset/Segmented/Normal...  
2  D:/8th sem/healthcare/dataset/Segmented/Normal...  
3  D:/8th sem/healthcare/dataset/Segmented/Normal...  
4  D:/8th sem/healthcare/dataset/Segmented/Normal...  


In [2]:
print(df.shape)

(13024, 4)


In [3]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import pandas as pd
import cv2
import numpy as np
from torch.utils.data import DataLoader, Dataset

# Load DenseNet-121 Model (Pretrained)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.densenet121(pretrained=True).features.to(device)
model.eval()

# Define Transform (Normalization)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Custom Dataset
class LeukemiaDataset(Dataset):
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_path = self.df.iloc[idx]["Normalized_Path"]
        image = cv2.imread(file_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (224, 224))  # Ensure 224x224
        image = transform(image)  # Apply normalization
        return image

# Load Data
df = pd.read_csv("normalized_leukemia.csv")
dataset = LeukemiaDataset(df)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0)

# Extract Features in Batches
features = []
with torch.no_grad():
    for batch in dataloader:
        batch = batch.to(device)
        batch_features = model(batch)
        batch_features = torch.mean(batch_features, dim=[2, 3])  # Global Average Pooling
        features.append(batch_features.cpu().numpy())

# Convert to DataFrame and Save
features_array = np.concatenate(features, axis=0)
features_df = pd.DataFrame(features_array)
features_df.to_csv("densenet_features.csv", index=False)

print("Feature extraction complete! Features saved as 'densenet_features.csv'")



Feature extraction complete! Features saved as 'densenet_features.csv'


In [None]:
''''import pandas as pd

# Load feature dataset (skip the first row)
features_df = pd.read_csv("densenet_features.csv", skiprows=1, header=None)

# Load label dataset
labels_df = pd.read_csv("normalized_leukemia.csv")

# Verify row count after correction
print(f"Fixed Features Rows: {len(features_df)}, Labels Rows: {len(labels_df)}")

# Rename feature columns
features_df.columns = [f"Feature_{i}" for i in range(1, 1025)]  # 1024 features

# Ensure row counts now match
assert len(features_df) == len(labels_df), "Mismatch in number of rows even after correction!"

# Add labels to features
features_df["Label"] = labels_df["Label"].values  

# Save the corrected dataset
features_df.to_csv("features_with_labels.csv", index=False)

print("✅ Fixed dataset saved as 'features_with_labels.csv'. Ready for PSO!")'''


In [1]:
import psutil
ram = psutil.virtual_memory()
print(f"Total RAM: {ram.total / 1e9:.2f} GB")
print(f"Available RAM: {ram.available / 1e9:.2f} GB")
print(f"Used RAM: {ram.used / 1e9:.2f} GB")
print(f"RAM Usage: {ram.percent}%")


Total RAM: 16.95 GB
Available RAM: 6.41 GB
Used RAM: 10.55 GB
RAM Usage: 62.2%


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from pyswarm import pso

# Load dataset
data = pd.read_csv("features_with_labels.csv")

# Separate features and labels
X = data.iloc[:, :-1].values  # Features
y = data.iloc[:, -1].values   # Labels

# Step 1: Reduce Features with VarianceThreshold
var_thresh = VarianceThreshold(threshold=0.01)  # Remove low variance features
X = var_thresh.fit_transform(X)
print(f"Features after VarianceThreshold: {X.shape[1]}")

# Step 2: Select Top 300 Features (Reduce Before PSO)
select_k = SelectKBest(score_func=f_classif, k=300)  # Keep top 300 best features
X = select_k.fit_transform(X, y)
print(f"Features after SelectKBest: {X.shape[1]}")

# Split dataset (PSO runs on 30% of training data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, test_size=0.7, random_state=42)

# Define fitness function
def fitness_function(selected_features):
    selected_indices = np.where(selected_features > 0.5)[0]
    if len(selected_indices) == 0:
        return 1  # Avoid division by zero

    # Select only the chosen features
    X_train_selected = X_train_small[:, selected_indices]
    X_test_selected = X_test[:, selected_indices]

    # Train a lightweight classifier (Logistic Regression)
    clf = LogisticRegression(max_iter=200, solver="liblinear")
    clf.fit(X_train_selected, y_train_small)

    # Evaluate accuracy
    accuracy = clf.score(X_test_selected, y_test)
    return 1 - accuracy  # PSO minimizes the function

# Set PSO parameters
num_particles = 5  # Reduce swarm size
num_iterations = 10  # Reduce iterations
num_features = X.shape[1]  # Adjusted after feature reduction

lb = [0] * num_features  # Lower bound (feature off)
ub = [1] * num_features  # Upper bound (feature on)

# Run PSO (expect faster results)
best_solution, _ = pso(fitness_function, lb, ub, swarmsize=num_particles, maxiter=num_iterations)

# Get selected feature indices
selected_feature_indices = np.where(best_solution > 0.5)[0]
print(f"Selected {len(selected_feature_indices)} Features")

# Save selected features
selected_features_df = pd.DataFrame(X[:, selected_feature_indices])
selected_features_df["Label"] = y
selected_features_df.to_csv("selected_features_fast.csv", index=False)

print("Feature selection completed in significantly less time!. Saved as 'selected_features_fast.csv'.")


Features after VarianceThreshold: 526
Features after SelectKBest: 300
Stopping search: maximum iterations reached --> 10
Selected 146 Features
Feature selection completed in significantly less time!. Saved as 'selected_features_fast.csv'.
