In [3]:
import cv2
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
#import DNN model
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from tqdm import tqdm

# random
torch.manual_seed(42)
np.random.seed(42)

# Load data
file_path = r'C:\Users\user\Desktop\folder\cancer-classification-main\Folds.csv'
data = pd.read_csv(file_path)

# derive 'label' from the 'filename' column
data['label'] = data['filename'].apply(lambda x: 0 if 'benign' in x else 1)
# filter data for magnification(mag) == 40
data = data[data['mag'] == 40]

#prepare dataset
#parameters
target_size = (224, 224) #target size for DNN input
# Define the base path for images - data is in archive/BreaKHis_v1 folder (nested structure)
base_image_path = r'C:\Users\user\Desktop\folder\cancer-classification-main\archive\BreaKHis_v1'

#Dataset class for train and test
class BreastCancerDataset(Dataset):
    def __init__(self, dataframe, target_size, base_path):
        self.dataframe = dataframe
        self.target_size = target_size
        self.base_path = base_path

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        relative_img_path = row['filename']
        # Construct the full image path and normalize path separators
        img_path = os.path.join(self.base_path, relative_img_path)
        img_path = os.path.normpath(img_path)  # Convert forward slashes to backslashes on Windows
        label = row['label']
    
        #load image
        img = cv2.imread(img_path)
        if img is None:
            # Try to find the image with a debug print
            print(f"Debug: Looking for image at: {img_path}")
            print(f"Debug: File exists: {os.path.exists(img_path)}")
            if os.path.exists(os.path.dirname(img_path)):
                print(f"Debug: Directory exists, files in directory: {os.listdir(os.path.dirname(img_path))[:5]}")
            raise FileNotFoundError(f"Image not found: {img_path}")
        
        #Resize and preprocess
        img = cv2.resize(img, self.target_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # convert to RGB
        img = img / 255.0 # Normalize to [0, 1]
        img = np.transpose(img, (2, 0, 1)) # Convert to (C, H, W)

        #convert to tensor and flatten for DNN
        img_tensor = torch.tensor(img, dtype=torch.float32)
        img_tensor = img_tensor.reshape(-1)  # Flatten for DNN input
        label_tensor = torch.tensor(label, dtype=torch.long)

        return img_tensor, label_tensor
    
# Split data into train and test based on 'grp'
train_df = data[data['grp'] == 'train']
test_df = data[data['grp'] == 'test']

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Class distribution - Benign: {sum(data['label'] == 0)}, Malignant: {sum(data['label'] == 1)}")

# Create train and test datasets
train_dataset = BreastCancerDataset(train_df, target_size, base_image_path)
test_dataset = BreastCancerDataset(test_df, target_size, base_image_path)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

#Define DNN model
class DNNModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(DNNModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 2048)
        self.bn1 = nn.BatchNorm1d(2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.fc3 = nn.Linear(1024, 512)
        self.bn3 = nn.BatchNorm1d(512)
        self.fc4 = nn.Linear(512, 256)
        self.bn4 = nn.BatchNorm1d(256)
        self.fc5 = nn.Linear(256, num_classes)
        
        self.relu = nn.LeakyReLU(0.1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.dropout(self.relu(self.bn3(self.fc3(x))))
        x = self.dropout(self.relu(self.bn4(self.fc4(x))))
        x = self.fc5(x)
        return x
    
# Initialize the model, loss function, and optimizer
input_size = 3 * target_size[0] * target_size[1]  # Flattened image size
num_classes = 2  # Binary classification: benign (0) vs malignant (1)
print(f'Input size: {input_size}')
print(f'Number of classes: {num_classes}')
model = DNNModel(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
model.to(device)

#Training function
def train_model(model, train_loader, criterion, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            images, labels = images.to(device), labels.to(device)

            #zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

epochs = 10
train_model(model, train_loader, criterion, optimizer, epochs)


# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return all_labels, all_preds

# Evaluate the model
y_test, y_pred = evaluate_model(model, test_loader)

target_names = ['Benign', 'Malignant'] # define label name (fixed typo)
print(classification_report(y_test, y_pred, target_names=target_names))

# Additional analysis and visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Benign', 'Malignant'], 
            yticklabels=['Benign', 'Malignant'])
plt.title('Confusion Matrix - DNN Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Calculate additional metrics
accuracy = accuracy_score(y_test, y_pred)
precision_benign = precision_score(y_test, y_pred, pos_label=0)
precision_malignant = precision_score(y_test, y_pred, pos_label=1)
recall_benign = recall_score(y_test, y_pred, pos_label=0)
recall_malignant = recall_score(y_test, y_pred, pos_label=1)
f1_benign = f1_score(y_test, y_pred, pos_label=0)
f1_malignant = f1_score(y_test, y_pred, pos_label=1)

print("=== DNN 模型詳細效能指標 ===")
print(f"整體準確率: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"\n良性腫瘤:")
print(f"精確率: {precision_benign:.4f} ({precision_benign*100:.2f}%)")
print(f"召回率: {recall_benign:.4f} ({recall_benign*100:.2f}%)")
print(f"F1-score: {f1_benign:.4f}")
print(f"\n惡性腫瘤:")
print(f"精確率: {precision_malignant:.4f} ({precision_malignant*100:.2f}%)")
print(f"召回率: {recall_malignant:.4f} ({recall_malignant*100:.2f}%)")
print(f"F1-score: {f1_malignant:.4f}")

# Medical significance
print(f"\n=== 醫療意義分析 ===")
print(f"假陰性(漏診惡性): {cm[1,0]} 例 ({cm[1,0]/np.sum(cm[1,:])*100:.2f}%)")
print(f"假陽性(誤診良性為惡性): {cm[0,1]} 例 ({cm[0,1]/np.sum(cm[0,:])*100:.2f}%)")
print(f"真陽性(正確診斷惡性): {cm[1,1]} 例")
print(f"真陰性(正確診斷良性): {cm[0,0]} 例")

Training samples: 6466
Test samples: 3509
Class distribution - Benign: 3125, Malignant: 6850
Input size: 150528
Number of classes: 2
Using device: cuda
Using device: cuda


Epoch 1/10:   3%|▎         | 7/203 [00:50<23:26,  7.18s/it]



KeyboardInterrupt: 