In [1]:
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
import torch
import os
from torchvision import transforms
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from torchvision import models
from torch.utils.data import DataLoader

In [3]:
class MultiPartDataset(Dataset):
    def __init__(self, path_to_folders, label_csv, transform):
        self.meta_data = pd.read_csv(label_csv)
        self.path_to_folders = path_to_folders
        self.data = []
        
        dx, _ = pd.factorize(self.meta_data['dx'], sort=True)
        dx_type, _ = pd.factorize(self.meta_data['dx_type'], sort=True)
        age, _ = pd.factorize(self.meta_data['age'], sort=True)
        sex, _ = pd.factorize(self.meta_data['sex'], sort=True)
        for path in self.path_to_folders:
            files = os.listdir(path)

            localization, _ = pd.factorize(self.meta_data['localization'], sort=True)
            for file in sorted(files):
                idx = self.meta_data['image_id'] == file.split('.')[0]
                self.data.append((path, file, [dx_type[idx], age[idx], sex[idx], localization[idx]], dx[idx]))
        
        self.transform = transform
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.data[idx][0], self.data[idx][1])
        image = Image.open(img_name)
        label = self.data[idx][3]
        attributes = np.array(self.data[idx][2])
        attributes = torch.from_numpy(attributes.flatten())

        if self.transform:
            image = self.transform(image)
        
        
        return image, attributes, label[0]

In [4]:
folders = ["/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1", "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_2"]
meta_csv_file_path = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv"
transform = transforms.Compose([
    transforms.Resize((255, 255)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
dataset = MultiPartDataset(folders, meta_csv_file_path, transform)

In [8]:
data_loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

# Initialize the VGG model with pre-trained weights
model = models.vgg16(pretrained=True)
num_features = model.classifier[6].in_features
num_classes = len(torch.unique(torch.tensor(dataset.meta_data['dx'].factorize()[0])))
model.classifier[6] = nn.Linear(num_features, num_classes)
print(num_classes)



7


In [9]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [12]:
from sklearn.model_selection import train_test_split

# Assuming 'dataset' is already defined
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False, num_workers=4)

In [14]:
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_total = 0

    for images, attributes, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, attributes, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100 * val_correct / val_total
    print(f'Epoch {epoch+1}, Training Loss: {train_loss/train_total:.4f}, Training Accuracy: {train_accuracy:.2f}%, Validation Loss: {val_loss/val_total:.4f}, Validation Accuracy: {val_accuracy:.2f}%')


Epoch 1, Training Loss: 0.0021, Training Accuracy: 97.84%, Validation Loss: 0.0194, Validation Accuracy: 84.87%
Epoch 2, Training Loss: 0.0029, Training Accuracy: 96.79%, Validation Loss: 0.0158, Validation Accuracy: 86.52%
Epoch 3, Training Loss: 0.0015, Training Accuracy: 98.35%, Validation Loss: 0.0191, Validation Accuracy: 85.42%
Epoch 5, Training Loss: 0.0009, Training Accuracy: 98.99%, Validation Loss: 0.0257, Validation Accuracy: 82.43%
Epoch 6, Training Loss: 0.0017, Training Accuracy: 98.33%, Validation Loss: 0.0221, Validation Accuracy: 84.52%
Epoch 7, Training Loss: 0.0012, Training Accuracy: 98.66%, Validation Loss: 0.0187, Validation Accuracy: 85.37%
Epoch 8, Training Loss: 0.0004, Training Accuracy: 99.70%, Validation Loss: 0.0235, Validation Accuracy: 85.97%
Epoch 9, Training Loss: 0.0001, Training Accuracy: 99.93%, Validation Loss: 0.0231, Validation Accuracy: 87.22%
Epoch 10, Training Loss: 0.0001, Training Accuracy: 99.95%, Validation Loss: 0.0250, Validation Accuracy

In [15]:
# Save the model
model_path = 'vgg_model_skin_cancer.h5'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')

Model saved to vgg_model_skin_cancer.h5
