In [22]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.image import load_img, img_to_array

# Load data

In [27]:
image_folder = "/Users/liliane.bader/Documents/AU/DLfVR/project/data/archive/HAM10000_images_part_1/"
metadata_path = "/Users/liliane.bader/Documents/AU/DLfVR/project/data/archive/HAM10000_metadata.csv"

metadata = pd.read_csv(metadata_path)
metadata['image_path'] = metadata['image_id'].apply(lambda x: os.path.join(image_folder, f"{x}.jpg"))
metadata_part1 = metadata[(metadata['image_id'] > 'ISIC_0024305') & (metadata['image_id'] < 'ISIC_0029306')]

# Load images and preprocess them
image_size = (64, 64)  

def load_and_preprocess_image(image_path):
    img = load_img(image_path, target_size=image_size)
    img = img_to_array(img) / 255.0  # Normalize to [0,1]
    return img

# Load all images into a list and convert to a numpy array
images_part1 = np.array([load_and_preprocess_image(img_path) for img_path in metadata_part1['image_path']])
print(images_part1.shape) 

(5000, 224, 224, 3)


In [28]:
labels_part1 = metadata_part1['dx']
# make dictionary to assign a unique integer index to each unique label 
# string to numerical
label_mapping = {label: idx for idx, label in enumerate(labels_part1.unique())}
# replaces each original label (from dx) with its corresponding integer value
metadata_part1.loc[:, 'label'] = metadata_part1['dx'].map(label_mapping)
# convert integer labels into a binary (one-hot) matrix 
labels_encoded = to_categorical(metadata_part1['label'])
print(metadata_part1['label'].head(10))
print(metadata_part1['dx'].unique())
print(metadata_part1['label'].value_counts())

0     0
1     0
2     0
3     0
5     0
6     0
7     0
8     0
9     0
10    0
Name: label, dtype: int64
['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
label
1    3431
0     564
3     435
5     266
6     183
4      65
2      56
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metadata_part1.loc[:, 'label'] = metadata_part1['dx'].map(label_mapping)


In [18]:
# Split data into training and testing sets
# 80% training, 20% testing
# x = images, y = labels
X_train, X_test, y_train, y_test = train_test_split(images_part1, labels_encoded, test_size=0.2, random_state=42)


# Define a simple baseline model (CNN)

In [23]:
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(image_size[0], image_size[1], 3)),
    MaxPooling2D(pool_size=(2, 2)),
    
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(label_mapping), activation='softmax')  # Output layer for classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [25]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - accuracy: 0.6688 - loss: 1.1748 - val_accuracy: 0.6650 - val_loss: 0.9739
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.7091 - loss: 0.8875 - val_accuracy: 0.6730 - val_loss: 0.8799
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.7192 - loss: 0.8374 - val_accuracy: 0.6690 - val_loss: 0.9108
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.7046 - loss: 0.8056 - val_accuracy: 0.6880 - val_loss: 0.8548
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.7125 - loss: 0.7919 - val_accuracy: 0.6890 - val_loss: 0.8411
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.7341 - loss: 0.7426 - val_accuracy: 0.6910 - val_loss: 0.8218
Epoch 7/10
[1m125/125

<keras.src.callbacks.history.History at 0x35c607910>

In [26]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7163 - loss: 0.7635
Test Accuracy: 0.7200


In [None]:
# Plot training & validation accuracy values
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper left')

plt.show()

# Simple Model using ResNet 

In [2]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, GlobalAveragePooling2D
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [3]:
# Load the metadata
metadata_path = "/Users/liliane.bader/Documents/AU/DLfVR/project/data/archive/HAM10000_metadata.csv"
metadata = pd.read_csv(metadata_path)

# Create a label mapping from 'dx' column
#label_mapping = {label: idx for idx, label in enumerate(metadata['dx'].unique())}
#metadata['label'] = metadata['dx'].map(label_mapping)

# Check the label distribution
#print(metadata['label'].value_counts())
print(metadata['dx'].value_counts())

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


In [None]:
# Load the image data
# Image directories
image_folder_1 = "/Users/liliane.bader/Documents/AU/DLfVR/project/data/archive/HAM10000_images_part_1/"
image_folder_2 = "/Users/liliane.bader/Documents/AU/DLfVR/project/data/archive/HAM10000_images_part_2/"

# Combine paths of images from both parts
metadata['image_path'] = metadata['image_id'].apply(lambda x: 
    os.path.join(image_folder_1, x + ".jpg") 
    if os.path.exists(os.path.join(image_folder_1, x + ".jpg"))
    else os.path.join(image_folder_2, x + ".jpg")
)

#print(metadata)

# Split into train and test sets (80% train, 20% test)
# stratify parameter ensures that the class distribution (dx) is the same in both sets
train_df, test_df = train_test_split(metadata, stratify=metadata['dx'], test_size=0.2)

print(train_df.shape, test_df.shape)

(8012, 8) (2003, 8)


In [None]:
# Setup the ImageDataGenerator
# Image size for ResNet50 (size of the images ResNet50 was pretrained on)
image_size = (224, 224)

# Create ImageDataGenerator for training and testing
# preprocess_input is a function that ensures all images are scaled and normalized according to the standards required by ResNet50
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Flow images from dataframe
# flow_from_dataframe generates batches of images and their corresponding labels directly from a Pandas DataFrame.
train_generator = train_datagen.flow_from_dataframe(
    train_df, 
    x_col='image_path', 
    y_col='dx', 
    target_size=image_size,
    class_mode= 'categorical',
    batch_size=32
)

test_generator = test_datagen.flow_from_dataframe(
    test_df, 
    x_col='image_path', 
    y_col='dx', 
    target_size=image_size, 
    class_mode= 'categorical',
    batch_size=32
)

Found 8012 validated image filenames belonging to 7 classes.
Found 2003 validated image filenames belonging to 7 classes.
<keras.src.legacy.preprocessing.image.ImageDataGenerator object at 0x31041dca0>


In [19]:
# Setup ResnNet50 model
# Load the pre-trained ResNet50 model, excluding the top classification layer
resnet_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the ResNet50 base layers so they won't be trained
for layer in resnet_model.layers:
    layer.trainable = False

# Create a new model on top of ResNet50
model = Sequential([
    resnet_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    Dense(7, activation='softmax')  # Adjust the number of output classes
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Check model summary
model.summary()

In [20]:
# Train the model
# Put data into the model
# Train the model
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=5, 
    steps_per_epoch=len(train_generator),
    validation_steps=len(test_generator)
)

Epoch 1/5


  self._warn_if_super_not_called()


[1m145/251[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m1:29[0m 845ms/step - accuracy: 0.6116 - loss: 1.2295

KeyboardInterrupt: 

In [5]:
# Image directories
image_folder_1 = "/Users/liliane.bader/Documents/AU/DLfVR/project/data/archive/HAM10000_images_part_1/"
image_folder_2 = "/Users/liliane.bader/Documents/AU/DLfVR/project/data/archive/HAM10000_images_part_2/"

# Combine paths of images from both parts
metadata['image_path'] = metadata['image_id'].apply(lambda x: 
    os.path.join(image_folder_1, x + ".jpg") 
    if os.path.exists(os.path.join(image_folder_1, x + ".jpg"))
    else os.path.join(image_folder_2, x + ".jpg")
)

# Split into train and test sets
train_df, test_df = train_test_split(metadata, stratify=metadata['label'], test_size=0.2)

# Image size for ResNet50
image_size = (224, 224)

# Create ImageDataGenerator for training and testing
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Convert integer labels to strings
train_df['label'] = train_df['label'].astype(str)
test_df['label'] = test_df['label'].astype(str)


# Flow images from dataframe
train_generator = train_datagen.flow_from_dataframe(
    train_df, 
    x_col='image_path', 
    y_col='label', 
    target_size=image_size,
    class_mode= 'categorical',
    batch_size=32
)

test_generator = test_datagen.flow_from_dataframe(
    test_df, 
    x_col='image_path', 
    y_col='label', 
    target_size=image_size, 
    class_mode= 'categorical',
    batch_size=32
)


Found 8012 validated image filenames belonging to 7 classes.
Found 2003 validated image filenames belonging to 7 classes.


In [None]:
# Load the pre-trained ResNet50 model, excluding the top classification layer
resnet_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the ResNet50 base layers so they won't be trained
for layer in resnet_model.layers:
    layer.trainable = False

# Create a new model on top of ResNet50
model = Sequential([
    resnet_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    Dense(7, activation='softmax')  # Adjust the number of output classes
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Check model summary
model.summary()


In [8]:
# Train the model
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=5, 
    steps_per_epoch=len(train_generator),
    validation_steps=len(test_generator)
)

Epoch 1/10


KeyboardInterrupt: 

In [None]:
# Evaluate the model
test_loss, test_acc = model.evaluate(test_generator)
print(f"Test Accuracy: {test_acc}")


In [None]:
keras.applications.ResNet50(
    include_top=True,
    weights="imagenet",
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=1000,
    classifier_activation="softmax",
    name="resnet50",
)