In [30]:
import os
import random

import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# ========================================================
# 2. Load and Clean CSV Data
# ========================================================
# Reads the train/test metadata, derives the relative image path for each
# training row, removes rows flagged as wrongly labelled, and encodes the
# string labels as integers.
train_csv_path = 'train.csv'
test_csv_path = 'test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Add '.jpg' extension to md5hash so it matches the image file names
train_df['md5hash'] = train_df['md5hash'].astype(str) + '.jpg'
test_df['md5hash'] = test_df['md5hash'].astype(str) + '.jpg'

# Combine label and md5hash to form the file path (e.g. "acne/xyz.jpg")
train_df['file_path'] = train_df['label'] + '/' + train_df['md5hash']

# Set fitzpatrick_scale to the fitzpatrick_centaur values and drop ddi_scale.
# Reassigning (instead of inplace=True) keeps each statement's result explicit.
train_df['fitzpatrick_scale'] = train_df['fitzpatrick_centaur']
test_df['fitzpatrick_scale'] = test_df['fitzpatrick_centaur']
train_df = train_df.drop(columns=['ddi_scale'])
test_df = test_df.drop(columns=['ddi_scale'])

# Remove rows with wrongly labelled data. Rows whose qc is NaN compare as
# "not equal" and are therefore kept. The .copy() makes each result an
# independent frame, so the column assignments below cannot trigger
# SettingWithCopyWarning.
train_df = train_df[train_df['qc'] != '3 Wrongly labelled'].copy()
test_df = test_df[test_df['qc'] != '3 Wrongly labelled'].copy()

# Encode the label column numerically. This encoder is used for `label` only,
# so label_encoder.classes_ / inverse_transform stay valid for predictions.
label_encoder = LabelEncoder()
train_df['label_numerical'] = label_encoder.fit_transform(train_df['label'])

# Store label mapping (integer id -> label name) for later use
label_mapping = dict(enumerate(label_encoder.classes_))

# Encode partition labels with their own encoders; re-fitting label_encoder
# here would overwrite the classes it learned from `label`.
nine_partition_encoder = LabelEncoder()
three_partition_encoder = LabelEncoder()
train_df['nine_partition_numerical'] = nine_partition_encoder.fit_transform(
    train_df['nine_partition_label']
)
train_df['three_partition_numerical'] = three_partition_encoder.fit_transform(
    train_df['three_partition_label']
)

# Drop original label columns (optional)
train_df = train_df.drop(columns=['label', 'three_partition_label', 'nine_partition_label'])

print("Train DataFrame after cleaning:\n", train_df.head())

# ========================================================
# 3. Build the Dataset in Memory
# ========================================================
# Every training image found on disk is resized to 150x150, converted to RGB,
# scaled to [0, 1] and collected together with its integer label. Rows whose
# image file is missing are reported and skipped.
base_image_dir = './train/train'
image_size = (150, 150)

all_images = []
all_labels = []

for relative_path, label_id in zip(train_df['file_path'], train_df['label_numerical']):
    file_path = os.path.join(base_image_dir, relative_path)

    # Guard clause: nothing to load for this row.
    if not os.path.exists(file_path):
        print(f"Warning: File not found {file_path}")
        continue

    with Image.open(file_path) as img:
        # Resize first, then force three channels.
        rgb_img = img.resize(image_size).convert('RGB')
        # float32 pixels normalised to [0, 1]
        img_array = np.array(rgb_img, dtype=np.float32) / 255.0

    all_images.append(img_array)
    all_labels.append(label_id)

# Stack the per-image arrays into one array, and the labels into another
X = np.array(all_images, dtype=np.float32)
y = np.array(all_labels, dtype=np.int32)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# ========================================================
# 4. Split into Training and Validation Sets
# ========================================================
# Hold out 20% of the samples for validation. The split is stratified on y
# and uses a fixed random_state so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# ========================================================
# 5. Build a Simple CNN Model
# ========================================================
# Size the softmax layer from the full label vocabulary rather than from the
# labels that happened to load: if every image of some class were missing on
# disk, len(np.unique(y)) would be smaller than the largest label id + 1 and
# the sparse cross-entropy targets would fall outside the output layer.
num_classes = len(label_mapping)

# Take the input shape from the data so it always matches what was loaded,
# and declare it with an explicit Input layer (Keras warns when input_shape
# is passed to the first layer of a Sequential model).
input_shape = X_train.shape[1:]

model = Sequential([
    Input(shape=input_shape),
    # Four conv/pool stages with increasing filter counts
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    # Classifier head
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',  # Because y is integer-labeled
    metrics=['accuracy']
)

model.summary()

# ========================================================
# 6. Train and Save the Model
# ========================================================
# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the weights from its best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=40,
    batch_size=15,
    # The callback has to be passed here; creating it alone has no effect
    # and training would always run the full 40 epochs.
    callbacks=[early_stop],
)

# Persist the trained model to disk under this file name.
model.save('skin_condition_model.h5')
print("Model saved as 'skin_condition_model.h5'")

Train DataFrame after cleaning:
                                 md5hash  fitzpatrick_scale  \
0  fd06d13de341cc75ad679916c5d7e6a6.jpg                  4   
1  a4bb4e5206c4e89a303f470576fc5253.jpg                  1   
2  c94ce27e389f96bda998e7c3fa5c4a2e.jpg                  5   
3  ebcf2b50dd943c700d4e2b586fcd4425.jpg                  3   
4  c77d6c895f05fea73a8f3704307036c0.jpg                  1   

   fitzpatrick_centaur            qc  \
0                    4           NaN   
1                    1           NaN   
2                    5  1 Diagnostic   
3                    3           NaN   
4                    1           NaN   

                                           file_path  label_numerical  \
0  prurigo-nodularis/fd06d13de341cc75ad679916c5d7...               16   
1  basal-cell-carcinoma-morpheiform/a4bb4e5206c4e...                4   
2        keloid/c94ce27e389f96bda998e7c3fa5c4a2e.jpg               12   
3  basal-cell-carcinoma/ebcf2b50dd943c700d4e2b586...         

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/40
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 196ms/step - accuracy: 0.1211 - loss: 2.9241 - val_accuracy: 0.1416 - val_loss: 2.8648
Epoch 2/40
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 242ms/step - accuracy: 0.1222 - loss: 2.8821 - val_accuracy: 0.1399 - val_loss: 2.8489
Epoch 3/40
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 244ms/step - accuracy: 0.1370 - loss: 2.8434 - val_accuracy: 0.1416 - val_loss: 2.8154
Epoch 4/40
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 177ms/step - accuracy: 0.1481 - loss: 2.8284 - val_accuracy: 0.1608 - val_loss: 2.7973
Epoch 5/40
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 231ms/step - accuracy: 0.1547 - loss: 2.7892 - val_accuracy: 0.1661 - val_loss: 2.7214
Epoch 6/40
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 233ms/step - accuracy: 0.1574 - loss: 2.7215 - val_accuracy: 0.1853 - val_loss: 2.6863
Epoch 7/40



Model saved as 'skin_condition_model.h5'


In [31]:
from tensorflow.keras.models import load_model

# Re-read the test metadata from disk for this cell.
test_csv_path = 'test.csv'
test_df = pd.read_csv(test_csv_path)

# Test images are addressed as "<md5hash>.jpg" relative to the test directory;
# unlike the training paths, no label subfolder is prepended.
test_df['file_path'] = test_df['md5hash'].astype(str) + '.jpg'

# Test image directory and target size
base_test_dir = './test/test'
image_size = (150, 150)

# Images that load successfully, and their file names in the same order
test_images = []
test_filenames = []

for file_name in test_df['file_path']:
    file_path = os.path.join(base_test_dir, file_name)

    # Guard clause: report the missing file and skip it.
    if not os.path.exists(file_path):
        print(f"Warning: Test file not found {file_path}")
        continue

    with Image.open(file_path) as img:
        # Resize, force RGB, then scale pixel values to [0, 1]
        rgb_img = img.resize(image_size).convert('RGB')
        img_array = np.array(rgb_img, dtype=np.float32) / 255.0

    test_images.append(img_array)
    test_filenames.append(file_name)

# Stack into a single float32 array for prediction
X_test = np.array(test_images, dtype=np.float32)

print(f"Loaded {X_test.shape[0]} test images.")

# ========================================================
# Load Model and Make Predictions
# ========================================================
# Relies on X_test / test_filenames from the loading step above and on
# label_mapping from the data-cleaning cell, which must have been run first.
model = load_model('skin_condition_model.h5')

predictions = model.predict(X_test)

# Most probable class id per image (computed once and reused below)
predicted_ids = np.argmax(predictions, axis=1)

# Debug: Print unique labels in predictions
unique_predicted_labels = np.unique(predicted_ids)
print("Unique Predicted Labels:", unique_predicted_labels)
print("Label Mapping Keys:", label_mapping.keys())

# Convert numerical predictions back to their original disease names
predicted_labels = [label_mapping.get(int(label), "Unknown") for label in predicted_ids]

# Pair each prediction with the image it was made for. test_filenames holds
# "<md5hash>.jpg" for exactly the images that were loaded, in X_test order.
# Using test_df['md5hash'] instead would raise a length-mismatch error as
# soon as a single test file had been skipped.
extension = '.jpg'
predicted_hashes = [name[:-len(extension)] for name in test_filenames]

skipped_count = len(test_df) - len(predicted_hashes)
if skipped_count:
    print(f"Warning: {skipped_count} test rows have no prediction (image not loaded)")

# Store results in DataFrame
submission_df = pd.DataFrame({
    'md5hash': predicted_hashes,
    'label': predicted_labels
})

# Save predictions to CSV
submission_df.to_csv('predictions.csv', index=False)
print("Predictions saved to 'predictions.csv'")



Loaded 1227 test images.
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step
Unique Predicted Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
Label Mapping Keys: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])
Predictions saved to 'predictions.csv'


In [23]:
# Show the integer-id -> label-name mapping built in the data-cleaning cell.
print(f"Label Mapping: {label_mapping}")

Label Mapping: {0: 'acne', 1: 'acne-vulgaris', 2: 'actinic-keratosis', 3: 'basal-cell-carcinoma', 4: 'basal-cell-carcinoma-morpheiform', 5: 'dermatofibroma', 6: 'dermatomyositis', 7: 'dyshidrotic-eczema', 8: 'eczema', 9: 'epidermal-nevus', 10: 'folliculitis', 11: 'kaposi-sarcoma', 12: 'keloid', 13: 'malignant-melanoma', 14: 'melanoma', 15: 'mycosis-fungoides', 16: 'prurigo-nodularis', 17: 'pyogenic-granuloma', 18: 'seborrheic-keratosis', 19: 'squamous-cell-carcinoma', 20: 'superficial-spreading-melanoma-ssm'}
