In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A Simple Starter Code for the AJL Competition

@Cindy Deng

---


Hi! This starter code is designed to help you get familiar with basic Kaggle operations and guide you through the basic workflow of a machine learning project. 

The code outlines essential steps including data loading, preprocessing, model building, training, and generating predictions. Each section serves as a foundation, but there are many ways to enhance each step to improve your final model's accuracy. Feel free to experiment with different data augmentation techniques, model architectures, and tuning methods to optimize your final results! Some amazing tutorials are available through your AI Studio course in Canvas and in the 'Resources' section of this Kaggle competition.

Good luck and have fun!

---

## Note - About file path

To list every file under `/kaggle/input`, uncomment and run the first cell of this notebook. The output will contain paths like the following:

```
/kaggle/input/bttai-ajl-2025/sample_submission.csv
/kaggle/input/bttai-ajl-2025/train.csv
/kaggle/input/bttai-ajl-2025/test.csv
/kaggle/input/bttai-ajl-2025/test/test/e0374ae6c1362ff183cfba28ded5421b.jpg
/kaggle/input/bttai-ajl-2025/test/test/437159c605260bdd079af230566af291.jpg
...
...
/kaggle/input/bttai-ajl-2025/train/train/dermatomyositis/11271bdf2598afdd4260db3125e1f6a5.jpg
/kaggle/input/bttai-ajl-2025/train/train/dermatomyositis/732819951dcf2b53d15ea7b8bb123b71.jpg
/kaggle/input/bttai-ajl-2025/train/train/dermatomyositis/6dcc7a8abb5e1c6e670101f4b6231246.jpg
/kaggle/input/bttai-ajl-2025/train/train/dermatomyositis/e63c3b3f0ab8905e204fe467cc7411f9.jpg
...
...
```



## 1. Import Necessary Libraries

In [1]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import models, layers, optimizers

from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.applications.densenet import preprocess_input

# Explanation:
# - pandas and numpy: for data manipulation
# - sklearn: for splitting data and encoding labels
# - tensorflow.keras: for building and training the neural network

## 2. Load Data

Make sure to verify the file paths if you're running on a different platform.

In [None]:
# 2. Load Data
# Read the training metadata, turn each md5hash into an image file name
# ("<hash>.jpg"), then derive the image path relative to the training
# folder as "<label>/<hash>.jpg".
train_df = (
    pd.read_csv('/kaggle/input/bttai-ajl-2025/train.csv')
    .assign(md5hash=lambda df: df['md5hash'].astype(str) + '.jpg')
    .assign(file_path=lambda df: df['label'] + '/' + df['md5hash'])
)

# The test metadata only needs the ".jpg" suffix added to md5hash.
test_df = (
    pd.read_csv('/kaggle/input/bttai-ajl-2025/test.csv')
    .assign(md5hash=lambda df: df['md5hash'].astype(str) + '.jpg')
)

In [None]:
# Check the first few rows to understand the structure.
# A bare last expression is rendered by the notebook as a rich table,
# which is easier to read than the plain-text output of print().
train_df.head()

## 3. Data Preprocessing


This section demonstrates basic preprocessing techniques. To enhance data quality and model performance, consider incorporating more advanced preprocessing methods.

For further guidance, feel free to take a look at the [Image Preprocessing tutorial](https://colab.research.google.com/drive/1-ItNcRMbZBE6BCwPT-wD8m3YmHqwHxme?usp=sharing) available in the 'Resources' section of this Kaggle competition.


In [None]:
# Turn the string labels into integer class ids
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# the same on every run
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

# Training and validation images both go through the DenseNet
# preprocess_input function (two separate generator objects, same settings)
train_datagen, val_datagen = (
    ImageDataGenerator(preprocessing_function=preprocess_input)
    for _ in range(2)
)

# Root folder the 'file_path' values ("<label>/<hash>.jpg") are relative to
train_dir = '/kaggle/input/bttai-ajl-2025/train/train/'

In [None]:
def create_generator(dataframe, directory, batch_size=16, target_size=(224, 224),
                     datagen=None, shuffle=True):
    """
    Build a Keras iterator that yields (image batch, label batch) pairs.

    Args:
        dataframe: DataFrame with a 'file_path' column (image path relative
            to `directory`) and an 'encoded_label' column (the target).
        directory: Root folder the 'file_path' values are relative to.
        batch_size: Number of images per batch.
        target_size: (height, width) every image is resized to.
        datagen: ImageDataGenerator to draw batches from. Defaults to the
            notebook-level `train_datagen` when omitted.
        shuffle: Whether the iterator shuffles the rows.

    Returns:
        The iterator returned by `ImageDataGenerator.flow_from_dataframe`.
    """
    if datagen is None:
        datagen = train_datagen

    return datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col='file_path',        # "<label>/<hash>.jpg", relative to `directory`
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',         # pass the y_col values through unchanged
        shuffle=shuffle,
        validate_filenames=False  # skip the up-front filename validation pass
    )

train_generator = create_generator(train_data, train_dir)
# Validation uses its own generator object and a fixed row order
val_generator = create_generator(val_data, train_dir, datagen=val_datagen, shuffle=False)

In [None]:
# ImageNet-pretrained DenseNet201 backbone without its classification head
base_model = DenseNet201(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze every backbone layer except the last 100, so only the top of the
# network is fine-tuned together with the new classification head
for layer in base_model.layers[:-100]:
    layer.trainable = False

# One output unit per class known to the fitted label encoder, so the head
# always matches the integer ids stored in 'encoded_label'
# (the original code hard-coded 21 here)
num_classes = len(label_encoder.classes_)

# Add custom layers for the classification task
model = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(num_classes, activation='softmax'),
])

# Sparse categorical cross-entropy is used because the targets are integer
# class ids rather than one-hot vectors
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Summary of the model
model.summary()

## 5. Train the Model


In [None]:
# Compute class weights so the model doesn't favor classes that appear more
# often in the training dataframe
from sklearn.utils.class_weight import compute_class_weight

# Class ids that are actually present in the training split
class_labels = np.unique(train_data['encoded_label'])

# Compute class weights (balanced approach); the result is ordered like
# `class_labels`
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=train_data['encoded_label']
)

# Key every weight by its class id rather than by its position, so the
# mapping stays correct even if some class id is absent from `train_data`
class_weights_dict = {
    int(label): float(weight)
    for label, weight in zip(class_labels, class_weights)
}
print("Class Weights:", class_weights_dict)

lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',  # Watches validation loss
    factor=0.5,          # Reduce LR by half when triggered
    patience=3,          # Wait 3 epochs before reducing
    min_lr=1e-6          # Minimum LR to prevent zero learning
)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',   # Watches validation loss
    patience=5,           # Stops if no improvement after 5 epochs; larger than the
                          # scheduler's patience so a reduced LR gets a chance to help
    restore_best_weights=True  # Keeps the best model weights
)

# Train the model with callbacks
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=30,
    class_weight=class_weights_dict,  # Class balancing
    callbacks=[lr_scheduler, early_stopping]  # Learning rate scheduler + early stopping
)

In [None]:
# Optional second pass: keep training the same model object for up to 25
# more epochs with the same class weights and callbacks.
# NOTE: this overwrites `history` from the previous training cell.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=25,
    class_weight=class_weights_dict,  # Class balancing
    callbacks=[lr_scheduler, early_stopping]  # Learning rate scheduler + early stopping
)

## 6. Make Predictions on Test Data

In [None]:
# 6. Make Predictions on Test Data
def preprocess_test_data(test_df, directory, batch_size=16, target_size=(224, 224)):
    """
    Build an unlabeled, fixed-order Keras iterator over the test images.

    Args:
        test_df: DataFrame whose 'md5hash' column holds the image file names
            (including the '.jpg' extension) relative to `directory`.
        directory: Folder that contains the test images.
        batch_size: Number of images per batch.
        target_size: (height, width) every image is resized to.

    Returns:
        The iterator returned by `ImageDataGenerator.flow_from_dataframe`.
        It yields image batches only (no labels).
    """
    test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=directory,
        x_col='md5hash',  # file name only; test images are not in label sub-folders
        y_col=None,
        target_size=target_size,
        batch_size=batch_size,
        class_mode=None,   # no targets: yield images only
        shuffle=False,     # keep a fixed row order so predictions can be matched to test_df
        validate_filenames=True  # keep filename validation switched on
    )
    return test_generator

In [None]:
# Point at the folder holding the test images and build the iterator
# that feeds them to the model
test_dir = '/kaggle/input/bttai-ajl-2025/test/test/'
test_generator = preprocess_test_data(test_df=test_df, directory=test_dir)

## 7. Generate Predictions

In [None]:
# Make predictions: one row of per-class scores for each test image
predictions = model.predict(test_generator)

# Every test row needs exactly one prediction; fail loudly otherwise instead
# of writing a misaligned submission
if len(predictions) != len(test_df):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(test_df)} test rows"
    )

# Decode the predicted class labels (index of the highest score per row)
predicted_classes = np.argmax(predictions, axis=1)

# Convert class indices to actual skin condition names
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Build the submission without modifying test_df, so the cells that create
# the test generator can be re-run safely. Only a trailing '.jpg' is removed.
# NOTE(review): confirm that the 'predicted_label' header matches the one in
# sample_submission.csv.
submission_df = pd.DataFrame({
    'md5hash': test_df['md5hash'].str.replace(r'\.jpg$', '', regex=True),
    'predicted_label': predicted_labels,
})

# Save the processed predictions to a CSV file
submission_df.to_csv('submission.csv', index=False)