In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization, Activation



2025-06-20 01:39:27.792547: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750383568.256285      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750383568.366777      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load CSV file
df = pd.read_csv('/kaggle/input/ai-1904-dpl-302-m-butterfly-image-classification/Training_set.csv')

# Encode labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Số lớp phân loại
num_classes = df['label_encoded'].nunique()


In [3]:
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_encoded'], random_state=42)

In [4]:
# ImageDataGenerator with data augmentation for training
train_gen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=45,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'
)

# Only rescaling for validation data
val_gen = ImageDataGenerator(rescale=1./255)

# Create train data generator
train_data = train_gen.flow_from_dataframe(
    dataframe=train_df,
    directory='/kaggle/input/ai-1904-dpl-302-m-butterfly-image-classification/train/train',
    x_col='filename',
    y_col='label',
    target_size=(300, 300),
    class_mode='categorical',
    batch_size=32,
    shuffle=True
)

# Create validation data generator
val_data = val_gen.flow_from_dataframe(
    dataframe=val_df,
    directory='/kaggle/input/ai-1904-dpl-302-m-butterfly-image-classification/train/train',
    x_col='filename',
    y_col='label',
    target_size=(300, 300),
    class_mode='categorical',
    batch_size=32,
    shuffle=False
)


Found 4000 validated image filenames belonging to 75 classes.
Found 1000 validated image filenames belonging to 75 classes.


In [5]:
base_model = EfficientNetB3(weights='imagenet', include_top=False, input_shape=(300, 300, 3))

x = GlobalAveragePooling2D()(base_model.output)
x = Dense(512)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)

x = Dense(256)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.3)(x)

x = Dense(128)(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.1)(x)

output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=output)

# Compile model
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])


I0000 00:00:1750383602.143640      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1750383602.144389      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb3_notop.h5
[1m43941136/43941136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [6]:

callbacks = [
    EarlyStopping(
        monitor='val_accuracy',             # Theo dõi độ chính xác trên tập validation
        patience=15,                        # Dừng nếu không cải thiện sau 10 epoch
        restore_best_weights=True,
        verbose=1
    ),
    ModelCheckpoint(
        filepath='best_butterfly_model.keras',  # Tên file lưu mô hình tốt nhất
        monitor='val_accuracy',                 # Lưu mô hình khi val_accuracy tốt hơn
        save_best_only=True,
        save_weights_only=False,                # Lưu toàn bộ mô hình (not just weights)
        mode='max',
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_accuracy',                 # Giảm learning rate khi val acc không cải thiện
        factor=0.5,
        patience=5,
        min_lr=1e-6,
        verbose=1
    )
]
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=70,
    callbacks=callbacks
)


  self._warn_if_super_not_called()


Epoch 1/70


I0000 00:00:1750383696.888775      98 service.cc:148] XLA service 0x78dbd0001ee0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1750383696.890280      98 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1750383696.890302      98 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1750383705.447497      98 cuda_dnn.cc:529] Loaded cuDNN version 90300
E0000 00:00:1750383727.080430      98 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1750383727.237709      98 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
E0000 00:00:1750383727.829005      98 gpu_timer.cc:82] Delay kernel timed out: measured time has sub-optimal accuracy. Th

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - accuracy: 0.0220 - loss: 4.5629
Epoch 1: val_accuracy improved from -inf to 0.01600, saving model to best_butterfly_model.keras
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 1s/step - accuracy: 0.0221 - loss: 4.5620 - val_accuracy: 0.0160 - val_loss: 4.3380 - learning_rate: 1.0000e-04
Epoch 2/70
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 842ms/step - accuracy: 0.0752 - loss: 4.0561
Epoch 2: val_accuracy improved from 0.01600 to 0.03100, saving model to best_butterfly_model.keras
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 895ms/step - accuracy: 0.0754 - loss: 4.0551 - val_accuracy: 0.0310 - val_loss: 4.3530 - learning_rate: 1.0000e-04
Epoch 3/70
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 826ms/step - accuracy: 0.2023 - loss: 3.4944
Epoch 3: val_accuracy improved from 0.03100 to 0.06100, saving model to best_butterfly_mod

In [15]:
from tensorflow.keras.models import load_model

# Load mô hình tốt nhất đã lưu
model = load_model("/kaggle/working/best_butterfly_model.keras")

In [16]:
import os
from tensorflow.keras.preprocessing import image

# Lấy danh sách file ảnh test
test_dir = "/kaggle/input/ai-1904-dpl-302-m-butterfly-image-classification/test/test"
test_files = [f for f in os.listdir(test_dir) if f.endswith(".jpg")]

# Tạo danh sách ảnh đã resize và chuẩn hóa
img_array = []
img_ids = []

for fname in test_files:
    img_path = os.path.join(test_dir, fname)
    img = image.load_img(img_path, target_size=(300, 300))
    img_tensor = image.img_to_array(img) / 255.0
    img_array.append(img_tensor)
    img_ids.append(fname)

img_array = np.array(img_array)


In [17]:
# Dự đoán
predictions = model.predict(img_array)

# Chuyển sang index class
predicted_indices = np.argmax(predictions, axis=1)

# Giải mã lại thành tên loài bướm
predicted_labels = le.inverse_transform(predicted_indices)


[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 332ms/step


In [18]:
# Tạo DataFrame kết quả
results_df = pd.DataFrame({
    "ID": img_ids,
    "label": predicted_labels
})

# Xem thử kết quả
print(results_df.head())

# Lưu ra CSV nếu muốn
results_df.to_csv("predictions.csv", index=False)


              ID                   label
0  Image_747.jpg         ELBOWED PIERROT
1  Image_561.jpg  MILBERTS TORTOISESHELL
2  Image_345.jpg              PINE WHITE
3  Image_844.jpg        MANGROVE SKIPPER
4  Image_270.jpg      CHECQUERED SKIPPER
