In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [23]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [84]:
data_df = pd.read_csv('/content/gdrive/MyDrive/csv_file_for_dataset-HAM10000/HAM10000_metadata.csv')

In [85]:
data_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear
...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face


In [86]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     10015 non-null  object 
 1   image_id      10015 non-null  object 
 2   dx            10015 non-null  object 
 3   dx_type       10015 non-null  object 
 4   age           9958 non-null   float64
 5   sex           10015 non-null  object 
 6   localization  10015 non-null  object 
dtypes: float64(1), object(6)
memory usage: 547.8+ KB


In [87]:
#Thay thế các giá trị thiếu trong cột age = giá trị trung bình
mean_age = round(data_df['age'].mean(), 0)
data_df['age'] = data_df['age'].fillna(mean_age)

In [88]:
for col in data_df.columns :
  missing_data = data_df[col].isnull().sum()
  missing_percent = (missing_data/len(data_df))*100
  print(f' Column {col} has : {missing_percent.round(2)}% missing data')

 Column lesion_id has : 0.0% missing data
 Column image_id has : 0.0% missing data
 Column dx has : 0.0% missing data
 Column dx_type has : 0.0% missing data
 Column age has : 0.0% missing data
 Column sex has : 0.0% missing data
 Column localization has : 0.0% missing data


In [89]:
# 2. Chia Train (70%) và Temp (30%)
train_df, temp_df = train_test_split(
    data_df,
    test_size=0.30,
    stratify=data_df['dx'],
    random_state=0
)

# 3. Chia Temp thành Validation (15%) và Test (15%)
# 15/30 = 0.5
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df['dx'],
    random_state=0
)

# 4. Lưu file
train_df['image_id'] = train_df['dx'] + "/" + train_df['image_id'] + ".jpg"
val_df['image_id'] = val_df['dx'] + "/" + val_df['image_id'] + ".jpg"
test_df['image_id'] = test_df['dx'] + "/" + test_df['image_id'] + ".jpg"
train_df.to_csv("train_df.csv", index=False)
val_df.to_csv("val_df.csv", index=False)
test_df.to_csv("test_df.csv", index=False)

In [90]:
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory="/content/gdrive/MyDrive/Dataset-HAM10000",
    x_col="image_id",
    y_col="dx",
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical",
    shuffle=True
)

test_val_datagen = ImageDataGenerator(rescale=1./255)

# ------------------ VALIDATION GENERATOR ------------------
val_generator = test_val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory="/content/gdrive/MyDrive/Dataset-HAM10000/",
    x_col="image_id",
    y_col="dx",
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical",
    shuffle=False                 # Không shuffle để giữ thứ tự
)

# ------------------ TEST GENERATOR ------------------
test_generator = test_val_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory="/content/gdrive/MyDrive/Dataset-HAM10000/",
    x_col="image_id",
    y_col="dx",
    target_size=(224, 224),
    batch_size=32,
    class_mode="categorical",
    shuffle=False                 # Không shuffle để dự đoán chính xác
)


Found 7010 validated image filenames belonging to 7 classes.
Found 1502 validated image filenames belonging to 7 classes.
Found 1503 validated image filenames belonging to 7 classes.
