In [322]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, UnidentifiedImageError
from PIL import Image, ImageOps
from tensorflow.keras import layers, models
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import tensorflow as tf
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [323]:
import os
from PIL import Image, UnidentifiedImageError

# 图片所在的根目录
root_dir = 'trafficsigns_dataset'  # 请替换成实际的路径

# 用于存储按两层标签分隔的图片的嵌套字典
images_by_label = {}

# 用于遍历数据集并加载图像及其标签的函数
def load_images_and_labels(root_dir):
    # 遍历根目录下的每个子目录（第一层）
    for sub_dir in os.listdir(root_dir):
        sub_dir_path = os.path.join(root_dir, sub_dir)
        
        # 如果子目录确实是一个目录
        if os.path.isdir(sub_dir_path):
            # 初始化第一层标签键
            images_by_label[sub_dir] = {}
            
            # 再次遍历该子目录下的文件夹（第二层，具体的标签）
            for label_dir in os.listdir(sub_dir_path):
                label_dir_path = os.path.join(sub_dir_path, label_dir)
                
                # 如果第二层也是一个目录
                if os.path.isdir(label_dir_path):
                    # 初始化第二层标签键
                    images_by_label[sub_dir][label_dir] = []
                    
                    # 遍历第二层目录下的所有图像文件
                    for image_filename in os.listdir(label_dir_path):
                        # 忽略 .DS_Store 文件
                        if image_filename == '.DS_Store':
                            continue
                        image_path = os.path.join(label_dir_path, image_filename)
                        try:
                            images_by_label[sub_dir][label_dir].append(image_path)
                        except UnidentifiedImageError:
                            # 如果无法识别图像，打印出错信息
                            print(f"Cannot identify image file '{image_path}'")

# 载入图像数据和标签
load_images_and_labels(root_dir)

# 打印每个类别的图像数量
for sub_dir, labels in images_by_label.items():
    for label, images in labels.items():
        print(f"{sub_dir}/{label}: {len(images)} images")


diamond/rightofway: 282 images
hex/stop: 43 images
square/laneend: 118 images
square/parking: 276 images
square/continue: 199 images
square/crossing: 95 images
triangle/giveway: 231 images
round/traveldirection: 124 images
round/limitedtraffic: 125 images
round/speed: 316 images
round/roundabout: 98 images
round/noentry: 375 images
round/noparking: 242 images
round/bicycle: 285 images
round/trafficdirective: 195 images


28no Unusual

In [324]:
def get_image_sizes(root_dir):
    sizes = []
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith('.png') or file.lower().endswith('.jpg'):
                try:
                    with Image.open(os.path.join(subdir, file)) as img:
                        sizes.append(img.size)
                except (IOError, UnidentifiedImageError):
                    continue
    return sizes

image_sizes = get_image_sizes(root_dir)

sizes_np = np.array(image_sizes)
min_size = sizes_np.min(axis=0)
max_size = sizes_np.max(axis=0)
mean_size = sizes_np.mean(axis=0)
std_dev_size = sizes_np.std(axis=0)
median_size = np.median(sizes_np, axis=0)

min_size, max_size, mean_size, std_dev_size, median_size

(array([28, 28]),
 array([28, 28]),
 array([28., 28.]),
 array([0., 0.]),
 array([28., 28.]))

In [325]:
train_data = {}
test_data = {}
validation_data = {}

for first_level, second_level_dict in images_by_label.items():
    train_data[first_level] = {}
    validation_data[first_level] = {}
    test_data[first_level] = {}
    
    for label, image_paths in second_level_dict.items():
        temp_images, test_images = train_test_split(image_paths, test_size=0.2, random_state=42)
        train_images, val_images = train_test_split(temp_images, test_size=0.2, random_state=42)
        
        # Save split data
        train_data[first_level][label] = train_images
        test_data[first_level][label] = test_images
        validation_data[first_level][label] = val_images


In [326]:
def create_dataframe_for_16_class(data_dict):
    rows = []
    for first_level, second_level_dict in data_dict.items():
        for label, image_paths in second_level_dict.items():
            for path in image_paths:
                rows.append({'ImagePath': path, 'Label': label})
    return pd.DataFrame(rows)

def create_dataframe_for_5_class(data_dict):
    rows = []
    for first_level, second_level_dict in data_dict.items():
        for label, image_paths in second_level_dict.items():
            for path in image_paths:
                rows.append({'ImagePath': path, 'Label': first_level})
    return pd.DataFrame(rows)

train_df = create_dataframe_for_16_class(train_data)
test_df = create_dataframe_for_16_class(test_data)
validation_df = create_dataframe_for_16_class(validation_data)

In [327]:
batch_size = 64

In [328]:
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='ImagePath',
    y_col='Label',
    color_mode='grayscale',
    target_size=(28, 28),  
    batch_size=batch_size,
    class_mode='categorical',  # 如果是多分类问题
    shuffle=True
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=validation_df,
    x_col='ImagePath',
    y_col='Label',
    color_mode='grayscale',
    target_size=(28, 28),
    batch_size=batch_size,
    class_mode='categorical',  # 如果是多分类问题
    shuffle=False
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='ImagePath',
    y_col='Label',
    color_mode='grayscale',
    target_size=(28, 28),
    batch_size=batch_size,
    class_mode='categorical',  # 如果是多分类问题
    shuffle=False
)

Found 2358 validated image filenames belonging to 16 classes.
Found 596 validated image filenames belonging to 16 classes.
Found 745 validated image filenames belonging to 16 classes.


## Transfer data to suit scikit-learn

In [329]:
import numpy as np

def get_data_from_generator(generator):
    batches = []
    labels = []
    # 迭代生成器收集数据和标签
    for batch, label in generator:
        batches.append(batch)
        labels.append(label)
        if len(batches) >= len(generator):
            break  # 确保不会无限迭代
    return np.vstack(batches), np.vstack(labels)


X_train, y_train = get_data_from_generator(train_generator)
X_val, y_val = get_data_from_generator(val_generator)
X_test, y_test = get_data_from_generator(test_generator)

## Decision Tree

In [330]:
# 假设X_train和X_val原始形状为[样本数, 28, 28, 1] - 例如从图像生成器中提取的

# 重塑X_train和X_val为二维数组，每个图像一行
X_train = X_train.reshape(X_train.shape[0], -1)  # -1会根据剩余维度计算所需大小
X_val = X_val.reshape(X_val.shape[0], -1)

# 如果使用get_data_from_generator函数提取数据，确保先调整函数或数据后重塑
# 示例，确保数据从生成器正确提取和转换后
X_train, y_train = get_data_from_generator(train_generator)
X_train = X_train.reshape(X_train.shape[0], -1)

X_val, y_val = get_data_from_generator(val_generator)
X_val = X_val.reshape(X_val.shape[0], -1)

# 训练决策树模型
tree_model = DecisionTreeClassifier(max_depth=10)
tree_model.fit(X_train, y_train)

# 在验证集上评估模型
y_pred_val = tree_model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))

Validation Accuracy: 0.7466442953020134


## Random Forest

In [331]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 初始化随机森林模型
rf_model = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=42)

# 训练模型
rf_model.fit(X_train, y_train)

# 使用验证集进行模型评估
y_pred_val = rf_model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))

Validation Accuracy: 0.7684563758389261


## SVM

In [332]:
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# 假设 y_train 和 y_val 是 one-hot 编码的，形状为 (样本数, 类别数)
y_train = np.argmax(y_train, axis=1)
y_val = np.argmax(y_val, axis=1)

# 初始化并应用标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 初始化 SVM 模型
svm_model = SVC(kernel='rbf', C=1.0, gamma='auto', max_iter=1000)

# 训练 SVM 模型
svm_model.fit(X_train_scaled, y_train)

# 在验证集上评估 SVM 模型
y_pred_val = svm_model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))

Validation Accuracy: 0.9379194630872483


## MLP Baseline Model for Multi-class Task


In [333]:
num_classes = 16
input_shape = (28, 28)
baseline_categorical = tf.keras.models.Sequential([
    tf.keras.Input(shape=input_shape),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [334]:
baseline_categorical.compile(loss='categorical_crossentropy',
                        optimizer=tf.keras.optimizers.Adam(),
                        metrics=['accuracy'])

In [335]:
epochs = 50
history_baseline_categorical = baseline_categorical.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs
)

Epoch 1/50


  self._warn_if_super_not_called()


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 53ms/step - accuracy: 0.3657 - loss: 2.1895 - val_accuracy: 0.7651 - val_loss: 1.0377
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.7660 - loss: 0.9662 - val_accuracy: 0.8859 - val_loss: 0.6385
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8624 - loss: 0.6197 - val_accuracy: 0.8943 - val_loss: 0.5007
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8979 - loss: 0.4958 - val_accuracy: 0.9111 - val_loss: 0.4385
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9011 - loss: 0.4347 - val_accuracy: 0.9329 - val_loss: 0.3573
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9323 - loss: 0.3251 - val_accuracy: 0.9245 - val_loss: 0.3512
Epoch 7/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━

## Modify images to make classify more difficult

In [336]:
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    rotation_range=30,  # 旋转范围（角度），可以调整范围，如 20-40
    width_shift_range=0.1,  # 横向平移范围
    height_shift_range=0.1,  # 纵向平移范围
    shear_range=0.1,  # 剪切范围
    zoom_range=0.1,  # 缩放范围
    horizontal_flip=True,  # 水平翻转
)
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col='ImagePath',
    y_col='Label',
    color_mode='grayscale',
    target_size=(28, 28),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=True
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=validation_df,
    x_col='ImagePath',
    y_col='Label',
    color_mode='grayscale',
    target_size=(28, 28),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False
)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='ImagePath',
    y_col='Label',
    color_mode='grayscale',
    target_size=(28, 28),
    batch_size=batch_size,
    class_mode='categorical',  
    shuffle=False
)

Found 2358 validated image filenames belonging to 16 classes.
Found 596 validated image filenames belonging to 16 classes.
Found 745 validated image filenames belonging to 16 classes.


## SVM Model with modified dataset

In [337]:
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

def get_data_from_generator(generator):
    batches = []
    labels = []
    # 迭代生成器收集数据和标签
    for batch, label in generator:
        batches.append(batch)
        labels.append(label)
        if len(batches) >= len(generator):
            break  # 确保不会无限迭代
    return np.vstack(batches), np.vstack(labels)

X_train, y_train = get_data_from_generator(train_generator)
X_val, y_val = get_data_from_generator(val_generator)
X_test, y_test = get_data_from_generator(test_generator)

# 重塑数据以适应 StandardScaler 的输入需求
X_train = X_train.reshape(X_train.shape[0], -1)
X_val = X_val.reshape(X_val.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# 假设 y_train 和 y_val 是 one-hot 编码的，形状为 (样本数, 类别数)
y_train = np.argmax(y_train, axis=1)
y_val = np.argmax(y_val, axis=1)

# 初始化并应用标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 初始化 SVM 模型
svm_model = SVC(kernel='rbf', C=1.0, gamma='auto', max_iter=1000)

# 训练 SVM 模型
svm_model.fit(X_train_scaled, y_train)

# 在验证集上评估 SVM 模型
y_pred_val = svm_model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))


Validation Accuracy: 0.7063758389261745


## MLP Model with modified dataset

In [338]:
num_classes = 16
input_shape = (28, 28)
baseline_categorical = tf.keras.models.Sequential([
    tf.keras.Input(shape=input_shape),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

In [339]:
baseline_categorical.compile(loss='binary_crossentropy',
                        optimizer=tf.keras.optimizers.Adam(),
                        metrics=['accuracy'])

In [340]:
epochs = 50
history_baseline_categorical = baseline_categorical.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs
)

Epoch 1/50


  self._warn_if_super_not_called()


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.1752 - loss: 0.3059 - val_accuracy: 0.3154 - val_loss: 0.2028
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.3131 - loss: 0.2156 - val_accuracy: 0.4413 - val_loss: 0.1769
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.3754 - loss: 0.1986 - val_accuracy: 0.5369 - val_loss: 0.1641
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.4152 - loss: 0.1881 - val_accuracy: 0.5772 - val_loss: 0.1541
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.4211 - loss: 0.1828 - val_accuracy: 0.5487 - val_loss: 0.1528
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.4593 - loss: 0.1741 - val_accuracy: 0.6057 - val_loss: 0.1424
Epoch 7/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━

## CNN Model for Multi Class

In [341]:
from tensorflow.keras import models, layers

num_classes = 16  # 根据你的任务调整类别数

cnn_model = models.Sequential([
    layers.Input(shape=(28, 28, 1)),  # 灰度图像输入，1 个通道
    layers.Conv2D(32, (3, 3), padding='same'),  # 第一卷积层，32 个滤波器
    layers.BatchNormalization(),  # 添加批归一化
    layers.Activation('relu'),  # 激活层
    layers.MaxPooling2D((2, 2)),  # 最大池化
    layers.Conv2D(64, (3, 3), padding='same'),  # 第二卷积层，64 个滤波器
    layers.BatchNormalization(),  # 添加批归一化
    layers.Activation('relu'),  # 激活层
    layers.MaxPooling2D((2, 2)),  # 最大池化
    layers.Conv2D(128, (3, 3), padding='same'),  # 第三卷积层，128 个滤波器
    layers.BatchNormalization(),  # 添加批归一化
    layers.Activation('relu'),  # 激活层
    layers.MaxPooling2D((2, 2)),  # 最大池化
    layers.Flatten(),  # 将卷积结果展平
    layers.Dense(512),  # 全连接层
    layers.BatchNormalization(),  # 添加批归一化
    layers.Activation('relu'),  # 激活层
    layers.Dense(num_classes, activation='softmax')  # 输出层，使用 softmax 激活
])

cnn_model.compile(
    loss='categorical_crossentropy',  # 使用多分类交叉熵损失
    optimizer=tf.keras.optimizers.Adam(),  # Adam 优化器
    metrics=['accuracy']  # 使用准确率作为指标
)


In [342]:
epochs = 50
history_cnn = cnn_model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs
)


Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 199ms/step - accuracy: 0.4646 - loss: 1.8736 - val_accuracy: 0.1191 - val_loss: 2.7801
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.8578 - loss: 0.4872 - val_accuracy: 0.0822 - val_loss: 3.3375
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.8991 - loss: 0.3150 - val_accuracy: 0.0772 - val_loss: 4.1155
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9279 - loss: 0.2529 - val_accuracy: 0.0856 - val_loss: 3.4767
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.9408 - loss: 0.2136 - val_accuracy: 0.1242 - val_loss: 3.6376
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.9454 - loss: 0.1680 - val_accuracy: 0.2148 - val_loss: 3.3484
Epoch 7/50
[1m37/37[0m [32m━━