## 判断是否纯净文件夹的模型训练


|关键点|优化方法|
|---|---|
|**数据预处理**|HSV 色彩增强、边缘检测|
|**数据增强**|纯净图案：仿射变换；非纯净图案：遮挡+噪声|
|**模型微调**|冻结预训练骨干网络的底层，微调高层（最初计划使用 VGG16，下文代码实际使用 ResNet50）|
|**评估**|混淆矩阵 + 可视化预测|
### 面临挑战

按用途分类时的挑战
1. 许多被标记为“纯净”的图案实际上并不纯净，上面通常点缀着均匀分布的形状或纹理。
2. 有的形状较为特殊，例如半折叠的文件（如图片）。这类形状作为纯净背景时，主体一般镶嵌在中间；如何让机器区分这类图片中哪些是纯净的、哪些是已经镶嵌了主体的，似乎比较困难。

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

import os
import shutil

In [None]:
# Mount Google Drive so the image folders stored there are reachable from Colab.
from google.colab import drive

drive.mount('/content/drive')

# Root folder of the icon images on Google Drive and its train/validate subfolder.
base_dir = '/content/drive/MyDrive/data/folder-icon-images/'
drive_train_validate_dir = f"{base_dir}train_validate"

### 准备数据

In [None]:
# Local dataset root; flow_from_directory later reads its class subfolders.
local_image_dir = "/content/data/is_pure/"
standard_dir = f"{local_image_dir}pure"  # folder for the "pure" class
non_standard_dir = f"{local_image_dir}non_pure"  # folder for the "non_pure" class

### 数据处理与划分

#### 图像颜色和纹理增强

In [None]:
import cv2
def preprocess_image(image):
    """Equalize the saturation and value histograms of an RGB image.

    Args:
        image: RGB image as a NumPy array of shape (H, W, 3) with pixel values
            in the 0-255 range. Float arrays are accepted; Keras'
            ImageDataGenerator passes float arrays to preprocessing_function.

    Returns:
        The equalized RGB image, same shape and dtype as the input, with
        values still in the 0-255 range.
    """
    # Work in 8-bit: cv2.equalizeHist only accepts 8-bit single-channel input,
    # and the 8-bit HSV conversion keeps S and V in 0-255. Converting a float
    # image directly yields S in 0-1, which truncating to uint8 would destroy.
    rgb_u8 = image.clip(0, 255).astype('uint8')
    hue, saturation, value = cv2.split(cv2.cvtColor(rgb_u8, cv2.COLOR_RGB2HSV))
    equalized_hsv = cv2.merge(
        (hue, cv2.equalizeHist(saturation), cv2.equalizeHist(value))
    )
    # Back to RGB, restoring the caller's dtype so later scaling still works.
    return cv2.cvtColor(equalized_hsv, cv2.COLOR_HSV2RGB).astype(image.dtype)

# Grayscale is sufficient when only the texture matters and colour does not.
def preprocess_gray_image(image):
    """Return the single-channel grayscale conversion of an RGB image."""
    return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

In [None]:
# Training pipeline: random horizontal/vertical flips as augmentation, the
# custom HSV equalisation, and 1/255 rescaling. 30% of the files are held out.
pure_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='reflect',
    preprocessing_function=preprocess_image,  # custom preprocessing
    validation_split=0.3
)

# Validation pipeline: same preprocessing and same split ratio, but without the
# random flips, so validation metrics are not perturbed by augmentation.
validation_datagen = ImageDataGenerator(
    rescale=1./255,
    preprocessing_function=preprocess_image,
    validation_split=0.3
)

# Training-set generator.
train_generator = pure_datagen.flow_from_directory(
    local_image_dir,
    target_size=(224, 224),  # images are resized to the 224x224 model input
    batch_size=32,
    class_mode='binary',  # binary for two classes; categorical for more
    subset='training'
)

# Validation-set generator (the held-out 30% of the same directory).
validation_generator = validation_datagen.flow_from_directory(
    local_image_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='validation'
)
print(f"找到的训练样本数: {train_generator.samples}")
print(f"找到的验证样本数: {validation_generator.samples}")

# Pull one batch as a sanity check of the tensor shapes.
images, labels = next(train_generator)
print("图像形状:", images.shape)  # expected: (batch_size, 224, 224, 3)
print("标签形状:", labels.shape)

### 模型创建

In [None]:
# Load a ResNet50 backbone with ImageNet weights (no classifier head) and
# freeze its first 100 layers so that only the later layers are fine-tuned.
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.optimizers import AdamW

base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
frozen_layer_count = 100
for frozen_layer in base_model.layers[:frozen_layer_count]:
    frozen_layer.trainable = False

# Binary classification head stacked on top of the backbone.
model = Sequential()
model.add(base_model)
model.add(Flatten())
# L2-regularised dense layer followed by heavy dropout.
model.add(Dense(1024, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.6))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: one probability-like score per image.
model.add(Dense(1, activation='sigmoid'))

optimizer = AdamW(learning_rate=1e-4, weight_decay=1e-4)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### 模型训练

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Integer class label of every training sample, as reported by the generator.
y_train = train_generator.classes

# "balanced" weights are inversely proportional to each class's frequency.
class_labels = np.unique(y_train)
balanced_weights = compute_class_weight(
    'balanced',
    classes=class_labels,
    y=y_train
)

# Key each weight by its actual class label rather than by its position, so
# the mapping stays correct even if the labels present are not exactly 0..k-1.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(class_labels, balanced_weights)
}
print("类别权重:", class_weights)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Halve the learning rate after 2 epochs without val_loss improvement, and
# stop after 5 such epochs, restoring the best weights seen.
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2),
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
]
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=validation_generator,
    callbacks=callbacks,  # previously defined but never passed to fit()
    class_weight=class_weights  # compensates for class imbalance
)

In [None]:
# Google Drive folder holding the images to classify after training.
drive_test_pure_dir = f"{base_dir}test_pure_classification"

In [None]:
def predict_image(model, image_path, target_size=(224, 224)):
    """Return the model's sigmoid output for a single image file.

    The image goes through the same steps the training generator applies:
    resize to ``target_size``, ``preprocess_image``, then 1/255 rescaling.

    Args:
        model: Trained Keras model with a single sigmoid output.
        image_path: Path of the image file to classify.
        target_size: (height, width) the image is resized to.

    Returns:
        The model output for the image as a Python float.
    """
    pil_image = image.load_img(image_path, target_size=target_size)
    pixels = preprocess_image(image.img_to_array(pil_image)) * (1. / 255)
    # The model expects a batch dimension, so wrap the single image.
    scores = model.predict(np.expand_dims(pixels, axis=0), verbose=0)
    return float(scores[0][0])


# Run the model on every file in the test folder and report those scoring
# above the 0.5 decision threshold.
for item in os.listdir(drive_test_pure_dir):
    item_path = os.path.join(drive_test_pure_dir, item)
    if os.path.isfile(item_path):
        prediction = predict_image(model, item_path)
        if prediction > 0.5:
            print(f"{item_path} Prediction:", prediction)

### 模型预测

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Take one batch from the validation generator and predict on it.
test_images, test_labels = next(validation_generator)
predictions = model.predict(test_images)

# Show up to nine images in a 3x3 grid. A batch can hold fewer than nine
# images (e.g. the final partial batch), so cap the count to avoid IndexError.
shown_count = min(9, len(test_images))
plt.figure(figsize=(10, 10))
for i in range(shown_count):
    plt.subplot(3, 3, i + 1)
    plt.imshow(test_images[i])
    # Scores above 0.5 and labels equal to 1 are displayed as "pure".
    pred = "pure" if predictions[i] > 0.5 else "np"
    true = "pure" if test_labels[i] == 1 else "np"
    plt.title(f"predict: {pred}\nTrue: {true}")
    plt.axis('off')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Collect labels and predictions batch by batch so each prediction is paired
# with the label of the same image. The generator was created without
# shuffle=False, so it yields samples in shuffled order (the Keras default);
# comparing model.predict(generator) against generator.classes, which is in
# file order, would therefore mismatch predictions and labels.
true_batches = []
pred_batches = []
for batch_index in range(len(validation_generator)):
    batch_images, batch_labels = validation_generator[batch_index]
    batch_scores = model.predict(batch_images, verbose=0)
    pred_batches.append((batch_scores > 0.5).astype(int).ravel())
    true_batches.append(batch_labels.astype(int))
y_true = np.concatenate(true_batches)
y_pred = np.concatenate(pred_batches)

# Plot the confusion matrix (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('预测')
plt.ylabel('真实')
plt.show()

预测结果很不理想，最主要的原因可能是数据集的标签本身存在问题。