In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
from DL_Models import DLModels
from logging_config import setup_logger
from tools import save_json, load_json



In [2]:
# Custom dataset that serves tabular feature rows as image-shaped tensors
class TabularDataset(Dataset):
    """Wrap a feature table and a label table as a PyTorch ``Dataset``.

    Each item is a ``(features, labels)`` pair of float32 tensors. The feature
    row is reshaped and tiled three times along a new leading axis so it can be
    fed to models that expect a 3-channel ``(C, H, W)`` input.

    Args:
        X: Feature table; only its ``.values`` array is stored.
        Y: Label table; only its ``.values`` array is stored.
        transform: Optional callable applied to each feature tensor.
    """

    def __init__(self, X, Y, transform=None):
        self.transform = transform
        self.X = X.values
        self.Y = Y.values

    def __len__(self):
        """Return the number of rows in the feature array."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as ``(feature_tensor, label_tensor)``."""
        row = torch.tensor(self.X[idx], dtype=torch.float32)
        # For a 1-D row of n features: (n,) -> (n, 1) -> (3, n, 1)
        image = row.unsqueeze(1).repeat(3, 1, 1)
        if self.transform:
            image = self.transform(image)
        label = torch.tensor(self.Y[idx], dtype=torch.float32)
        return image, label

# Build the input transform used for pretrained-model style inputs
def transform_input(X):
    """Return a torchvision transform that normalises a 3-channel tensor.

    Every channel is normalised with mean 0.5 and std 0.5.

    Args:
        X: Not used by this function; kept so existing callers keep working.

    Returns:
        A ``transforms.Compose`` containing a single ``transforms.Normalize`` step.
    """
    channel_mean = [0.5] * 3
    channel_std = [0.5] * 3
    normalize = transforms.Normalize(mean=channel_mean, std=channel_std)
    return transforms.Compose([normalize])

# Build the dataset and the train/test data loaders
def create_data_loaders(X, Y, batch_size=1024, train_frac=0.8, seed=None):
    """Split the data randomly into train/test parts and wrap each in a DataLoader.

    Args:
        X: Feature table passed to ``TabularDataset``.
        Y: Label table passed to ``TabularDataset``.
        batch_size: Batch size used by both loaders.
        train_frac: Fraction of samples assigned to the training split
            (must be strictly between 0 and 1). The default keeps the
            previous 80/20 split.
        seed: Optional integer seed for the random split. ``None`` (default)
            keeps the previous behaviour of using PyTorch's global RNG, so the
            split differs between runs; pass an int for a reproducible split.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.

    Raises:
        ValueError: If ``train_frac`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_frac < 1.0:
        raise ValueError(f"train_frac must be in (0, 1), got {train_frac}")

    dataset = TabularDataset(X, Y)
    train_size = int(train_frac * len(dataset))
    test_size = len(dataset) - train_size

    # Only pass a generator when a seed was requested, so the unseeded call is
    # exactly the original random_split call.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, test_size], **split_kwargs
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader


# Read the dataset from CSV
def load_data(file_path, feature_start=1, label_start=38):
    """Read a CSV file and split its columns into features and labels by position.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.
        feature_start: Positional index of the first feature column; columns
            before it are dropped. Defaults to 1, matching the original layout.
        label_start: Positional index of the first label column. Features are
            columns ``feature_start`` up to (not including) ``label_start``;
            labels are every column from ``label_start`` onwards. Defaults to 38.

    Returns:
        ``(X, Y)`` as two DataFrames: features and labels.
    """
    df = pd.read_csv(file_path)
    X = df.iloc[:, feature_start:label_start]
    Y = df.iloc[:, label_start:]
    return X, Y

In [3]:
# Custom loss: BCE-with-logits with a positive-class weight, for imbalanced labels
class CustomBCELoss(nn.Module):
    """Binary cross-entropy on logits with a positive-class weight.

    Args:
        pos_weight: Tensor (or array-like convertible by ``torch.as_tensor``)
            holding the weight applied to positive targets; it is forwarded
            unchanged to ``binary_cross_entropy_with_logits``.
    """

    def __init__(self, pos_weight):
        super(CustomBCELoss, self).__init__()
        # Register as a buffer (not a plain attribute) so that module-level
        # conversions such as ``criterion.to(device)`` also move the weight.
        # With a plain attribute the weight stayed on its original device and
        # forward() pulled every batch back to that device.
        self.register_buffer("pos_weight", torch.as_tensor(pos_weight))

    def forward(self, outputs, targets):
        """Return the weighted BCE-with-logits loss of ``outputs`` against ``targets``.

        Args:
            outputs: Raw model outputs (logits).
            targets: Target tensor, same shape as ``outputs``.
        """
        # Still align devices here so the loss also works when the caller never
        # moved the criterion; this is a no-op once everything shares a device.
        weight_device = self.pos_weight.device
        outputs = outputs.to(weight_device)
        targets = targets.to(weight_device)
        return nn.functional.binary_cross_entropy_with_logits(
            outputs, targets, pos_weight=self.pos_weight
        )

In [4]:

# One training pass over the data
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to train; switched to training mode.
        train_loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
        criterion: Callable returning the loss for ``(outputs, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The sum of per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for batch_inputs, batch_targets in tqdm(train_loader, desc="Training"):
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(train_loader)

# One evaluation pass over the held-out data
def validate_epoch(model, test_loader, criterion, device):
    """Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Predictions are obtained by applying a sigmoid to the outputs and rounding;
    accuracy is counted element-wise over every entry of the target tensors.

    Args:
        model: Module to evaluate; switched to eval mode.
        test_loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
        criterion: Callable returning the loss for ``(outputs, targets)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(val_loss, val_accuracy)``: the mean per-batch loss and the fraction of
        target elements predicted correctly.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_elements = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in tqdm(test_loader, desc="Validation"):
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_targets).item()

            hard_preds = torch.sigmoid(logits).round()
            n_correct += hard_preds.eq(batch_targets).sum().item()
            n_elements += batch_targets.numel()
    return loss_sum / len(test_loader), n_correct / n_elements

# Full training loop: uses CUDA when available, shows tqdm progress bars
def train_model(model, train_loader, test_loader, criterion, optimizer, logger, model_name, epochs=10):
    """Train ``model`` for ``epochs`` epochs and checkpoint the best one.

    After every epoch the model is validated; whenever the validation accuracy
    exceeds the best value seen so far, the model's ``state_dict`` is saved to
    ``DL3_output/{model_name}_best_model_DL3.pth``.

    Args:
        model: Module to train; moved to CUDA if available, otherwise CPU.
        train_loader: Loader passed to ``train_epoch``.
        test_loader: Loader passed to ``validate_epoch``.
        criterion: Loss module; also moved to the selected device.
        optimizer: Optimizer over ``model``'s parameters.
        logger: Logger whose ``info`` method receives the per-epoch summary.
        model_name: Name used in the checkpoint file name.
        epochs: Number of epochs to run.

    Returns:
        ``(train_losses, val_losses, val_accuracies)``: one entry per epoch each.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = criterion.to(device)

    train_losses = []
    val_losses = []
    val_accuracies = []

    best_val_accuracy = 0.0
    best_model_path = f'DL3_output/{model_name}_best_model_DL3.pth'

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_accuracy = validate_epoch(model, test_loader, criterion, device)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        logger.info(f'Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

        # Save the best model so far
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # Make sure the output directory exists; otherwise torch.save fails
            # and the epoch's training work is lost.
            os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
            torch.save(model.state_dict(), best_model_path)
            logger.info(f'Best model saved with accuracy: {best_val_accuracy:.4f}')
    return  train_losses, val_losses, val_accuracies

In [5]:
# Load the data
X, Y = load_data('trin_df.csv')  # replace with the path to your dataset
epochs = 800
# Build the data loaders
batch_size = 1024
train_loader, test_loader = create_data_loaders(X, Y, batch_size)

# Define the candidate models, keyed by name.
# NOTE: deliberately not named `models` -- that name is already bound to the
# `torchvision.models` module by the import cell and would be shadowed.
output_dim = Y.shape[1]
dl_models = {'resnet18':DLModels.resnet18(output_dim),
          'resnet50':DLModels.resnet50(output_dim),
        #   'vgg16':DLModels.vgg16(output_dim),
        #   'densenet121':DLModels.densenet121(output_dim),
        #  'mobilenet_v2':DLModels.mobilenet_v2(output_dim)
        }

logger = setup_logger(rf'DL3_output/DL3.log')

# Per-label positive weight = (#rows - #positives) / #positives, one value per
# column of Y. It does not depend on the model, so it is computed once here
# instead of on every loop iteration. Converting through .to_numpy() hands
# torch a plain array rather than a pandas Series (the recorded cell output
# shows a warning pointing at the original Series-based line).
positives_per_label = Y.sum()
pos_weight = torch.tensor(
    ((Y.shape[0] - positives_per_label) / positives_per_label).to_numpy(),
    dtype=torch.float32,
)

for model_name, model in dl_models.items():
    results = {}

    # Loss function and optimizer (fresh for every model)
    criterion = CustomBCELoss(pos_weight=pos_weight)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    train_losses, val_losses, val_accuracies = train_model(model, train_loader, test_loader, criterion, optimizer, logger,model_name, epochs)

    results[model_name] = (train_losses, val_losses, val_accuracies)

    # One results file per model
    save_json(results,rf'DL3_output/DL3_{model_name}_results.json')

  pos_weight = torch.tensor((Y.shape[0] - Y.sum()) / Y.sum(), dtype=torch.float32)
Training: 100%|██████████| 189/189 [02:00<00:00,  1.57it/s]
Validation: 100%|██████████| 48/48 [00:19<00:00,  2.44it/s]
2024-10-31 21:46:40,369 - logging_config - INFO - Epoch [1/800], Train Loss: 1.3871, Val Loss: 1.3905, Val Accuracy: 0.5873
2024-10-31 21:46:40,630 - logging_config - INFO - Best model saved with accuracy: 0.5873
Training: 100%|██████████| 189/189 [01:51<00:00,  1.70it/s]
Validation: 100%|██████████| 48/48 [00:21<00:00,  2.19it/s]
2024-10-31 21:48:53,708 - logging_config - INFO - Epoch [2/800], Train Loss: 1.4011, Val Loss: 1.3881, Val Accuracy: 0.6323
2024-10-31 21:48:54,020 - logging_config - INFO - Best model saved with accuracy: 0.6323
Training: 100%|██████████| 189/189 [01:54<00:00,  1.65it/s]
Validation: 100%|██████████| 48/48 [00:20<00:00,  2.40it/s]
2024-10-31 21:51:08,752 - logging_config - INFO - Epoch [3/800], Train Loss: 1.3701, Val Loss: 1.3944, Val Accuracy: 0.6343
2024-10

KeyboardInterrupt: 

In [3]:

# Visualise the training and validation results
def plot_metrics(epochs, train_losses, val_losses, val_accuracies):
    """Show the loss curves and the validation-accuracy curve side by side.

    Args:
        epochs: Number of epochs; the x-axis runs from 1 to ``epochs`` inclusive.
        train_losses: Per-epoch training losses.
        val_losses: Per-epoch validation losses.
        val_accuracies: Per-epoch validation accuracies.
    """
    epoch_axis = range(1, epochs + 1)
    plt.figure(figsize=(12, 5))

    # Left panel: training and validation loss
    plt.subplot(1, 2, 1)
    loss_curves = ((train_losses, 'Train Loss'), (val_losses, 'Validation Loss'))
    for curve, curve_label in loss_curves:
        plt.plot(epoch_axis, curve, label=curve_label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Train and Validation Loss')
    plt.legend()

    # Right panel: validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(epoch_axis, val_accuracies, label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Validation Accuracy')
    plt.legend()

    plt.show()

In [4]:
# Plot the histories produced by the training cell. `epochs`, `train_losses`,
# `val_losses` and `val_accuracies` must already exist in the kernel; the
# recorded run of this cell raised NameError because `epochs` was not defined.
plot_metrics(epochs, train_losses, val_losses, val_accuracies)

NameError: name 'epochs' is not defined

自定义的损失函数 `CustomBCELoss` 使用的是带有权重的二元交叉熵损失 (`binary_cross_entropy_with_logits`) 来处理数据不平衡问题。它的设计和工作原理如下：

### 数据不平衡问题
在分类任务中，尤其是二分类任务中，如果正样本和负样本的数量差异很大（即数据不平衡），标准的损失函数可能会导致模型偏向于预测数量较多的类别，从而忽略数量较少的类别。这种情况会导致模型在测试集上对少数类别的预测效果较差。

### 如何处理数据不平衡
`CustomBCELoss` 通过引入 **正样本权重 (pos_weight)** 来处理数据不平衡问题。`pos_weight` 是一个权重参数，用于在计算交叉熵损失时增加正样本的损失贡献，从而平衡正负样本对总损失的影响。

具体而言：
- 当样本中的正负类别数量差异较大时，通过增加正样本的权重，使得正样本的错误预测对总损失的影响变大，从而引导模型更加重视正样本的正确分类。
- 权重的计算方式为：`pos_weight = (总样本数 - 正样本数) / 正样本数`，即负样本数与正样本数之比；在本笔记本中，该比值是对标签 `Y` 的每一列（每个标签）分别计算的。

### 实现原理
1. **初始化时设置权重**：
   在 `CustomBCELoss` 的初始化函数中，接受 `pos_weight` 参数：
   ```python
   class CustomBCELoss(nn.Module):
       def __init__(self, pos_weight):
           super(CustomBCELoss, self).__init__()
           self.pos_weight = pos_weight
   ```
   `pos_weight` 是一个张量，它的值根据正负样本比例计算得到；在多标签任务中，每个标签对应其中的一个权重值。

2. **前向传播时使用 `binary_cross_entropy_with_logits`**：
   在前向传播中使用 PyTorch 提供的 `binary_cross_entropy_with_logits` 函数来计算损失：
   ```python
   def forward(self, outputs, targets):
       outputs, targets = outputs.to(self.pos_weight.device), targets.to(self.pos_weight.device)
       loss = nn.functional.binary_cross_entropy_with_logits(outputs, targets, pos_weight=self.pos_weight)
       return loss
   ```
   - `binary_cross_entropy_with_logits` 计算二元交叉熵损失，并且可以直接接受 logits（未经过 sigmoid 的原始输出），这样可以提高数值稳定性。
   - `pos_weight` 参数用于给正样本的损失赋予额外的权重，使得正样本的损失更大，以此来平衡正负样本。

### 损失函数的作用
`pos_weight` 的作用是在损失函数中对正样本赋予额外的权重。这意味着：
- **如果 `pos_weight > 1`**：表示正样本在数据集中相对稀少，因此我们要增加它的损失权重，以使模型更加重视正样本。
- **如果 `pos_weight = 1`**：表示数据集中的正负样本数量大致相等，此时不会对损失进行任何额外的加权。
- **如果 `pos_weight < 1`**：这意味着正样本的数量多于负样本（这种情况较为罕见），可以降低正样本的损失权重。

这种方法在不平衡数据集上训练分类模型时，可以有效地减少类别偏差问题，提高模型对少数类的识别能力。

### 总结
自定义的损失函数 `CustomBCELoss` 通过对正样本的损失加权来缓解样本不平衡的问题，使得模型在训练时对少数类别的样本更加重视，从而有望提高在不平衡数据集上的性能。权重的引入有助于避免模型因为多数类的压倒性数量而忽略少数类，从而使得分类结果更加平衡。