In [1]:
import os
import torch
import timm
#os.system("pip install --upgrade timm")
import pandas as pd
import torch.nn as nn
from torch.optim import AdamW
import numpy as np
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from timm.models import create_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from PIL import Image
from timm.data import Mixup
from torchvision.transforms import RandomErasing

In [2]:
# 檢查是否有可用的 GPU，否則使用 CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
# ============================
# 2. 自定義數據集 (讀取訓練數據)
# ============================
# 定義數據集路徑
num_classes = 15
IMAGE_SIZE = 336
BATCH_SIZE = 8
IMAGE_DIR = "/kaggle/input/2024-deep-learning-final-project/train_images"
CSV_PATH = "/kaggle/input/2024-deep-learning-final-project/train_set.csv"
OUTPUT_DIR = "/kaggle/working/"
# 讀取 CSV 文件
data_df = pd.read_csv(CSV_PATH)

# 添加圖片完整路徑
data_df['file_path'] = data_df['filename'].apply(lambda x: os.path.join(IMAGE_DIR, x))

# 創建 Label 字典
unique_labels = data_df[['label']].drop_duplicates().sort_values('label').reset_index(drop=True)
label_map = {label: idx for idx, label in enumerate(unique_labels['label'].unique())}  # 類別 -> 數字
reverse_label_map = {v: k for k, v in label_map.items()}  # 數字 -> 類別

# 替換 DataFrame 中的文字標籤為數字
data_df['label'] = data_df['label'].map(label_map)

# 打印 Label 字典
print("label 字典:")
print(label_map)

# 分割數據集
train_df, val_df = train_test_split(
    data_df, test_size=0.1, random_state=42, stratify=data_df['label']
)

# 自定義數據集類別
class HARImageDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image = Image.open(row['file_path']).convert("RGB")
        label = row['label']
        if self.transform:
            image = self.transform(image)
        return image, label

# 數據增強與處理
train_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),   ###### 20->30
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.RandomResizedCrop(size=IMAGE_SIZE, scale=(0.7, 1.0)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]), # For eva02_enormous_patch14_clip_224 #####################
    RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3))
])

val_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #####################
])

# 創建數據集與 DataLoader
train_dataset = HARImageDataset(train_df, transform=train_transform)
val_dataset = HARImageDataset(val_df, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
# No phone: cycling dancing

label 字典:
{'calling': 0, 'clapping': 1, 'cycling': 2, 'dancing': 3, 'drinking': 4, 'eating': 5, 'fighting': 6, 'hugging': 7, 'laughing': 8, 'listening_to_music': 9, 'running': 10, 'sitting': 11, 'sleeping': 12, 'texting': 13, 'using_laptop': 14}


In [4]:
mixup_fn = Mixup(
    mixup_alpha=0.2,       # MixUp 混合強度
    cutmix_alpha=0.6,      # CutMix 混合強度
    prob=0.7,              # 應用 MixUp/CutMix 的概率
    switch_prob=0.5,       # MixUp 和 CutMix 之間的切換概率
    label_smoothing=0.15,   # Label smoothing
    num_classes=15
)

In [5]:
# ============================
# 3. 模型定義
# ============================
# swin_large_patch4_window7_224        : 91.16%
# swin_large_patch4_window12_384       : 91.56%
# vit_large_patch16_224                : 90.60% 
# swinv2_large_window12to16_192to256   : 91%
# convnext_large.fb_in22k_ft_in1k_384  : 90.12%
# convnext_xlarge.fb_in22k_ft_in1k_384 : 90.92%
# beitv2_large_patch16_224             : 90.8%
model_name = "eva02_large_patch14_clip_336"         

# 創建模型
model = create_model(model_name, pretrained=True, num_classes=num_classes)          ############# False
model = model.to(device)

# 查看預訓練的配置
pretrained_cfg = model.pretrained_cfg
print(pretrained_cfg)

# ============================
# 4. 損失函數與優化器
# ============================
criterion = nn.CrossEntropyLoss(label_smoothing=0.2)
optimizer = AdamW(model.parameters(), lr=2e-6, weight_decay=1e-3)      # swin_large_patch4_window7_224 : 5e-5
scheduler = CosineAnnealingLR(optimizer, T_max=25)

open_clip_model.safetensors:   0%|          | 0.00/856M [00:00<?, ?B/s]

{'url': '', 'hf_hub_id': 'timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k', 'hf_hub_filename': 'open_clip_pytorch_model.bin', 'architecture': 'eva02_large_patch14_clip_336', 'tag': 'merged2b', 'custom_load': False, 'input_size': (3, 336, 336), 'fixed_input_size': True, 'interpolation': 'bicubic', 'crop_pct': 1.0, 'crop_mode': 'center', 'mean': (0.48145466, 0.4578275, 0.40821073), 'std': (0.26862954, 0.26130258, 0.27577711), 'num_classes': 768, 'pool_size': None, 'first_conv': 'patch_embed.proj', 'classifier': 'head', 'license': 'mit'}


In [7]:
# ============================
# 5. 訓練與驗證
# ============================
epochs = 10
best_accuracy = 0.0

for epoch in range(epochs):
    # 訓練模式
    model.train()
    train_loss = 0.0
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        images, labels = images.to(device), labels.to(device)

        # 應用 MixUp 或 CutMix
        if mixup_fn is not None:
            images, labels = mixup_fn(images, labels)

        # 前向傳播
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # 驗證模式
    model.eval()
    val_loss = 0.0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    # 計算驗證集準確率
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, Val Loss: {val_loss / len(val_loader):.4f}, Accuracy: {accuracy:.4f}")

    # 儲存最佳模型
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "best.pth"))
        print(f"Saved Best Model with Accuracy: {best_accuracy:.4f}")

    # 調整學習率
    scheduler.step()

print("訓練完成，最佳模型已保存！")

Epoch 1 Training:   0%|          | 0/1137 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# =============================================================================
#                                微調模型
# =============================================================================
epochs = 10
best_accuracy = 0.0

MODEL_NAME = "eva02_large_patch14_clip_336"
MODEL_PATH = "/kaggle/input/eva_336_0.9334/pytorch/default/1/eva336_0.933.pth"
model = create_model(MODEL_NAME, pretrained=False, num_classes=15)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device, weights_only=True))
model = model.to(device)

# ============================
# 1. 損失函數與優化器
# ============================
criterion = nn.CrossEntropyLoss(label_smoothing=0.15)
optimizer = AdamW(model.parameters(), lr=5e-7, weight_decay=8e-4)      # swin_large_patch4_window7_224 : 5e-5
scheduler = CosineAnnealingLR(optimizer, T_max=25)

for epoch in range(epochs):
    # 訓練模式
    model.train()
    train_loss = 0.0
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        images, labels = images.to(device), labels.to(device)

        # 應用 MixUp 或 CutMix
        if mixup_fn is not None:
            images, labels = mixup_fn(images, labels)

        # 前向傳播
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # 驗證模式
    model.eval()
    val_loss = 0.0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(predicted.cpu().numpy())

    # 計算驗證集準確率
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}, Val Loss: {val_loss / len(val_loader):.4f}, Accuracy: {accuracy:.4f}")

    # 儲存最佳模型
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "best.pth"))
        print(f"Saved Best Model with Accuracy: {best_accuracy:.4f}")

    # 調整學習率
    scheduler.step()

print("訓練完成，最佳模型已保存！")

Epoch 1 Training:   0%|          | 0/1137 [00:00<?, ?it/s]

Epoch 1 Validation:   0%|          | 0/127 [00:00<?, ?it/s]

Epoch 1/10, Train Loss: 1.5002, Val Loss: 0.9209, Accuracy: 0.9525
Saved Best Model with Accuracy: 0.9525


Epoch 2 Training:   0%|          | 0/1137 [00:00<?, ?it/s]

Epoch 2 Validation:   0%|          | 0/127 [00:00<?, ?it/s]

Epoch 2/10, Train Loss: 1.5110, Val Loss: 0.9222, Accuracy: 0.9495


Epoch 3 Training:   0%|          | 0/1137 [00:00<?, ?it/s]

Epoch 3 Validation:   0%|          | 0/127 [00:00<?, ?it/s]

Epoch 3/10, Train Loss: 1.4939, Val Loss: 0.9220, Accuracy: 0.9515


Epoch 4 Training:   0%|          | 0/1137 [00:00<?, ?it/s]

In [9]:
# =============================================================================
#                                單一模型預測
# =============================================================================
# ============================
# 1. 設置參數
# ============================
TEST_DIR = "/kaggle/input/2024-deep-learning-final-project/test_images"
OUTPUT_FILE = "/kaggle/working/team_18_submission.csv"
MODEL_PATH = "/kaggle/working/best.pth"  # 已訓練的模型檢查點
#model_name = "eva02_large_patch14_clip_336"

# ============================
# 2. 數據增強
# ============================
test_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),  # 根據模型的輸入要求
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])

# ============================
# 3. 加載模型
# ============================
print("Loading model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 創建與訓練時相同的模型
num_classes = len(reverse_label_map)  # 輸出類別數
model = create_model(model_name, pretrained=False, num_classes=num_classes)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device, weights_only=True))
model = model.to(device)
model.eval()

# ============================
# 4. 加載測試數據
# ============================
# 獲取測試圖片列表
test_images = sorted(os.listdir(TEST_DIR))
test_data = []

for img_name in test_images:
    img_path = os.path.join(TEST_DIR, img_name)
    img = Image.open(img_path).convert("RGB")
    img = test_transform(img)
    test_data.append((img_name, img))

# ============================
# 5. 預測
# ============================
print("Predicting")
results = []
with torch.no_grad():
    for img_name, img_tensor in tqdm(test_data, desc = "Predicting
    "):
        img_tensor = img_tensor.unsqueeze(0).to(device)  # 增加批次維度
        output = model(img_tensor)
        pred = torch.argmax(output, dim=1).item()  # 獲取預測類別索引
        label = reverse_label_map[pred]  # 將索引映射為類別名稱
        results.append({"filename": img_name.replace(".jpg", ""), "label": label})

# ============================
# 6. 保存為 CSV 文件
# ============================
submission_df = pd.DataFrame(results)
submission_df.to_csv(OUTPUT_FILE, index=False)

print(f"預測結果已保存到 {OUTPUT_FILE}")

Loading model
Predicting
預測結果已保存到 /kaggle/working/team_18_submission.csv


In [9]:
# ==========================================================================================================================
#                                                            Ensemble
# ==========================================================================================================================

# ============================
# 1. 加載模型
# ============================
TEST_DIR = "/kaggle/input/2024-deep-learning-final-project/test_images"
print("Loading models")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model(model_name, model_path):
    model = create_model(model_name, pretrained=False, num_classes=len(reverse_label_map))
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()
    return model

# 模型路徑與加載
MODEL1_PATH = "/kaggle/input/eva_336_0.9304/pytorch/default/1/eva02_336_0.9304.pth"
MODEL2_PATH = "/kaggle/input/eva02_224_0.924/pytorch/default/1/best.pth"
MODEL3_PATH = "/kaggle/input/eva_large_patch14_336_best/pytorch/default/1/eva_large_patch14_336_best.pth"
# eva02_large_patch14_448
model1 = load_model("eva02_large_patch14_clip_336", MODEL1_PATH)
model2 = load_model("eva02_large_patch14_clip_224", MODEL2_PATH) 
model3 = load_model("eva_large_patch14_336", MODEL3_PATH)

IMAGE_SIZE1 = 336
IMAGE_SIZE2 = 224
IMAGE_SIZE3 = 336
# ============================
# 2. 數據增強
# ============================
test_transform_model1 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE1, IMAGE_SIZE1)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])

test_transform_model2 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE2,IMAGE_SIZE2)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])

test_transform_model3 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE3,IMAGE_SIZE3)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])

# ============================
# 3. 加載測試數據
# ============================
print("Preparing test data")
test_images = sorted(os.listdir(TEST_DIR))
test_data = []

for img_name in test_images:
    img_path = os.path.join(TEST_DIR, img_name)
    img = Image.open(img_path).convert("RGB")
    
    # 針對三個模型生成不同尺寸的圖像張量
    img_tensor1 = test_transform_model1(img)
    img_tensor2 = test_transform_model2(img)
    img_tensor3 = test_transform_model3(img)
    
    #test_data.append((img_name, img_tensor1, img_tensor2))
    test_data.append((img_name, img_tensor1, img_tensor2, img_tensor3))

# ============================
# 4. Ensemble 預測
# ============================
print("Predicting with ensemble")
results = []
with torch.no_grad():
    for img_name, img_tensor1, img_tensor2 , img_tensor3 in tqdm(test_data, desc = "ensemble"):
        # 增加 Batch 維度
        img_tensor1 = img_tensor1.unsqueeze(0).to(device)
        img_tensor2 = img_tensor2.unsqueeze(0).to(device)
        img_tensor3 = img_tensor3.unsqueeze(0).to(device)
        
        # 獲取模型輸出
        output1 = model1(img_tensor1)
        output2 = model2(img_tensor2)
        output3 = model3(img_tensor3)
        
        # Soft Voting: 加權平均三個模型的輸出分數
        ensemble_output = output1 * 0.75 + output2 * 0.125 + output3 * 0.125
        
        # 預測最終類別
        pred = torch.argmax(ensemble_output, dim=1).item()
        label = reverse_label_map[pred]
        
        results.append({"filename": img_name.replace(".jpg", ""), "label": label})

# ============================
# 5. 保存為 CSV 文件
# ============================
OUTPUT_FILE = "/kaggle/working/team_18_submission_ensemble.csv"
submission_df = pd.DataFrame(results)
submission_df.to_csv(OUTPUT_FILE, index=False)

print(f"Ensemble 預測結果已保存到 {OUTPUT_FILE}")

Loading models


  model.load_state_dict(torch.load(model_path, map_location=device))


Preparing test data
Predicting with ensemble


ensemble:   0%|          | 0/2500 [00:00<?, ?it/s]

Ensemble 預測結果已保存到 /kaggle/working/team_18_submission_ensemble.csv


In [21]:
# ==========================================================================================================================
#                                                            TTA
# ==========================================================================================================================

# ============================
# 1. 設置參數
# ============================
TEST_DIR = "/kaggle/input/2024-deep-learning-final-project/test_images"
OUTPUT_FILE = "/kaggle/working/team_18_submissionTTA.csv"

MODEL_PATH = "/kaggle/input/eva_336_0.9334/pytorch/default/1/eva336_0.933.pth"  # 已訓練的模型檢查點
IMAGE_SIZE = 336  # 輸入圖片大小
model_name = "eva02_large_patch14_clip_336"  # 使用的模型名稱

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================
# 2. 測試數據增強 (TTA)
# ============================
print("Setting up TTA transforms")

tta_transforms = [
    transforms.Compose([
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
    ]),
    transforms.Compose([
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.RandomHorizontalFlip(p=1.0),  # 水平翻轉
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
    ]),
    transforms.Compose([
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),  # 輕量顏色增強
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
    ]),
    transforms.Compose([
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.RandomRotation(degrees=10),  # 旋轉 10 度
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
    ])
]

# ============================
# 3. 加載模型
# ============================
print("Loading model")
num_classes = 15  # 類別數量
model = create_model(model_name, pretrained=False, num_classes=num_classes)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device), strict=False)
model = model.to(device)
model.eval()

# ============================
# 4. 加載測試數據
# ============================
print("Loading test data")
test_images = sorted(os.listdir(TEST_DIR))
test_data = []

for img_name in test_images:
    img_path = os.path.join(TEST_DIR, img_name)
    img = Image.open(img_path).convert("RGB")
    test_data.append((img_name, img))

# ============================
# 5. TTA 預測
# ============================
print("Predicting with TTA")
results = []

with torch.no_grad():
    for img_name, img in tqdm(test_data, desc = "TTA"):
        outputs = []
        for transform in tta_transforms:
            img_tensor = transform(img).unsqueeze(0).to(device)  # 應用增強並增加批次維度
            output = model(img_tensor)
            outputs.append(output)

        # 平均 TTA 輸出的結果
        avg_output = torch.mean(torch.stack(outputs), dim=0)
        pred = torch.argmax(avg_output, dim=1).item()  # 獲取最終預測類別索引
        label = reverse_label_map[pred]
        
        results.append({"filename": img_name.replace(".jpg", ""), "label": label})

# ============================
# 6. 保存為 CSV 文件
# ============================
print("Saving predictions")
submission_df = pd.DataFrame(results)
submission_df.to_csv(OUTPUT_FILE, index=False)
print(f"預測結果已保存到 {OUTPUT_FILE}")

Setting up TTA transforms
Loading model


  model.load_state_dict(torch.load(MODEL_PATH, map_location=device), strict=False)


Loading test data
Predicting with TTA


TTA:   0%|          | 0/2500 [00:00<?, ?it/s]

Saving predictions
預測結果已保存到 /kaggle/working/team_18_submissionTTA.csv


In [None]:
# ==========================================================================================================================
#                                                            Ensemble + TTA(效益有限))
# ==========================================================================================================================

# ============================
# 1. 加載模型
# ============================
# 參數設置
TEST_DIR = "/kaggle/input/2024-deep-learning-final-project/test_images"
OUTPUT_FILE = "/kaggle/working/team_18_submissionTTA.csv"

MODEL_PATH_1 = "/kaggle/input/eva_336_0.9304/pytorch/default/1/eva02_336_0.9304.pth"
MODEL_PATH_2 = "/kaggle/input/eva02_224_best/pytorch/default/1/eva02_224_best.pth"
MODEL_PATH_3 = "/kaggle/input/eva02_448/pytorch/default/1/eva02_448.pth"

MODEL_NAME1= "eva02_large_patch14_clip_336"
MODEL_NAME2= "eva02_large_patch14_clip_224"
MODEL_NAME3= "eva02_large_patch14_448"

IMAGE_SIZE_1 = 336  # 模型1的輸入尺寸
IMAGE_SIZE_2 = 224  # 模型2的輸入尺寸
IMAGE_SIZE_3 = 448  # 模型3的輸入尺寸
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model(model_name, model_path):
    model = create_model(model_name, pretrained=False, num_classes=len(reverse_label_map))
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()
    return model
    
print("Load model......")

# 模型路徑與加載
MODEL1_PATH = "/kaggle/input/eva_336_0.9304/pytorch/default/1/eva02_336_0.9304.pth"
MODEL2_PATH = "/kaggle/input/eva02_224_best/pytorch/default/1/eva02_224_best.pth"
MODEL3_PATH = "/kaggle/input/eva02_448/pytorch/default/1/eva02_448.pth"

# ============================
# 2. 數據增強
# ============================
test_transform_model1_1 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_1, IMAGE_SIZE_1)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])
test_transform_model1_2 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_1, IMAGE_SIZE_1)),
    transforms.RandomHorizontalFlip(p=1.0),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])
test_transform_model1_3 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_1, IMAGE_SIZE_1)),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])

test_transform_model2_1 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_2, IMAGE_SIZE_2)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])
test_transform_model2_2 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_2, IMAGE_SIZE_2)),
    transforms.RandomHorizontalFlip(p=1.0),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])
test_transform_model2_3 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_2, IMAGE_SIZE_2)),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])

test_transform_model3_1 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_3, IMAGE_SIZE_3)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])
test_transform_model3_2 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_3, IMAGE_SIZE_3)),
    transforms.RandomHorizontalFlip(p=1.0),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])
test_transform_model3_3 = transforms.Compose([
    transforms.Resize((IMAGE_SIZE_3, IMAGE_SIZE_3)),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]) # For eva02_enormous_patch14_clip_224 #######
])

print("done")

In [5]:
# =============================================================================
#                                分析正確率
# =============================================================================

from sklearn.metrics import confusion_matrix
MODEL_PATH = "/kaggle/input/convnextv2_large/pytorch/default/1/best.pth"  # 已訓練的模型檢查點
IMAGE_SIZE = 384  # 輸入圖片大小
model_name = "convnextv2_large"  # 使用的模型名稱
print("Loading model")

num_classes = 15  # 類別數量
model = create_model(model_name, pretrained=False, num_classes=num_classes)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device), strict=False)
model = model.to(device)
model.eval()

def evaluate_model(model, dataloader, device):
    model.eval()  # 設置為驗證模式
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return np.array(all_preds), np.array(all_labels)

# 推理並計算預測值和真實值
preds, labels = evaluate_model(model, val_loader, device)

# 計算混淆矩陣
conf_matrix = confusion_matrix(labels, preds)

# 計算每個類別的準確率
class_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

# 打印每個類別的準確率
print("\n各類別正確率:")
class_accuracy_dict = {}
for class_idx, accuracy in enumerate(class_accuracy):
    class_name = reverse_label_map[class_idx]
    class_accuracy_dict[class_name] = accuracy
    print(f"類別 {class_name}: {accuracy:.2%}")

Loading model


  model.load_state_dict(torch.load(MODEL_PATH, map_location=device), strict=False)



各類別正確率:
類別 calling: 95.52%
類別 clapping: 95.59%
類別 cycling: 98.48%
類別 dancing: 92.42%
類別 drinking: 92.75%
類別 eating: 97.10%
類別 fighting: 92.31%
類別 hugging: 97.10%
類別 laughing: 86.36%
類別 listening_to_music: 88.06%
類別 running: 97.06%
類別 sitting: 86.76%
類別 sleeping: 100.00%
類別 texting: 86.36%
類別 using_laptop: 95.59%
