<a href="https://colab.research.google.com/github/jh941213/dacon_DoBaeClassification/blob/main/EfficientNet_v2%EB%8F%84%EB%B0%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import re
import glob
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore') 

In [None]:
!curl -L "https://app.roboflow.com/ds/bF2K1llYOI?key=ug9B0e6ORi" > roboflow.zip; unzip roboflow.zip; rm roboflow.zip

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [None]:
CFG = {
    'IMG_SIZE':300,
    'EPOCHS':10,
    'LEARNING_RATE':3e-4,
    'BATCH_SIZE':30,
    'SEED':41
}

## Fixed RandomSeed

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

## Data Pre-processing

In [None]:
all_img_list = glob.glob('/content/train/*/*')

In [None]:
all_img_list_2 = glob.glob('/content/valid/*/*')

In [None]:
all_img_list = all_img_list + all_img_list_2

In [None]:
df = pd.DataFrame(columns=['img_path', 'label'])
df['img_path'] = all_img_list
df['label'] = df['img_path'].apply(lambda x : str(x).split('/')[3])

In [None]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.1, stratify=df['label'], random_state=CFG['SEED'])

## Label-Encoding

In [None]:
le = preprocessing.LabelEncoder()
train['label'] = le.fit_transform(train['label'])
val['label'] = le.transform(val['label'])

## CustomDataset

In [None]:
class CustomDataset(Dataset):
    def __init__(self, img_path_list, label_list, transforms=None):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.transforms = transforms
        self.feature = []

        for img_path in tqdm(self.img_path_list):
            image = cv2.imread(img_path)
            if self.transforms is not None:
                image = self.transforms(image=image)['image']
            self.feature.append(image)

    def __getitem__(self, index):
        if self.label_list is not None:
            return self.feature[index], self.label_list[index]
        else:
            return self.feature[index]
        
    def __len__(self):
        return len(self.img_path_list)

In [None]:
train_transform = A.Compose([
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

test_transform = A.Compose([
                            A.Resize(CFG['IMG_SIZE'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

In [None]:
train_dataset = CustomDataset(train['img_path'].values, train['label'].values, train_transform)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

val_dataset = CustomDataset(val['img_path'].values, val['label'].values, test_transform)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

  0%|          | 0/7315 [00:00<?, ?it/s]

  0%|          | 0/813 [00:00<?, ?it/s]

In [None]:
import gc
gc.collect()

17464

## Model Define

In [None]:
!pip install timm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
class BaseModel(nn.Module):
    def __init__(self, num_classes=len(le.classes_)):
        super(BaseModel, self).__init__()
        self.backbone = models.efficientnet_b3(pretrained=True)
        self.classifier = nn.Linear(1000, num_classes)
        
    def forward(self, x):
        x = self.backbone(x)
        x = self.classifier(x)
        return x

## Train

In [None]:

def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    
    best_score = 0
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for imgs, labels in tqdm(iter(train_loader)):
            imgs = imgs.float().to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            output = model(imgs)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val Weighted F1 Score : [{_val_score:.5f}]')
       
        if scheduler is not None:
            scheduler.step(_val_score)
            
        if best_score < _val_score:
            best_score = _val_score
            best_model = model
    
    return best_model
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    preds, true_labels = [], []

    with torch.no_grad():
        for imgs, labels in tqdm(iter(val_loader)):
            imgs = imgs.float().to(device)
            labels = labels.to(device)
            
            pred = model(imgs)
            
            loss = criterion(pred, labels)
            
            preds += pred.argmax(1).detach().cpu().numpy().tolist()
            true_labels += labels.detach().cpu().numpy().tolist()
            
            val_loss.append(loss.item())
        
        _val_loss = np.mean(val_loss)
        _val_score = f1_score(true_labels, preds, average='weighted')
    
    return _val_loss, _val_score
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, threshold_mode='abs', min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)
    

  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.87400] Val Loss : [0.36675] Val Weighted F1 Score : [0.87558]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.22089] Val Loss : [0.24692] Val Weighted F1 Score : [0.94307]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.09174] Val Loss : [0.20275] Val Weighted F1 Score : [0.95552]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.08467] Val Loss : [0.32473] Val Weighted F1 Score : [0.92919]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.06656] Val Loss : [0.32027] Val Weighted F1 Score : [0.94110]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.05870] Val Loss : [0.43631] Val Weighted F1 Score : [0.94250]
Epoch 00006: reducing learning rate of group 0 to 1.5000e-04.


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.02660] Val Loss : [0.25644] Val Weighted F1 Score : [0.95893]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.00733] Val Loss : [0.28222] Val Weighted F1 Score : [0.95536]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.00772] Val Loss : [0.24489] Val Weighted F1 Score : [0.96040]


  0%|          | 0/244 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.00452] Val Loss : [0.24970] Val Weighted F1 Score : [0.96404]


In [None]:
infer_model

## Run!!

In [None]:
from sklearn.model_selection import StratifiedKFold

def run_k_fold(n_splits=5, shuffle=True, random_state=CFG['SEED']):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

for fold, (train_idx, val_idx) in enumerate(skf.split(df['img_path'], df['label'])):
    print(f"{'='*40} Fold : {fold} {'='*40}")
    train = df.iloc[train_idx].reset_index(drop=True)
    val = df.iloc[val_idx].reset_index(drop=True)
    
    ## Label-Encoding
    le = preprocessing.LabelEncoder()
    train['label'] = le.fit_transform(train['label'])
    val['label'] = le.transform(val['label'])

    ## Dataloader
    train_dataset = CustomDataset(train['img_path'].values, train['label'].values, train_transform)
    train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

    val_dataset = CustomDataset(val['img_path'].values, val['label'].values, test_transform)
    val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
    
    ## Model Define
    model = BaseModel()
    model.eval()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG["LEARNING_RATE"])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, threshold_mode='abs', min_lr=1e-8, verbose=True)

    infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)
    torch.save(infer_model.state_dict(), f'fold_{fold}_model.pt')

IndentationError: ignored

## Inference

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
test = pd.read_csv('/content/drive/MyDrive/testfile/test.csv')

In [None]:
test['img_path'] = test['img_path'].apply(lambda x: x.replace('./test', '/content/drive/MyDrive/testfile/test'))


In [None]:
test

In [None]:
test_dataset = CustomDataset(test['img_path'].values, None, test_transform)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

  0%|          | 0/792 [00:00<?, ?it/s]

In [None]:
def inference(model, test_loader, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for imgs in tqdm(iter(test_loader)):
            imgs = imgs.float().to(device)
            
            pred = model(imgs)
            
            preds += pred.argmax(1).detach().cpu().numpy().tolist()
    
    preds = le.inverse_transform(preds)
    return preds

In [None]:
infer_model

In [None]:
preds = inference(infer_model, test_loader, device)

  0%|          | 0/27 [00:00<?, ?it/s]

## Submission

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/testfile/sample_submission.csv')

In [None]:
submit['label'] = preds

In [None]:
submit

Unnamed: 0,id,label
0,TEST_000,18
1,TEST_001,10
2,TEST_002,18
3,TEST_003,7
4,TEST_004,10
...,...,...
787,TEST_787,3
788,TEST_788,9
789,TEST_789,10
790,TEST_790,18


In [None]:
submit.to_csv('/content/effb3.csv', index=False)

In [None]:
import csv

# 딕셔너리 생성
label_dict = {
    0: '가구수정',
    1: '걸레받이수정',
    2: '곰팡이',
    3: '꼬임',
    4: '녹오염',
    5: '들뜸',
    6: '면불량',
    7: '몰딩수정',
    8: '반점',
    9: '석고수정',
    10: '오염',
    11: '오타공',
    12: '울음',
    13: '이음부불량',
    14: '창틀문틀수정',
    15: '터짐',
    16: '틈새과다',
    17: '피스',
    18: '훼손'
}

# CSV 파일 열기
with open('/content/effb3.csv', mode='r') as file:
    reader = csv.reader(file)
    submission = list(reader)

# 레이블 번호를 한글로 변경
for i in range(1, len(submission)):
    submission[i][1] = label_dict[int(submission[i][1])]

# 새로운 CSV 파일로 저장
with open('/content/effb33.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(submission)

In [None]:
submit['label'] = submit['label'].apply(lambda x: label_dict.get(x, x))

In [None]:
submit