# File Structure
```
┖ dataset - 데이터셋 파일 위치
    ┖ train.h5
    ┖ test.h5
    ┖ train.csv
    ┖ sample_submission.csv
┖ Final_code.ipynb - 전체 코드 실행 파일
```

# Load library and variable

In [1]:
#사용 모듈

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import os
import time
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, FastICA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import h5py # .h5 파일을 읽기 위한 패키지
import cv2
import glob
from os.path import join as opj

In [2]:
#변수 준비

debug = False
CFG = {
    'CLF_LR': 0.0001,
    'BATCH_SIZE': 64,
    'SEED': 42,
    'MODEL_NAME': 'densenet161',
    'MODEL_NAME_2': 'tf_efficientnet_b4_ns',
    'MODEL_NAME_3': 'xception',
    'EPOCHS': 40,
}

# 3d->2d using PCA
onlygoodman님의 접근방식을 참고했습니다.

https://dacon.io/competitions/official/235951/codeshare/6476

In [3]:
train_all = h5py.File('./dataset/train.h5', 'r')
test_all = h5py.File('./dataset/test.h5', 'r')

In [4]:
train_dir = f'./dataset/trainimage_224NPCA/'
test_dir = f'./dataset/testimage_224NPCA/'

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

In [None]:
#train data 준비, 224

count = 0
failcount = 0 

#pca = SparsePCA(n_components=2), trainimage_224SPCA
pca = PCA(n_components=2)
fig = plt.figure(figsize=(8, 8))

for i in tqdm(range(50000)):
    if True:
        testdata = np.array(train_all[str(i)])
        
        #3차원 -> 2차원으로 축소
        df_pca = pca.fit_transform(testdata)
        x = df_pca[:, 0]
        y = df_pca[:, 1]
        plt.axis('off')
        #산점도의 점크기 -> s=1로 하면 사진이 너무 두꺼워짐
        plt.scatter(x, y, s=0.2, c="black")
        
        #224*224 size로 저장
        plt.savefig('./dataset/trainimage_224NPCA/train_image{:0>5}.jpg'.format(i), dpi=28)
        plt.cla()
        count+=1
        
    else:
        failcount += 1

In [None]:
#test data 준비, 224

count = 0
failcount = 0 

#pca = SparsePCA(n_components=2), testimage_224SPCA
pca = PCA(n_components=2)
fig = plt.figure(figsize=(8, 8))
    
for i in tqdm(range(40000)):
    if True:
        testdata = np.array(test_all[str(i+50000)])
        
        #3차원 -> 2차원으로 축소
        df_pca = pca.fit_transform(testdata)
        x = df_pca[:, 0]
        y = df_pca[:, 1]
        plt.axis('off')
        #산점도의 점크기 -> s=1로 하면 사진이 너무 두꺼워짐
        plt.scatter(x, y, s=0.2, c="black")
        
        #224*224 size로 저장
        plt.savefig('./dataset/testimage_224NPCA/test_image{:0>5}.jpg'.format(i), dpi=28)
        plt.cla()
        count+=1
        
    else:
        failcount += 1   

# model

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, DataLoader
import timm
import random

import warnings
warnings.filterwarnings(action='ignore') 

In [8]:
class Classifier_Dataset(Dataset):
  def __init__(self, df):
    self.df = df
  def __len__(self):
    return len(self.df)
  def __getitem__(self, idx):
    img = cv2.imread(self.df.iloc[idx, 0], cv2.IMREAD_GRAYSCALE)
    img = img / 255
    img = torch.Tensor(img)[None, :]

    label = self.df.iloc[idx, 1]
    label = torch.eye(10)[label]
    return img, label

In [9]:
class Case_Classifier(nn.Module):
    def __init__(self, name):
        super(Case_Classifier, self).__init__()
        self.model = timm.create_model(name, pretrained = True, num_classes = 10, in_chans=1)
        self.softmax = nn.Softmax()
        
    def forward(self, x):
        x = self.model(x) #1층
        return self.softmax(x)

In [10]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [11]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# train

In [12]:
train_paths = sorted(glob.glob('./dataset/trainimage_224NPCA/*.jpg'))

train_df = pd.DataFrame({'train_path':train_paths})
label_df = pd.read_csv('./dataset/train.csv')

train_label = []
for i in tqdm(range(len(train_df))):
    train_label.append(label_df[label_df['ID'] == int(train_df.iloc[i][0][-9:-4])]['label'].iloc[0])

train_df['label'] = train_label

print(train_df.shape)
train_df.head()

  0%|          | 0/50000 [00:00<?, ?it/s]

(50000, 2)


Unnamed: 0,train_path,label
0,./dataset/trainimage_224NPCA\train_image00000.jpg,5
1,./dataset/trainimage_224NPCA\train_image00001.jpg,0
2,./dataset/trainimage_224NPCA\train_image00002.jpg,4
3,./dataset/trainimage_224NPCA\train_image00003.jpg,1
4,./dataset/trainimage_224NPCA\train_image00004.jpg,9


In [13]:
from collections import Counter
cnt = Counter(train_label)
print(cnt)

Counter({1: 5678, 7: 5175, 3: 5101, 9: 4988, 2: 4968, 6: 4951, 0: 4932, 4: 4859, 8: 4842, 5: 4506})


## Train Densenet161

In [None]:
from sklearn.model_selection import StratifiedKFold

model_dir = f'./model/'+CFG['MODEL_NAME']
os.makedirs(model_dir, exist_ok=True)
print(model_dir)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=CFG['SEED'])
for i, [train_idx, val_idx] in enumerate(kf.split(train_df, train_df['label'])):
    if i == 3:
        break
    print('########## %dth train ##########' %(i))
    
    seed_everything(CFG['SEED']) # Seed 고정
    
    df_train = train_df.iloc[train_idx]
    df_val = train_df.iloc[val_idx]

    cls_set = Classifier_Dataset(df_train)
    cls_val_set = Classifier_Dataset(df_val)
    cls_loader = DataLoader(cls_set, batch_size=CFG['BATCH_SIZE'], shuffle=True)
    cls_val_loader = DataLoader(cls_val_set, batch_size=CFG['BATCH_SIZE'], shuffle=True)
    classifier = Case_Classifier(CFG['MODEL_NAME'])
    classifier.to(device)

    optimizer = torch.optim.AdamW(params=classifier.parameters(), lr=CFG['CLF_LR'])
    criterion = nn.CrossEntropyLoss()

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=5, factor=0.5)
    
    best_acc = 0
    np.set_printoptions(precision=6, suppress=True)
    for epoch in range(CFG['EPOCHS']):
      train_losses = []
      val_losses = []
      accuracy = 0
      
      classifier.train()
      for img, label in tqdm(cls_loader):
        img = img.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        pred = classifier(img)
        loss = criterion(label, pred)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
      
      classifier.eval()

      with torch.no_grad():
        for img, label in tqdm(cls_val_loader):
          img = img.to(device)
          label = label.to(device)

          pred = classifier(img)
          loss = criterion(label, pred)
          val_losses.append(loss.item())

          label = label.argmax(dim=1)
          pred = pred.argmax(dim=1)
          acc = (label==pred).count_nonzero()
          accuracy += acc.item() / len(cls_val_set)
        
      if best_acc < accuracy:
        torch.save(classifier.state_dict(), model_dir+f'/cnn_classifier_{i}.pth')
        print('##########Model Saved!##########')
        best_acc = accuracy


      if scheduler is not None:
        scheduler.step(accuracy)

      train_losses = np.mean(train_losses)
      val_losses = np.mean(val_losses)

      print(f'[EPOCH:{epoch+1}/{CFG["EPOCHS"]}] [Train Loss:{train_losses}] [Val Loss:{val_losses}] [Val Accuracy:{accuracy}]')

## Train efficientnet_b4

In [None]:
from sklearn.model_selection import StratifiedKFold

model_dir = f'./model/'+CFG['MODEL_NAME_2']
os.makedirs(model_dir, exist_ok=True)
print(model_dir)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=CFG['SEED'])
for i, [train_idx, val_idx] in enumerate(kf.split(train_df, train_df['label'])):
    if i < 3:
        continue
    if i == 6:
        break
    print('########## %dth train ##########' %(i))
    
    seed_everything(CFG['SEED']) # Seed 고정
    
    df_train = train_df.iloc[train_idx]
    df_val = train_df.iloc[val_idx]

    cls_set = Classifier_Dataset(df_train)
    cls_val_set = Classifier_Dataset(df_val)
    cls_loader = DataLoader(cls_set, batch_size=CFG['BATCH_SIZE'], shuffle=True)
    cls_val_loader = DataLoader(cls_val_set, batch_size=CFG['BATCH_SIZE'], shuffle=True)
    classifier = Case_Classifier(CFG['MODEL_NAME_2'])
    classifier.to(device)

    optimizer = torch.optim.AdamW(params=classifier.parameters(), lr=CFG['CLF_LR'])
    criterion = nn.CrossEntropyLoss()

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=5, factor=0.5)
    
    best_acc = 0
    np.set_printoptions(precision=6, suppress=True)
    for epoch in range(CFG['EPOCHS']):
      train_losses = []
      val_losses = []
      accuracy = 0
      
      classifier.train()
      for img, label in tqdm(cls_loader):
        img = img.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        pred = classifier(img)
        loss = criterion(label, pred)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
      
      classifier.eval()

      with torch.no_grad():
        for img, label in tqdm(cls_val_loader):
          img = img.to(device)
          label = label.to(device)

          pred = classifier(img)
          loss = criterion(label, pred)
          val_losses.append(loss.item())

          label = label.argmax(dim=1)
          pred = pred.argmax(dim=1)
          acc = (label==pred).count_nonzero()
          accuracy += acc.item() / len(cls_val_set)
        
      if best_acc < accuracy:
        torch.save(classifier.state_dict(), model_dir+f'/cnn_classifier_{i}.pth')
        print('##########Model Saved!##########')
        best_acc = accuracy


      if scheduler is not None:
        scheduler.step(accuracy)

      train_losses = np.mean(train_losses)
      val_losses = np.mean(val_losses)

      print(f'[EPOCH:{epoch+1}/{CFG["EPOCHS"]}] [Train Loss:{train_losses}] [Val Loss:{val_losses}] [Val Accuracy:{accuracy}]')

## Train xception

In [None]:
from sklearn.model_selection import StratifiedKFold

model_dir = f'./model/'+CFG['MODEL_NAME_3']
os.makedirs(model_dir, exist_ok=True)
print(model_dir)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=CFG['SEED'])
for i, [train_idx, val_idx] in enumerate(kf.split(train_df, train_df['label'])):
    if i < 6:
        continue
    if i == 9:
        break
    print('########## %dth train ##########' %(i))
    
    seed_everything(CFG['SEED']) # Seed 고정
    
    df_train = train_df.iloc[train_idx]
    df_val = train_df.iloc[val_idx]

    cls_set = Classifier_Dataset(df_train)
    cls_val_set = Classifier_Dataset(df_val)
    cls_loader = DataLoader(cls_set, batch_size=CFG['BATCH_SIZE'], shuffle=True)
    cls_val_loader = DataLoader(cls_val_set, batch_size=CFG['BATCH_SIZE'], shuffle=True)
    classifier = Case_Classifier(CFG['MODEL_NAME_3'])
    classifier.to(device)

    optimizer = torch.optim.AdamW(params=classifier.parameters(), lr=CFG['CLF_LR'])
    criterion = nn.CrossEntropyLoss()

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=5, factor=0.5)
    
    best_acc = 0
    np.set_printoptions(precision=6, suppress=True)
    for epoch in range(CFG['EPOCHS']):
      train_losses = []
      val_losses = []
      accuracy = 0
      
      classifier.train()
      for img, label in tqdm(cls_loader):
        img = img.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        pred = classifier(img)
        loss = criterion(label, pred)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
      
      classifier.eval()

      with torch.no_grad():
        for img, label in tqdm(cls_val_loader):
          img = img.to(device)
          label = label.to(device)

          pred = classifier(img)
          loss = criterion(label, pred)
          val_losses.append(loss.item())

          label = label.argmax(dim=1)
          pred = pred.argmax(dim=1)
          acc = (label==pred).count_nonzero()
          accuracy += acc.item() / len(cls_val_set)
        
      if best_acc < accuracy:
        torch.save(classifier.state_dict(), model_dir+f'/cnn_classifier_{i}.pth')
        print('##########Model Saved!##########')
        best_acc = accuracy


      if scheduler is not None:
        scheduler.step(accuracy)

      train_losses = np.mean(train_losses)
      val_losses = np.mean(val_losses)

      print(f'[EPOCH:{epoch+1}/{CFG["EPOCHS"]}] [Train Loss:{train_losses}] [Val Loss:{val_losses}] [Val Accuracy:{accuracy}]')

# inference

In [52]:
class Inference_Dataset(Dataset):
  def __init__(self, df):
    self.df = df
  def __len__(self):
    return len(self.df)
  def __getitem__(self, idx):
    img1 = cv2.imread(self.df.iloc[idx, 0], cv2.IMREAD_GRAYSCALE)
    
    img1 = img1 /255.0
    
    img1 = torch.Tensor(img1)[None, :]

    return img1

In [53]:
models = []
model_dir = f'./model/'+CFG['MODEL_NAME']
model_dir_2 = f'./model/'+CFG['MODEL_NAME_2']
model_dir_3 = f'./model/'+CFG['MODEL_NAME_3']

for i in range(3):
    classifier = Case_Classifier(CFG['MODEL_NAME'])
    model_path = model_dir+f'/cnn_classifier_{i}.pth'
    classifier.load_state_dict(torch.load(model_path))
    classifier.to(device)
    classifier.eval()
    models.append(classifier)
    
for i in range(3,6):
    classifier = Case_Classifier(CFG['MODEL_NAME_2'])
    model_path = model_dir_2+f'/cnn_classifier_{i}.pth'
    classifier.load_state_dict(torch.load(model_path))
    classifier.to(device)
    classifier.eval()
    models.append(classifier)
    
for i in range(6,9):
    classifier = Case_Classifier(CFG['MODEL_NAME_3'])
    model_path = model_dir_3+f'/cnn_classifier_{i}.pth'
    classifier.load_state_dict(torch.load(model_path))
    classifier.to(device)
    classifier.eval()
    models.append(classifier)

In [54]:
print(len(models))

9


In [55]:
test_paths = sorted(glob.glob('./dataset/testimage_224NPCA/*.jpg'))

test_df = pd.DataFrame({'test_path':test_paths})
print(test_df.shape)
test_df.head()

(40000, 1)


Unnamed: 0,test_path
0,./dataset/testimage_224NPCA\test_image00000.jpg
1,./dataset/testimage_224NPCA\test_image00001.jpg
2,./dataset/testimage_224NPCA\test_image00002.jpg
3,./dataset/testimage_224NPCA\test_image00003.jpg
4,./dataset/testimage_224NPCA\test_image00004.jpg


In [56]:
test_dataset = Inference_Dataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

n = 0

result_list = []

with torch.no_grad():
    for img1 in tqdm(iter(test_loader)):
        label_pred = []
        
        img1 = img1.to(device)
        
        for i in range(len(models)):
            label_pred.append(models[i](img1))
            
        standard = label_pred[0]
        for i in range(1, len(label_pred)):
            standard = standard + label_pred[i]
        
        for i in range(len(standard)):
            label = standard[i].argmax(dim=0).item()
            result_list.append(label)

  0%|          | 0/625 [00:00<?, ?it/s]

In [59]:
from collections import Counter
cnt = Counter(result_list)
print(cnt)

Counter({1: 4450, 7: 4170, 2: 4124, 3: 4070, 8: 4014, 4: 3986, 0: 3970, 9: 3952, 6: 3708, 5: 3556})


In [57]:
submission_df = pd.read_csv('./dataset/sample_submission.csv')
submission_df['label'] = result_list
submission_df.head()

Unnamed: 0,ID,label
0,50000,7
1,50001,2
2,50002,2
3,50003,4
4,50004,9


In [58]:
submission_df.to_csv('./submit_5.csv', index=False)