In [1]:
# Mount Google Drive inside the Colab runtime so the dataset archive is reachable.
from google.colab import drive

drive.mount('/content/gdrive')
# Quietly (-q) extract the dataset archive into the current working directory.
!unzip -q "/content/gdrive/MyDrive/open.zip"

Mounted at /content/gdrive


데이터 로드 및 데이터프레임 생성

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.isotonic import IsotonicRegression
from tensorflow.keras.applications import Xception, ConvNeXtBase
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.utils import Sequence
import albumentations as A

def Create_DataFrame(train_dir, test_dir, test_csv):
    """Build the train and test DataFrames from the image directories.

    Every file found under ``train_dir`` becomes one training row whose label
    is the name of the directory that directly contains it. Every file found
    under ``test_dir`` is attached, in traversal order, to the rows of
    ``test_csv`` as the ``img_path`` column.

    Directories and files are visited in sorted order so that the row order
    (and therefore any seeded split computed from it) is reproducible across
    machines and file systems.

    Args:
        train_dir: Root directory of the training images.
        test_dir: Root directory of the test images.
        test_csv: Path of the CSV describing the test set.

    Returns:
        ``(train_df, test_df)`` where ``train_df`` has the columns
        ``img_path`` and ``label``, and ``test_df`` is the CSV content with
        its ``img_path`` column set to the discovered files.

    Raises:
        ValueError: If the number of files under ``test_dir`` differs from
            the number of rows in ``test_csv``.
    """
    train_paths, train_labels = [], []
    for root, dirs, files in os.walk(train_dir):
        # os.walk honours in-place edits of `dirs`; sorting fixes the traversal order.
        dirs.sort()
        for f in sorted(files):
            train_paths.append(os.path.join(root, f))
            train_labels.append(os.path.basename(root))
    train_df = pd.DataFrame({'img_path': train_paths, 'label': train_labels})

    test_paths = []
    for root, dirs, files in os.walk(test_dir):
        dirs.sort()
        for f in sorted(files):
            test_paths.append(os.path.join(root, f))

    test_df = pd.read_csv(test_csv)
    # Paths are matched to CSV rows purely by position, so the counts must agree.
    if len(test_paths) != len(test_df):
        raise ValueError(
            f"Found {len(test_paths)} image files under {test_dir!r} but "
            f"{len(test_df)} rows in {test_csv!r}; cannot align test images with the CSV."
        )
    test_df['img_path'] = test_paths
    return train_df, test_df

# Build the train/test tables from the extracted dataset.
train_df, test_df = Create_DataFrame(
    train_dir='/content/open/train',
    test_dir='/content/open/test',
    test_csv='/content/open/test.csv',
)

# Alphabetically ordered class names define the integer encoding of the labels.
class_names = sorted(set(train_df['label']))
n_classes = len(class_names)
label2idx = dict(zip(class_names, range(n_classes)))

# Integer-encoded training labels, aligned row-for-row with train_df.
y = train_df['label'].map(label2idx).to_numpy()

시퀀스 데이터셋

In [None]:

class Stone_Dataset(Sequence):
    """Keras ``Sequence`` that yields batches of 224x224 RGB images read from disk.

    Each image is loaded with OpenCV, converted from BGR to RGB, optionally
    augmented, resized to 224x224 and scaled by 1/255.

    Args:
        paths: Indexable collection of image file paths.
        labels: Integer class indices aligned with ``paths``, or ``None`` to
            yield images only (inference mode).
        batch_size: Number of samples per batch.
        augmentor: Optional callable used as ``augmentor(image=img)['image']``;
            skipped when falsy.
        num_classes: Width of the one-hot label vectors. When ``None`` the
            module-level ``n_classes`` is used.
        **kwargs: Forwarded to the ``Sequence`` base-class constructor.
    """

    def __init__(self, paths, labels, batch_size=64, augmentor=None,
                 num_classes=None, **kwargs):
        # The Keras base class emits a warning when its constructor is never invoked.
        super().__init__(**kwargs)
        self.paths, self.labels = paths, labels
        self.bs = batch_size
        self.aug = augmentor
        self.num_classes = num_classes

    def __len__(self):
        """Return the number of batches; the last one may be partial."""
        return int(np.ceil(len(self.paths) / self.bs))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``X`` (no labels) or ``(X, y_onehot)``.

        Raises:
            FileNotFoundError: If an image cannot be read by OpenCV.
        """
        start, stop = idx * self.bs, (idx + 1) * self.bs
        batch = self.paths[start:stop]
        X = np.zeros((len(batch), 224, 224, 3), dtype=np.float32)
        for i, p in enumerate(batch):
            bgr = cv2.imread(p)
            # cv2.imread signals failure by returning None instead of raising.
            if bgr is None:
                raise FileNotFoundError(f"Could not read image: {p!r}")
            img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
            if self.aug: img = self.aug(image=img)['image']
            img = cv2.resize(img, (224, 224)) / 255.
            X[i] = img
        if self.labels is None: return X
        lbl = self.labels[start:stop]
        depth = n_classes if self.num_classes is None else self.num_classes
        y_onehot = np.eye(depth)[lbl]
        return X, y_onehot


모델 빌더

In [None]:

def build(backbone):
    """Assemble a softmax classifier on top of a Keras application backbone.

    Args:
        backbone: Model constructor (e.g. ``Xception``) that accepts
            ``include_top``, ``weights`` and ``input_tensor``.

    Returns:
        A ``Model`` mapping a 224x224x3 input to ``n_classes`` softmax
        probabilities. The backbone is created with ImageNet weights and
        without its original classification head.
    """
    image_in = Input((224, 224, 3))
    feature_extractor = backbone(
        include_top=False,
        weights='imagenet',
        input_tensor=image_in,
    )
    pooled = GlobalAveragePooling2D()(feature_extractor.output)
    class_probs = Dense(n_classes, activation='softmax')(pooled)
    return Model(image_in, class_probs)

# Paths to the previously trained weight files (stored on the mounted Google Drive)
xp_w = '/content/gdrive/MyDrive/Xception10-0.31.weights.h5'
cn_w = '/content/gdrive/MyDrive/ConvNeXtBase06-0.27.weights.h5'


미리 학습된 모델 가중치를 이용한 K-Fold

In [None]:

NF = 5
skf = StratifiedKFold(n_splits=NF, shuffle=True, random_state=42)

# Per-sample class probabilities from each model, filled in fold by fold.
oof_x = np.zeros((len(train_df), n_classes), dtype=np.float32)
oof_c = np.zeros_like(oof_x)

# The checkpoints are fixed, so each network is built and loaded once instead
# of once per fold.
m_x = build(Xception);  m_x.load_weights(xp_w)
m_c = build(ConvNeXtBase); m_c.load_weights(cn_w)

# NOTE(review): no training happens in this loop, so these predictions are only
# truly out-of-fold if the checkpoints were trained without these images — confirm.
for fold, (_, va_idx) in enumerate(skf.split(train_df, y), 1):
    print(f"\n-- Fold {fold}")
    # Validation split of this fold (no augmentation at prediction time)
    va_paths = train_df['img_path'].iloc[va_idx].values
    va_lbl   = y[va_idx]
    va_ds = Stone_Dataset(va_paths, va_lbl, batch_size=64, augmentor=None)

    # Xception / ConvNeXtBase probabilities for the held-out rows
    oof_x[va_idx] = m_x.predict(va_ds, verbose=0)
    oof_c[va_idx] = m_c.predict(va_ds, verbose=0)


앙상블을 위한 최적의 가중치 탐색

In [None]:
# Grid-search the Xception share of the blend in steps of 0.01 and score each
# candidate by macro-F1 on the stored per-sample predictions.
weight_grid = np.linspace(0, 1, 101)
grid_scores = [
    f1_score(y, (w * oof_x + (1 - w) * oof_c).argmax(axis=1), average='macro')
    for w in weight_grid
]

# np.argmax returns the first maximum, i.e. the smallest weight among ties.
top = int(np.argmax(grid_scores))
if grid_scores[top] > 0:
    best_f1, best_w = grid_scores[top], weight_grid[top]
else:
    # No candidate scored above zero: keep the initial defaults.
    best_w, best_f1 = 0, 0

print(f"\n▶ Best macro-F1 on OOF: {best_f1:.4f} (w_x={best_w:.2f}, w_c={1-best_w:.2f})")


OOF 앙상블 확률에 대해 클래스별 Isotonic 캘리브레이션


In [None]:

# Blend the two models' stored predictions with the selected weight.
oof_ens = best_w * oof_x + (1 - best_w) * oof_c

# One calibrator per class: maps the blended probability of class j to the
# observed frequency of (y == j). Inputs outside the fitted range are clipped.
iso_models = [
    IsotonicRegression(out_of_bounds='clip').fit(
        oof_ens[:, j], (y == j).astype(int)
    )
    for j in range(n_classes)
]

탐색한 가중치 적용하여 앙상블

In [None]:
test_ds = Stone_Dataset(test_df['img_path'].values, None, batch_size=64, augmentor=None)

# The checkpoints are fixed, so there is nothing to average over folds:
# build each network once, load its weights, and run a single pass over the test set.
m_x = build(Xception); m_x.load_weights(xp_w)
m_c = build(ConvNeXtBase); m_c.load_weights(cn_w)
preds_x = m_x.predict(test_ds, verbose=0)
preds_c = m_c.predict(test_ds, verbose=0)

# Blend with the weight selected on the training-set predictions.
ens_test = best_w*preds_x + (1-best_w)*preds_c

보정 적용 (보정하지 않아도 될 것 같음)

In [None]:
# 캘리브레이션 적용
calib = np.zeros_like(ens_test)
for j,ir in enumerate(iso_models):
    calib[:,j] = ir.predict(ens_test[:,j])
# row-wise 정규화
calib = calib / calib.sum(axis=1, keepdims=True)

# 최종 레이블
final_idx = calib.argmax(axis=1)
final_labels = [class_names[i] for i in final_idx]

sub = pd.read_csv('/content/open/sample_submission.csv')
sub['rock_type'] = final_labels
sub.to_csv('submit_cv_blend_calibrated.csv', index=False)

print("✅ Done: submit_cv_blend_calibrated.csv")


-- Fold 1


  self._warn_if_super_not_called()



-- Fold 2

-- Fold 3

-- Fold 4

-- Fold 5

▶ Best macro-F1 on OOF: 0.9768 (w_x=0.48, w_c=0.52)


  self._warn_if_super_not_called()


✅ Done: submit_cv_blend_calibrated.csv
