# IMPORT LIBRARIES

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GroupKFold

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler


import lightgbm as lgb
import catboost as cb
import xgboost as xgb

import optuna
from optuna.samplers import TPESampler

# Alternative undersampling techniques.
# They took a long time to run, so RandomUnderSampler is used instead.
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import NearMiss, TomekLinks
from sklearn.impute import SimpleImputer
from dataclasses import dataclass

import time
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, VarianceThreshold


In [2]:
@dataclass
class CFG_GLOBAL:
    """Notebook-wide cross-validation settings."""
    # Easy to overlook: double-check these values before running!
    num_folds = 5
    train_folds = list(range(num_folds))

cfg_global = CFG_GLOBAL()

# Image model 20240827234748

In [12]:
import os
from pathlib import Path

import numpy as np
import re
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score

from pathlib import Path
from dataclasses import dataclass


@dataclass
class CFG:
    """Settings of image-model run 20240827234748."""
    # Easy to overlook: update every value below when switching to another run!
    run_id = "20240827234748"
    model_name = "convnextv2_nano.fcmae_ft_in22k_in1k"
    model_path = Path(f"/kaggle/input/output-{run_id}")
    img_size = 224  # 384 or 224
    folds = list(range(5))

    # Derived automatically from model_path.
    oof_path = model_path.joinpath("oof_predictions.csv")

    # --- description of the run
    #   cv:      0.1599389
    #   lb:      (not recorded)
    #   version: (not recorded)
    #   config:  convnextv2_nano.fcmae_ft_in22k_in1k
    #   date:    20240827234748
    #   desc:    convnextv2_nano.fcmae_ft_in22k_in1k + lr0.0001 + warmup + train1:1-val1:10
    #            + upsample-2 + AugumentMore + Dropout + CustomHead
    # ---------------

cfg = CFG()

# Competition inputs: the test metadata table and the sample submission template.
test_path = Path('/kaggle/input/isic-2024-challenge/test-metadata.csv')
subm_path  = Path('/kaggle/input/isic-2024-challenge/sample_submission.csv')

# Identifier column; it becomes the index of both frames loaded below.
id_col = 'isic_id'

def read_data(path, cfg):
    """Read the CSV at ``path`` with polars and return a pandas frame indexed by ``id_col``.

    ``cfg`` is accepted but not used by this function.
    """
    frame = pl.read_csv(path).to_pandas()
    return frame.set_index(id_col)

df_test = read_data(test_path, cfg)
df_subm = pd.read_csv(subm_path, index_col=id_col)

# === ImageNet inference code

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py
import timm
from torchvision import transforms
from PIL import Image
import io
import albumentations as A
from albumentations.pytorch import ToTensorV2


class ISICDataset(Dataset):
    """Dataset yielding ``(image, target)`` pairs decoded from an HDF5 file of encoded images.

    Args:
        hdf5_file: Path of the HDF5 file; opened read-only in ``__init__`` and kept open.
        isic_ids: Sequence of keys; item ``idx`` reads the HDF5 entry named ``isic_ids[idx]``.
        targets: Optional sequence indexed like ``isic_ids``. When ``None`` every item
            gets the placeholder target ``torch.tensor(-1)``.
        transform: Optional callable invoked as ``transform(image=array)`` whose result
            is indexed with ``['image']``.
    """

    def __init__(self, hdf5_file, isic_ids, targets=None, transform=None):
        self.hdf5_file = h5py.File(hdf5_file, 'r')  # Keep file open
        self.isic_ids = isic_ids
        self.targets = targets
        self.transform = transform
        # NOTE(review): the handle is opened in the constructing process; confirm this is
        # safe with DataLoader(num_workers > 0) on the target platform.

    def __len__(self):
        """Number of items, i.e. the number of ids."""
        return len(self.isic_ids)

    def __getitem__(self, idx):
        """Decode entry ``idx`` into an array, apply the transform and pair it with its target."""
        # Each entry holds the encoded image bytes; PIL decodes them.
        img_bytes = self.hdf5_file[self.isic_ids[idx]][()]
        img = Image.open(io.BytesIO(img_bytes))
        img = np.array(img)

        if self.transform:
            transformed = self.transform(image=img)
            img = transformed['image']

        target = self.targets[idx] if self.targets is not None else torch.tensor(-1)
        return img, target

    def __del__(self):
        """Close the HDF5 handle when the object is destroyed."""
        # ``hdf5_file`` is missing when ``__init__`` failed before assigning it (e.g. the
        # file could not be opened); the finalizer must not raise AttributeError then.
        handle = getattr(self, "hdf5_file", None)
        if handle is not None:
            handle.close()

# Inference-time albumentations pipeline: resize to a cfg.img_size square, normalize
# per channel with the given mean/std (these values match the commonly used ImageNet
# statistics), then apply ToTensorV2.
base_transform = A.Compose([
    A.Resize(cfg.img_size, cfg.img_size),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

class GeM(nn.Module):
    """Generalized-mean pooling over the last two dimensions with a learnable exponent.

    Computes ``mean(clamp(x, min=eps) ** p) ** (1 / p)`` over a window covering the
    whole ``(x.size(-2), x.size(-1))`` plane.

    Args:
        p: Initial value of the learnable exponent (stored as a one-element parameter).
        eps: Lower clamp applied to the input before it is raised to ``p``.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` with the module's own ``p`` and ``eps``."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Generalized-mean pooling of ``x`` with explicit ``p`` and ``eps``."""
        # Reach avg_pool2d through ``nn.functional``: no ``F`` alias is imported in this
        # cell, so ``F.avg_pool2d`` would raise NameError on the first call.
        window = (x.size(-2), x.size(-1))
        return nn.functional.avg_pool2d(x.clamp(min=eps).pow(p), window).pow(1.0 / p)

    def __repr__(self):
        return f"{self.__class__.__name__}(p={self.p.data.tolist()[0]:.4f}, eps={self.eps})"

class ISICModel(nn.Module):
    """timm backbone with a family-specific head.

    The substring found in ``model_name`` selects the branch:
    ``eva02`` and ``efficientnetv2`` average ``self.linear`` over several dropout
    passes; ``convnextv2`` uses the pooled two-layer ``self.head``.

    Args:
        model_name: timm model identifier.
        num_classes: Width of the final linear layer (default 1).
        pretrained: Forwarded to ``timm.create_model``.
        checkpoint_path: Forwarded to ``timm.create_model``.

    Raises:
        ValueError: If ``model_name`` belongs to none of the supported families.

    Every registered submodule contributes ``state_dict`` keys, so attribute names
    are left untouched to keep previously saved weights loadable.
    """

    def __init__(self, model_name, num_classes=1, pretrained=False, checkpoint_path=None):
        super().__init__()
        self.model_name = model_name
        self.model = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)
        # Second backbone copy; forward() never calls it. Kept because it is a registered
        # submodule (presumably the saved checkpoints contain its weights — TODO confirm).
        self.model_org = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)
        self.model_org.classifier = nn.Identity()
        self.pooling = GeM()

        if "eva02" in self.model_name:
            in_features = self.model.head.in_features
            self.model.head = nn.Identity()
        elif "efficientnetv2" in self.model_name:
            in_features = self.model.classifier.in_features
            self.model.classifier = nn.Identity()
            if self.pooling:  # custom pooling replaces the backbone's global pool
                self.model.global_pool = nn.Identity()
        elif "convnextv2" in self.model_name:
            print("This is convnextv2 family")
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self.head = nn.Sequential(
                nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
                nn.Flatten(),  # flatten
                # Sized from the backbone instead of a hard-coded 640 so other
                # convnextv2 variants also get a matching first layer.
                nn.Linear(in_features, 256),  # new fully connected layer
                nn.ReLU(),  # activation
                nn.Dropout(0.5),  # dropout
                nn.Linear(256, num_classes),  # output layer
            )
        else:
            # Without this guard ``in_features`` would be unbound below.
            raise ValueError(
                f"Unsupported model_name {model_name!r}: expected an eva02, "
                "efficientnetv2 or convnextv2 model"
            )
        self.linear = nn.Linear(in_features, num_classes)
        self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])

    def forward(self, images):
        """Run the backbone and the family-specific head on ``images``.

        The result is ``output.squeeze()``: every size-1 dimension is removed, so a
        batch of one sample with ``num_classes=1`` yields a 0-d tensor.
        """
        features = self.model(images)

        if "efficientnetv2" in self.model_name:
            # Only this family routes features through the custom pooling.
            if self.pooling:
                features = self.pooling(features).flatten(1)

        if "convnextv2" in self.model_name:
            output = self.head(features)
        else:
            # Multi-sample dropout: average the linear output over all dropout layers.
            for i, dropout in enumerate(self.dropouts):
                if i == 0:
                    output = self.linear(dropout(features))
                else:
                    output += self.linear(dropout(features))
            output /= len(self.dropouts)

        return output.squeeze()



def get_latest_epoch_file(folder_path, target_fold):
    """Return the path of the highest-epoch checkpoint of ``target_fold`` in ``folder_path``.

    Only file names beginning with ``fold_<fold>_epoch_<epoch>_score_<float>.pth`` are
    considered (the older ``model_fold_<fold>_epoch_<epoch>.pth`` naming is not).
    Returns ``None`` when no file of that fold is found.
    """
    checkpoint_re = re.compile(r"fold_(\d+)_epoch_(\d+)_score_(\d+\.\d+)\.pth")

    best_epoch, best_name = -1, None
    for name in os.listdir(folder_path):
        found = checkpoint_re.match(name)
        if found is None:
            continue
        fold_no, epoch_no = int(found.group(1)), int(found.group(2))
        # Skip other folds and anything not strictly newer than the current best.
        if fold_no != target_fold or epoch_no <= best_epoch:
            continue
        best_epoch, best_name = epoch_no, name

    return os.path.join(folder_path, best_name) if best_name else None

def load_models(folds, device):
    """Build one ``ISICModel`` per fold and load its latest checkpoint.

    Reads ``cfg.model_name`` and ``cfg.model_path`` from the module-level ``cfg``.

    Args:
        folds: Iterable of fold numbers to load.
        device: Device the models are moved to and the weights are mapped onto.

    Returns:
        List of models in ``eval()`` mode, in the order of ``folds``.

    Raises:
        FileNotFoundError: If no checkpoint file exists for one of the folds.
    """
    models = []
    for fold in folds:
        model_w_path = get_latest_epoch_file(cfg.model_path, fold)
        if model_w_path is None:
            # Fail with a clear message instead of handing ``None`` to torch.load.
            raise FileNotFoundError(
                f"No checkpoint for fold {fold} found in {cfg.model_path}"
            )
        model = ISICModel(cfg.model_name)
        model.to(device)
        model.load_state_dict(torch.load(model_w_path, map_location=device))
        model.eval()
        models.append(model)
    return models


@torch.no_grad()  # no gradients are needed anywhere in this function
def ensemble_predict(models, test_loader, device):
    """Average the raw outputs of ``models`` over every batch of ``test_loader``.

    Args:
        models: Iterable of callables mapping an input batch to an output tensor.
        test_loader: Iterable of ``(inputs, target)`` pairs; the target is ignored.
        device: Device the inputs are moved to before the models are called.

    Returns:
        List with one averaged prediction per sample, in loader order. The model
        outputs are averaged as-is; no sigmoid or softmax is applied here.
    """
    all_predictions = []
    for inputs, _ in tqdm(test_loader, desc="Predicting"):
        inputs = inputs.to(device)
        fold_predictions = torch.stack([model(inputs) for model in models])
        avg_predictions = fold_predictions.mean(dim=0)
        # A model ending in ``.squeeze()`` returns a 0-d tensor for a batch of one
        # sample; restore the batch dimension so ``extend`` can iterate over it.
        if avg_predictions.dim() == 0:
            avg_predictions = avg_predictions.unsqueeze(0)
        all_predictions.extend(avg_predictions.cpu().numpy())
    return all_predictions


# === Run the image model on the test images and merge the result into df_test
TEST_HDF5_FILE_PATH = '/kaggle/input/isic-2024-challenge/test-image.hdf5'

# Use CUDA when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Folds whose checkpoints are ensembled
folds = cfg.folds

# One model per fold, each loaded from its latest checkpoint
models = load_models(folds, device)

# Test dataset: images are looked up in the HDF5 file by the ids in df_test's index
test_dataset = ISICDataset(
    hdf5_file=TEST_HDF5_FILE_PATH,
    isic_ids=df_test.index.values,  # df_test is indexed by id_col
    transform=base_transform,
)

# shuffle=False keeps predictions in the same order as df_test.index
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)

# Fold-averaged model outputs, one per test row
predictions = ensemble_predict(models, test_loader, device)

# Wrap the predictions in a frame aligned with df_test's index
temp_df = pd.DataFrame({"image_predict": predictions}, index=df_test.index)

# Add the "image_predict" column to df_test
df_test = df_test.join(temp_df)

# Assignment aligns on the index of both frames
df_subm["target"] = df_test["image_predict"]

# Per-run submission file, named after the run id
df_subm.to_csv(f'submission_{cfg.run_id}.csv')
df_subm.head()

Using device: cuda
This is convnextv2 family
This is convnextv2 family
This is convnextv2 family
This is convnextv2 family
This is convnextv2 family


Predicting: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


Unnamed: 0_level_0,target
isic_id,Unnamed: 1_level_1
ISIC_0015657,-0.501967
ISIC_0015729,-4.002757
ISIC_0015740,-3.307847


# Image model 20240830205516

In [13]:
import os
from pathlib import Path

import numpy as np
import re
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score

from pathlib import Path
from dataclasses import dataclass


@dataclass
class CFG:
    """Settings of image-model run 20240830205516."""
    # Easy to overlook: update every value below when switching to another run!
    run_id = "20240830205516"
    model_name = "vit_tiny_patch16_224.augreg_in21k_ft_in1k"
    model_path = Path(f"/kaggle/input/output-{run_id}")
    img_size = 224  # 384 or 224
    folds = list(range(5))

    # Derived automatically from model_path.
    oof_path = model_path.joinpath("oof_predictions.csv")

    # --- description of the run
    #   cv:      0.1612504
    #   lb:      (not recorded)
    #   version: (not recorded)
    #   config:  vit_tiny_patch16_224.augreg_in21k_ft_in1k
    #   date:    20240830205516
    #   desc:    lr1e-4 + warmup + train1:1-val1:10 + upsample-2 + AugumentMore
    #            + Dropoutx1 + CustomHead + weight_decay1e-3
    # ---------------

cfg = CFG()

# Competition inputs: the test metadata table and the sample submission template.
test_path = Path('/kaggle/input/isic-2024-challenge/test-metadata.csv')
subm_path  = Path('/kaggle/input/isic-2024-challenge/sample_submission.csv')

# Identifier column; it becomes the index of both frames loaded below.
id_col = 'isic_id'

def read_data(path, cfg):
    """Read the CSV at ``path`` with polars and return a pandas frame indexed by ``id_col``.

    ``cfg`` is accepted but not used by this function.
    """
    frame = pl.read_csv(path).to_pandas()
    return frame.set_index(id_col)

df_test = read_data(test_path, cfg)
df_subm = pd.read_csv(subm_path, index_col=id_col)

# === ImageNet inference code

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py
import timm
from torchvision import transforms
from PIL import Image
import io
import albumentations as A
from albumentations.pytorch import ToTensorV2


class ISICDataset(Dataset):
    """Dataset yielding ``(image, target)`` pairs decoded from an HDF5 file of encoded images.

    Args:
        hdf5_file: Path of the HDF5 file; opened read-only in ``__init__`` and kept open.
        isic_ids: Sequence of keys; item ``idx`` reads the HDF5 entry named ``isic_ids[idx]``.
        targets: Optional sequence indexed like ``isic_ids``. When ``None`` every item
            gets the placeholder target ``torch.tensor(-1)``.
        transform: Optional callable invoked as ``transform(image=array)`` whose result
            is indexed with ``['image']``.
    """

    def __init__(self, hdf5_file, isic_ids, targets=None, transform=None):
        self.hdf5_file = h5py.File(hdf5_file, 'r')  # Keep file open
        self.isic_ids = isic_ids
        self.targets = targets
        self.transform = transform
        # NOTE(review): the handle is opened in the constructing process; confirm this is
        # safe with DataLoader(num_workers > 0) on the target platform.

    def __len__(self):
        """Number of items, i.e. the number of ids."""
        return len(self.isic_ids)

    def __getitem__(self, idx):
        """Decode entry ``idx`` into an array, apply the transform and pair it with its target."""
        # Each entry holds the encoded image bytes; PIL decodes them.
        img_bytes = self.hdf5_file[self.isic_ids[idx]][()]
        img = Image.open(io.BytesIO(img_bytes))
        img = np.array(img)

        if self.transform:
            transformed = self.transform(image=img)
            img = transformed['image']

        target = self.targets[idx] if self.targets is not None else torch.tensor(-1)
        return img, target

    def __del__(self):
        """Close the HDF5 handle when the object is destroyed."""
        # ``hdf5_file`` is missing when ``__init__`` failed before assigning it (e.g. the
        # file could not be opened); the finalizer must not raise AttributeError then.
        handle = getattr(self, "hdf5_file", None)
        if handle is not None:
            handle.close()

# Inference-time albumentations pipeline: resize to a cfg.img_size square, normalize
# per channel with the given mean/std (these values match the commonly used ImageNet
# statistics), then apply ToTensorV2.
base_transform = A.Compose([
    A.Resize(cfg.img_size, cfg.img_size),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

class GeM(nn.Module):
    """Generalized-mean pooling over the last two dimensions with a learnable exponent.

    Computes ``mean(clamp(x, min=eps) ** p) ** (1 / p)`` over a window covering the
    whole ``(x.size(-2), x.size(-1))`` plane.

    Args:
        p: Initial value of the learnable exponent (stored as a one-element parameter).
        eps: Lower clamp applied to the input before it is raised to ``p``.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` with the module's own ``p`` and ``eps``."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Generalized-mean pooling of ``x`` with explicit ``p`` and ``eps``."""
        # Reach avg_pool2d through ``nn.functional``: no ``F`` alias is imported in this
        # cell, so ``F.avg_pool2d`` would raise NameError on the first call.
        window = (x.size(-2), x.size(-1))
        return nn.functional.avg_pool2d(x.clamp(min=eps).pow(p), window).pow(1.0 / p)

    def __repr__(self):
        return f"{self.__class__.__name__}(p={self.p.data.tolist()[0]:.4f}, eps={self.eps})"

class ISICModel(nn.Module):
    """timm backbone with a family-specific head.

    The substring found in ``model_name`` selects the branch:
    ``eva02`` / ``efficientnetv2`` average one linear layer over several dropout
    passes; ``convnextv2`` uses a pooled two-layer head; ``swinv2`` / ``vit_tiny``
    use a small feature extractor, dropout averaging and a linear classifier.

    Args:
        model_name: timm model identifier.
        num_classes: Width of the final linear layer (default 1).
        pretrained: Forwarded to ``timm.create_model``.
        checkpoint_path: Forwarded to ``timm.create_model``.

    Raises:
        ValueError: If ``model_name`` belongs to none of the supported families.

    Every registered submodule contributes ``state_dict`` keys, so attribute names
    are left untouched to keep previously saved weights loadable.
    """

    def __init__(self, model_name, num_classes=1, pretrained=False, checkpoint_path=None):
        super().__init__()
        self.model_name = model_name
        self.model = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)

        if "eva02" in self.model_name:
            in_features = self.model.head.in_features
            self.model.head = nn.Identity()
            self.linear = nn.Linear(in_features, num_classes)
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        elif "efficientnetv2" in self.model_name:
            in_features = self.model.classifier.in_features
            self.model.classifier = nn.Identity()
            self.pooling = GeM()
            if self.pooling:  # custom pooling replaces the backbone's global pool
                self.model.global_pool = nn.Identity()
            self.linear = nn.Linear(in_features, num_classes)
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        elif "convnextv2" in self.model_name:
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self.head = nn.Sequential(
                nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
                nn.Flatten(),  # flatten
                nn.Linear(in_features, 256),  # new fully connected layer
                nn.ReLU(),  # activation
                nn.Dropout(0.5),  # dropout
                nn.Linear(256, num_classes),  # output layer
            )
        elif "swinv2" in self.model_name:
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self.feature_extractor = nn.Sequential(
                # nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
                GeM(),
                nn.Flatten(),  # flatten
                nn.Linear(in_features, 256),  # new fully connected layer
                nn.ReLU(),  # activation
            )
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])  # five dropout layers
            self.classifier = nn.Linear(256, num_classes)  # output layer
        elif "vit_tiny" in self.model_name:
            in_features = self.model.head.in_features
            # Replace the backbone's final norm / head modules with pass-throughs.
            self.model.norm = nn.Identity()
            self.model.fc_norm = nn.Identity()
            self.model.head_drop = nn.Identity()
            self.model.head = nn.Identity()
            self.feature_extractor = nn.Sequential(
                nn.Flatten(),  # flatten
                nn.Linear(in_features, 64),  # new fully connected layer
                nn.ReLU(),  # activation
            )
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(1)])  # a single dropout layer
            self.classifier = nn.Linear(64, num_classes)  # output layer
        else:
            # Without this guard forward() would fail later with an unbound ``output``.
            raise ValueError(
                f"Unsupported model_name {model_name!r}: expected an eva02, "
                "efficientnetv2, convnextv2, swinv2 or vit_tiny model"
            )

    def forward(self, images):
        """Run the backbone and the family-specific head on ``images``.

        The result is ``output.squeeze()``: every size-1 dimension is removed, so a
        batch of one sample with ``num_classes=1`` yields a 0-d tensor.
        """
        features = self.model(images)

        if any(
            [
                "efficientnetv2" in self.model_name,
                "eva02" in self.model_name,
            ]
        ):
            # ``self.pooling`` is only created for efficientnetv2; reading it directly
            # on an eva02 model would raise AttributeError, so look it up defensively.
            pooling = getattr(self, "pooling", None)
            if pooling is not None:
                features = pooling(features).flatten(1)
            # Multi-sample dropout: average the linear output over all dropout layers.
            for i, dropout in enumerate(self.dropouts):
                if i == 0:
                    output = self.linear(dropout(features))
                else:
                    output += self.linear(dropout(features))
            output /= len(self.dropouts)

        if "convnextv2" in self.model_name:
            output = self.head(features)

        if "swinv2" in self.model_name:
            features = self.feature_extractor(features)
            output = torch.mean(torch.stack([dropout(features) for dropout in self.dropouts]), dim=0)
            output = self.classifier(output)

        if "vit_tiny" in self.model_name:
            features = self.feature_extractor(features)
            output = torch.mean(torch.stack([dropout(features) for dropout in self.dropouts]), dim=0)
            output = self.classifier(output)

        return output.squeeze()




def get_latest_epoch_file(folder_path, target_fold):
    """Return the path of the highest-epoch checkpoint of ``target_fold`` in ``folder_path``.

    Only file names beginning with ``fold_<fold>_epoch_<epoch>_score_<float>.pth`` are
    considered (the older ``model_fold_<fold>_epoch_<epoch>.pth`` naming is not).
    Returns ``None`` when no file of that fold is found.
    """
    checkpoint_re = re.compile(r"fold_(\d+)_epoch_(\d+)_score_(\d+\.\d+)\.pth")

    best_epoch, best_name = -1, None
    for name in os.listdir(folder_path):
        found = checkpoint_re.match(name)
        if found is None:
            continue
        fold_no, epoch_no = int(found.group(1)), int(found.group(2))
        # Skip other folds and anything not strictly newer than the current best.
        if fold_no != target_fold or epoch_no <= best_epoch:
            continue
        best_epoch, best_name = epoch_no, name

    return os.path.join(folder_path, best_name) if best_name else None

def load_models(folds, device):
    """Build one ``ISICModel`` per fold and load its latest checkpoint.

    Reads ``cfg.model_name`` and ``cfg.model_path`` from the module-level ``cfg``.

    Args:
        folds: Iterable of fold numbers to load.
        device: Device the models are moved to and the weights are mapped onto.

    Returns:
        List of models in ``eval()`` mode, in the order of ``folds``.

    Raises:
        FileNotFoundError: If no checkpoint file exists for one of the folds.
    """
    models = []
    for fold in folds:
        model_w_path = get_latest_epoch_file(cfg.model_path, fold)
        if model_w_path is None:
            # Fail with a clear message instead of handing ``None`` to torch.load.
            raise FileNotFoundError(
                f"No checkpoint for fold {fold} found in {cfg.model_path}"
            )
        model = ISICModel(cfg.model_name)
        model.to(device)
        model.load_state_dict(torch.load(model_w_path, map_location=device))
        model.eval()
        models.append(model)
    return models


@torch.no_grad()  # no gradients are needed anywhere in this function
def ensemble_predict(models, test_loader, device):
    """Average the raw outputs of ``models`` over every batch of ``test_loader``.

    Args:
        models: Iterable of callables mapping an input batch to an output tensor.
        test_loader: Iterable of ``(inputs, target)`` pairs; the target is ignored.
        device: Device the inputs are moved to before the models are called.

    Returns:
        List with one averaged prediction per sample, in loader order. The model
        outputs are averaged as-is; no sigmoid or softmax is applied here.
    """
    all_predictions = []
    for inputs, _ in tqdm(test_loader, desc="Predicting"):
        inputs = inputs.to(device)
        fold_predictions = torch.stack([model(inputs) for model in models])
        avg_predictions = fold_predictions.mean(dim=0)
        # A model ending in ``.squeeze()`` returns a 0-d tensor for a batch of one
        # sample; restore the batch dimension so ``extend`` can iterate over it.
        if avg_predictions.dim() == 0:
            avg_predictions = avg_predictions.unsqueeze(0)
        all_predictions.extend(avg_predictions.cpu().numpy())
    return all_predictions


# === Run the image model on the test images and merge the result into df_test
TEST_HDF5_FILE_PATH = '/kaggle/input/isic-2024-challenge/test-image.hdf5'

# Use CUDA when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Folds whose checkpoints are ensembled
folds = cfg.folds

# One model per fold, each loaded from its latest checkpoint
models = load_models(folds, device)

# Test dataset: images are looked up in the HDF5 file by the ids in df_test's index
test_dataset = ISICDataset(
    hdf5_file=TEST_HDF5_FILE_PATH,
    isic_ids=df_test.index.values,  # df_test is indexed by id_col
    transform=base_transform,
)

# shuffle=False keeps predictions in the same order as df_test.index
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)

# Fold-averaged model outputs, one per test row
predictions = ensemble_predict(models, test_loader, device)

# Wrap the predictions in a frame aligned with df_test's index
temp_df = pd.DataFrame({"image_predict": predictions}, index=df_test.index)

# Add the "image_predict" column to df_test
df_test = df_test.join(temp_df)

# Assignment aligns on the index of both frames
df_subm["target"] = df_test["image_predict"]

# Per-run submission file, named after the run id
df_subm.to_csv(f'submission_{cfg.run_id}.csv')
df_subm.head()

Using device: cuda


Predicting: 100%|██████████| 1/1 [00:00<00:00,  4.64it/s]


Unnamed: 0_level_0,target
isic_id,Unnamed: 1_level_1
ISIC_0015657,-2.714859
ISIC_0015729,-5.223221
ISIC_0015740,-4.123434


# Image model 20240831025049

In [14]:
import os
from pathlib import Path

import numpy as np
import re
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score

from pathlib import Path
from dataclasses import dataclass


@dataclass
class CFG:
    """Settings of image-model run 20240831025049."""
    # Easy to overlook: update every value below when switching to another run!
    run_id = "20240831025049"
    model_name = "vit_tiny_patch16_224.augreg_in21k_ft_in1k"
    model_path = Path(f"/kaggle/input/output-{run_id}")
    img_size = 224  # 384 or 224
    folds = list(range(5))

    # Derived automatically from model_path.
    oof_path = model_path.joinpath("oof_predictions.csv")

    # --- description of the run
    #   cv:      0.147796
    #   lb:      (not recorded)
    #   version: (not recorded)
    #   config:  vit_tiny_patch16_224.augreg_in21k_ft_in1k
    #   date:    20240830233240   NOTE(review): differs from run_id — confirm which is right
    #   desc:    lr1e-4 + warmup + train1:1-val1:10 + upsample-2 + AugumentMore
    #            + Dropoutx1 + CustomHead + weight_decay1e-3
    #   desc:    remap case1
    # ---------------

cfg = CFG()

# Competition inputs: the test metadata table and the sample submission template.
test_path = Path('/kaggle/input/isic-2024-challenge/test-metadata.csv')
subm_path  = Path('/kaggle/input/isic-2024-challenge/sample_submission.csv')

# Identifier column; it becomes the index of both frames loaded below.
id_col = 'isic_id'

def read_data(path, cfg):
    """Read the CSV at ``path`` with polars and return a pandas frame indexed by ``id_col``.

    ``cfg`` is accepted but not used by this function.
    """
    frame = pl.read_csv(path).to_pandas()
    return frame.set_index(id_col)

df_test = read_data(test_path, cfg)
df_subm = pd.read_csv(subm_path, index_col=id_col)

# === ImageNet inference code

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py
import timm
from torchvision import transforms
from PIL import Image
import io
import albumentations as A
from albumentations.pytorch import ToTensorV2


class ISICDataset(Dataset):
    """Dataset yielding ``(image, target)`` pairs decoded from an HDF5 file of encoded images.

    Args:
        hdf5_file: Path of the HDF5 file; opened read-only in ``__init__`` and kept open.
        isic_ids: Sequence of keys; item ``idx`` reads the HDF5 entry named ``isic_ids[idx]``.
        targets: Optional sequence indexed like ``isic_ids``. When ``None`` every item
            gets the placeholder target ``torch.tensor(-1)``.
        transform: Optional callable invoked as ``transform(image=array)`` whose result
            is indexed with ``['image']``.
    """

    def __init__(self, hdf5_file, isic_ids, targets=None, transform=None):
        self.hdf5_file = h5py.File(hdf5_file, 'r')  # Keep file open
        self.isic_ids = isic_ids
        self.targets = targets
        self.transform = transform
        # NOTE(review): the handle is opened in the constructing process; confirm this is
        # safe with DataLoader(num_workers > 0) on the target platform.

    def __len__(self):
        """Number of items, i.e. the number of ids."""
        return len(self.isic_ids)

    def __getitem__(self, idx):
        """Decode entry ``idx`` into an array, apply the transform and pair it with its target."""
        # Each entry holds the encoded image bytes; PIL decodes them.
        img_bytes = self.hdf5_file[self.isic_ids[idx]][()]
        img = Image.open(io.BytesIO(img_bytes))
        img = np.array(img)

        if self.transform:
            transformed = self.transform(image=img)
            img = transformed['image']

        target = self.targets[idx] if self.targets is not None else torch.tensor(-1)
        return img, target

    def __del__(self):
        """Close the HDF5 handle when the object is destroyed."""
        # ``hdf5_file`` is missing when ``__init__`` failed before assigning it (e.g. the
        # file could not be opened); the finalizer must not raise AttributeError then.
        handle = getattr(self, "hdf5_file", None)
        if handle is not None:
            handle.close()

# Inference-time albumentations pipeline: resize to a cfg.img_size square, normalize
# per channel with the given mean/std (these values match the commonly used ImageNet
# statistics), then apply ToTensorV2.
base_transform = A.Compose([
    A.Resize(cfg.img_size, cfg.img_size),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

class GeM(nn.Module):
    """Generalized-mean pooling over the last two dimensions with a learnable exponent.

    Computes ``mean(clamp(x, min=eps) ** p) ** (1 / p)`` over a window covering the
    whole ``(x.size(-2), x.size(-1))`` plane.

    Args:
        p: Initial value of the learnable exponent (stored as a one-element parameter).
        eps: Lower clamp applied to the input before it is raised to ``p``.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` with the module's own ``p`` and ``eps``."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Generalized-mean pooling of ``x`` with explicit ``p`` and ``eps``."""
        # Reach avg_pool2d through ``nn.functional``: no ``F`` alias is imported in this
        # cell, so ``F.avg_pool2d`` would raise NameError on the first call.
        window = (x.size(-2), x.size(-1))
        return nn.functional.avg_pool2d(x.clamp(min=eps).pow(p), window).pow(1.0 / p)

    def __repr__(self):
        return f"{self.__class__.__name__}(p={self.p.data.tolist()[0]:.4f}, eps={self.eps})"

class ISICModel(nn.Module):
    """timm backbone with a family-specific head.

    The substring found in ``model_name`` selects the branch:
    ``eva02`` / ``efficientnetv2`` average one linear layer over several dropout
    passes; ``convnextv2`` uses a pooled two-layer head; ``swinv2`` / ``vit_tiny``
    use a small feature extractor, dropout averaging and a linear classifier.

    Args:
        model_name: timm model identifier.
        num_classes: Width of the final linear layer (default 1).
        pretrained: Forwarded to ``timm.create_model``.
        checkpoint_path: Forwarded to ``timm.create_model``.

    Raises:
        ValueError: If ``model_name`` belongs to none of the supported families.

    Every registered submodule contributes ``state_dict`` keys, so attribute names
    are left untouched to keep previously saved weights loadable.
    """

    def __init__(self, model_name, num_classes=1, pretrained=False, checkpoint_path=None):
        super().__init__()
        self.model_name = model_name
        self.model = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)

        if "eva02" in self.model_name:
            in_features = self.model.head.in_features
            self.model.head = nn.Identity()
            self.linear = nn.Linear(in_features, num_classes)
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        elif "efficientnetv2" in self.model_name:
            in_features = self.model.classifier.in_features
            self.model.classifier = nn.Identity()
            self.pooling = GeM()
            if self.pooling:  # custom pooling replaces the backbone's global pool
                self.model.global_pool = nn.Identity()
            self.linear = nn.Linear(in_features, num_classes)
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        elif "convnextv2" in self.model_name:
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self.head = nn.Sequential(
                nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
                nn.Flatten(),  # flatten
                nn.Linear(in_features, 256),  # new fully connected layer
                nn.ReLU(),  # activation
                nn.Dropout(0.5),  # dropout
                nn.Linear(256, num_classes),  # output layer
            )
        elif "swinv2" in self.model_name:
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self.feature_extractor = nn.Sequential(
                # nn.AdaptiveAvgPool2d((1, 1)),  # global average pooling
                GeM(),
                nn.Flatten(),  # flatten
                nn.Linear(in_features, 256),  # new fully connected layer
                nn.ReLU(),  # activation
            )
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])  # five dropout layers
            self.classifier = nn.Linear(256, num_classes)  # output layer
        elif "vit_tiny" in self.model_name:
            in_features = self.model.head.in_features
            # Replace the backbone's final norm / head modules with pass-throughs.
            self.model.norm = nn.Identity()
            self.model.fc_norm = nn.Identity()
            self.model.head_drop = nn.Identity()
            self.model.head = nn.Identity()
            self.feature_extractor = nn.Sequential(
                nn.Flatten(),  # flatten
                nn.Linear(in_features, 64),  # new fully connected layer
                nn.ReLU(),  # activation
            )
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(1)])  # a single dropout layer
            self.classifier = nn.Linear(64, num_classes)  # output layer
        else:
            # Without this guard forward() would fail later with an unbound ``output``.
            raise ValueError(
                f"Unsupported model_name {model_name!r}: expected an eva02, "
                "efficientnetv2, convnextv2, swinv2 or vit_tiny model"
            )

    def forward(self, images):
        """Run the backbone and the family-specific head on ``images``.

        The result is ``output.squeeze()``: every size-1 dimension is removed, so a
        batch of one sample with ``num_classes=1`` yields a 0-d tensor.
        """
        features = self.model(images)

        if any(
            [
                "efficientnetv2" in self.model_name,
                "eva02" in self.model_name,
            ]
        ):
            # ``self.pooling`` is only created for efficientnetv2; reading it directly
            # on an eva02 model would raise AttributeError, so look it up defensively.
            pooling = getattr(self, "pooling", None)
            if pooling is not None:
                features = pooling(features).flatten(1)
            # Multi-sample dropout: average the linear output over all dropout layers.
            for i, dropout in enumerate(self.dropouts):
                if i == 0:
                    output = self.linear(dropout(features))
                else:
                    output += self.linear(dropout(features))
            output /= len(self.dropouts)

        if "convnextv2" in self.model_name:
            output = self.head(features)

        if "swinv2" in self.model_name:
            features = self.feature_extractor(features)
            output = torch.mean(torch.stack([dropout(features) for dropout in self.dropouts]), dim=0)
            output = self.classifier(output)

        if "vit_tiny" in self.model_name:
            features = self.feature_extractor(features)
            output = torch.mean(torch.stack([dropout(features) for dropout in self.dropouts]), dim=0)
            output = self.classifier(output)

        return output.squeeze()




def get_latest_epoch_file(folder_path, target_fold):
    """Return the path of the highest-epoch checkpoint of ``target_fold`` in ``folder_path``.

    Only file names beginning with ``fold_<fold>_epoch_<epoch>_score_<float>.pth`` are
    considered (the older ``model_fold_<fold>_epoch_<epoch>.pth`` naming is not).
    Returns ``None`` when no file of that fold is found.
    """
    checkpoint_re = re.compile(r"fold_(\d+)_epoch_(\d+)_score_(\d+\.\d+)\.pth")

    best_epoch, best_name = -1, None
    for name in os.listdir(folder_path):
        found = checkpoint_re.match(name)
        if found is None:
            continue
        fold_no, epoch_no = int(found.group(1)), int(found.group(2))
        # Skip other folds and anything not strictly newer than the current best.
        if fold_no != target_fold or epoch_no <= best_epoch:
            continue
        best_epoch, best_name = epoch_no, name

    return os.path.join(folder_path, best_name) if best_name else None

def load_models(folds, device):
    """Build one ``ISICModel`` per fold and load its latest checkpoint.

    Reads ``cfg.model_name`` and ``cfg.model_path`` from the module-level ``cfg``.

    Args:
        folds: Iterable of fold numbers to load.
        device: Device the models are moved to and the weights are mapped onto.

    Returns:
        List of models in ``eval()`` mode, in the order of ``folds``.

    Raises:
        FileNotFoundError: If no checkpoint file exists for one of the folds.
    """
    models = []
    for fold in folds:
        model_w_path = get_latest_epoch_file(cfg.model_path, fold)
        if model_w_path is None:
            # Fail with a clear message instead of handing ``None`` to torch.load.
            raise FileNotFoundError(
                f"No checkpoint for fold {fold} found in {cfg.model_path}"
            )
        model = ISICModel(cfg.model_name)
        model.to(device)
        model.load_state_dict(torch.load(model_w_path, map_location=device))
        model.eval()
        models.append(model)
    return models


@torch.no_grad()  # no gradients are needed anywhere in this function
def ensemble_predict(models, test_loader, device):
    """Average the raw outputs of ``models`` over every batch of ``test_loader``.

    Args:
        models: Iterable of callables mapping an input batch to an output tensor.
        test_loader: Iterable of ``(inputs, target)`` pairs; the target is ignored.
        device: Device the inputs are moved to before the models are called.

    Returns:
        List with one averaged prediction per sample, in loader order. The model
        outputs are averaged as-is; no sigmoid or softmax is applied here.
    """
    all_predictions = []
    for inputs, _ in tqdm(test_loader, desc="Predicting"):
        inputs = inputs.to(device)
        fold_predictions = torch.stack([model(inputs) for model in models])
        avg_predictions = fold_predictions.mean(dim=0)
        # A model ending in ``.squeeze()`` returns a 0-d tensor for a batch of one
        # sample; restore the batch dimension so ``extend`` can iterate over it.
        if avg_predictions.dim() == 0:
            avg_predictions = avg_predictions.unsqueeze(0)
        all_predictions.extend(avg_predictions.cpu().numpy())
    return all_predictions


# === Run the image model on the test images and merge the result into df_test
TEST_HDF5_FILE_PATH = '/kaggle/input/isic-2024-challenge/test-image.hdf5'

# Use CUDA when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Folds whose checkpoints are ensembled
folds = cfg.folds

# One model per fold, each loaded from its latest checkpoint
models = load_models(folds, device)

# Test dataset: images are looked up in the HDF5 file by the ids in df_test's index
test_dataset = ISICDataset(
    hdf5_file=TEST_HDF5_FILE_PATH,
    isic_ids=df_test.index.values,  # df_test is indexed by id_col
    transform=base_transform,
)

# shuffle=False keeps predictions in the same order as df_test.index
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)

# Fold-averaged model outputs, one per test row
predictions = ensemble_predict(models, test_loader, device)

# Wrap the predictions in a frame aligned with df_test's index
temp_df = pd.DataFrame({"image_predict": predictions}, index=df_test.index)

# Add the "image_predict" column to df_test
df_test = df_test.join(temp_df)

# Assignment aligns on the index of both frames
df_subm["target"] = df_test["image_predict"]

# Per-run submission file, named after the run id
df_subm.to_csv(f'submission_{cfg.run_id}.csv')
df_subm.head()

Using device: cuda


Predicting: 100%|██████████| 1/1 [00:00<00:00,  5.65it/s]


Unnamed: 0_level_0,target
isic_id,Unnamed: 1_level_1
ISIC_0015657,-1.637331
ISIC_0015729,-3.406027
ISIC_0015740,-4.912541


# Image model 20240902001446

In [16]:
import os
from pathlib import Path

import numpy as np
import re
import pandas as pd
import polars as pl

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score

from pathlib import Path
from dataclasses import dataclass


@dataclass
class CFG:
    """Settings of image-model run 20240902001446."""
    # Easy to overlook: update every value below when switching to another run!
    run_id = "20240902001446"
    model_name = "vit_small_patch16_224.augreg_in21k_ft_in1k"
    model_path = Path(f"/kaggle/input/output-{run_id}")
    img_size = 224  # 384 or 224
    folds = list(range(5))

    # Derived automatically from model_path.
    oof_path = model_path.joinpath("oof_predictions.csv")

    # --- description of the run
    #   cv:      0.1486832
    #   lb:      (not recorded)
    #   version: (not recorded)
    #   config:  vit_small_patch16_224.augreg_in21k_ft_in1k
    #   date:    20240902001446
    #   desc:    lr1e-4 + warmup + train1:1-val1:10 + upsample-2 + AugumentMore
    #            + Dropoutx1 + CustomHead-64 + weight_decay1e-3
    #   desc:    remap case1
    # ---------------

cfg = CFG()

test_path = Path('/kaggle/input/isic-2024-challenge/test-metadata.csv')
subm_path  = Path('/kaggle/input/isic-2024-challenge/sample_submission.csv')

id_col = 'isic_id'

def read_data(path, cfg):
    """Load a metadata CSV with polars and return it as a pandas frame indexed by ``id_col``.

    Args:
        path: Location of the CSV file.
        cfg: Accepted for call compatibility; the body does not read it.

    Returns:
        pandas.DataFrame whose index is the ``id_col`` column.
    """
    frame = pl.read_csv(path).to_pandas()
    return frame.set_index(id_col)

# Both frames are indexed by isic_id so predictions can be assigned by index later.
df_test = read_data(test_path, cfg)
df_subm = pd.read_csv(subm_path, index_col=id_col)

# === ImageNet inference code

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py
import timm
from torchvision import transforms
from PIL import Image
import io
import albumentations as A
from albumentations.pytorch import ToTensorV2


class ISICDataset(Dataset):
    """Dataset that decodes images stored as encoded bytes in an HDF5 file.

    The HDF5 file is opened lazily, on the first item access in each process,
    instead of in ``__init__``.  An ``h5py.File`` handle created in the parent
    process should not be shared with ``DataLoader`` worker processes (and it
    cannot be pickled when workers are spawned), so every process opens its
    own read-only handle.

    Args:
        hdf5_file: Path (or anything ``h5py.File`` accepts) of the image store.
        isic_ids: Sequence of dataset keys, one per sample.
        targets: Optional sequence of labels aligned with ``isic_ids``.
        transform: Optional callable invoked as ``transform(image=ndarray)``
            that returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, hdf5_file, isic_ids, targets=None, transform=None):
        self.hdf5_path = hdf5_file
        self.hdf5_file = None  # opened on demand by _ensure_open()
        self.isic_ids = isic_ids
        self.targets = targets
        self.transform = transform

        # Fail fast on a missing/unreadable file, as the eager open used to do,
        # but do not keep the handle around.
        with h5py.File(hdf5_file, 'r'):
            pass

    def _ensure_open(self):
        """Return this process's read-only handle, opening it if necessary."""
        if self.hdf5_file is None:
            self.hdf5_file = h5py.File(self.hdf5_path, 'r')
        return self.hdf5_file

    def __getstate__(self):
        # Never ship an open handle to another process; it is reopened there.
        state = self.__dict__.copy()
        state["hdf5_file"] = None
        return state

    def __len__(self):
        return len(self.isic_ids)

    def __getitem__(self, idx):
        """Return ``(image, target)``; target is ``torch.tensor(-1)`` when no targets were given."""
        img_bytes = self._ensure_open()[self.isic_ids[idx]][()]
        img = Image.open(io.BytesIO(img_bytes))
        img = np.array(img)

        if self.transform:
            transformed = self.transform(image=img)
            img = transformed['image']

        target = self.targets[idx] if self.targets is not None else torch.tensor(-1)
        return img, target

    def __del__(self):
        # getattr: __init__ may have failed before the attribute was assigned.
        handle = getattr(self, "hdf5_file", None)
        if handle is not None:
            handle.close()

# Deterministic inference-time preprocessing (albumentations):
# resize to the configured square size, normalise each channel with the given
# mean/std (these are the values commonly used for ImageNet-pretrained
# backbones), then convert to a torch tensor.
base_transform = A.Compose([
    A.Resize(cfg.img_size, cfg.img_size),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

class GeM(nn.Module):
    """Generalised-mean pooling over the last two dimensions of the input.

    Computes ``avg_pool(clamp(x, eps) ** p) ** (1 / p)`` with a pooling window
    that spans the whole of the last two dimensions, so each of those
    dimensions is reduced to size 1.  ``p`` is a learnable parameter.

    Args:
        p: Initial value of the exponent.
        eps: Lower clamp applied before exponentiation.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Apply GeM pooling to ``x`` with exponent ``p`` and clamp ``eps``."""
        # nn.functional is reachable through the ``nn`` alias this cell already
        # imports, so the class no longer depends on a separate ``F`` alias
        # being defined somewhere else in the notebook.
        window = (x.size(-2), x.size(-1))
        pooled = nn.functional.avg_pool2d(x.clamp(min=eps).pow(p), window)
        return pooled.pow(1.0 / p)

    def __repr__(self):
        return f"{self.__class__.__name__}(p={self.p.data.tolist()[0]:.4f}, eps={self.eps})"

class ISICModel(nn.Module):
    """timm backbone with a custom head selected by substring of ``model_name``.

    Two head layouts exist (attribute names are kept stable because
    checkpoints are loaded by state-dict key):

    * linear head  -- ``self.linear`` applied after each dropout in
      ``self.dropouts`` and averaged (efficientnetv2, eva02_small_patch14_336);
      efficientnetv2 additionally pools the feature map with ``self.pooling``.
    * MLP head     -- ``self.feature_extractor`` -> average of
      ``self.dropouts`` -> ``self.classifier`` (all other supported names).

    Args:
        model_name: timm model name; also selects the head.
        num_classes: Output width of the final layer.
        pretrained: Forwarded to ``timm.create_model``.
        checkpoint_path: Forwarded to ``timm.create_model``.
    """

    # Names whose forward pass goes through the MLP head.  "convnextv2" is
    # intentionally generic here, while __init__ only builds heads for the
    # atto and nano variants.
    _MLP_HEAD_TAGS = (
        "eva02_small_patch14_224",
        "eva02_tiny_patch14_224",
        "convnextv2",
        "swinv2",
        "vit_tiny",
        "vit_small",
    )

    def __init__(self, model_name, num_classes=1, pretrained=False, checkpoint_path=None):
        super().__init__()
        self.model_name = model_name
        self.model = timm.create_model(model_name, pretrained=pretrained, checkpoint_path=checkpoint_path)

        if "eva02_small_patch14_336" in self.model_name:
            in_features = self.model.head.in_features
            self.model.head = nn.Identity()
            self._attach_linear_head(in_features, num_classes)
        elif "eva02_small_patch14_224" in self.model_name or "eva02_tiny_patch14_224" in self.model_name:
            # NOTE(review): the backbone head is left in place and the input
            # width is hard-coded to 192 -- confirm this matches what the
            # backbone actually returns for these two models.
            self._attach_mlp_head(192, 32, num_classes, n_dropouts=5, use_gem=False)
        elif "efficientnetv2" in self.model_name:
            in_features = self.model.classifier.in_features
            self.model.classifier = nn.Identity()
            self.pooling = GeM()
            # Custom pooling replaces the backbone's own global pooling.
            self.model.global_pool = nn.Identity()
            self._attach_linear_head(in_features, num_classes)
        elif "convnextv2_atto" in self.model_name:
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self._attach_mlp_head(in_features, 32, num_classes, n_dropouts=5, use_gem=True)
        elif "convnextv2_nano" in self.model_name:
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self._attach_mlp_head(in_features, 256, num_classes, n_dropouts=5, use_gem=True)
        elif "swinv2" in self.model_name:
            # NOTE(review): GeM pools over the last two dimensions; confirm the
            # swinv2 feature layout is compatible with that.
            in_features = self.model.head.fc.in_features
            self.model.head = nn.Identity()
            self._attach_mlp_head(in_features, 256, num_classes, n_dropouts=5, use_gem=True)
        elif "vit_tiny" in self.model_name or "vit_small" in self.model_name:
            in_features = self.model.head.in_features
            # Strip the backbone's final norm / dropout / classifier.
            self.model.norm = nn.Identity()
            self.model.fc_norm = nn.Identity()
            self.model.head_drop = nn.Identity()
            self.model.head = nn.Identity()
            # These two variants use a single dropout, not five.
            self._attach_mlp_head(in_features, 64, num_classes, n_dropouts=1, use_gem=False)

    def _attach_linear_head(self, in_features, num_classes):
        """Register ``linear`` and five ``dropouts`` (multi-sample dropout head)."""
        self.linear = nn.Linear(in_features, num_classes)
        self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])

    def _attach_mlp_head(self, in_features, hidden, num_classes, n_dropouts, use_gem):
        """Register ``feature_extractor``, ``dropouts`` and ``classifier``.

        Layer order inside ``feature_extractor`` is fixed
        ([GeM,] Flatten, Linear, ReLU) so state-dict keys stay unchanged.
        """
        layers = [GeM()] if use_gem else []
        layers += [
            nn.Flatten(),                      # flatten to (batch, features)
            nn.Linear(in_features, hidden),    # new fully connected layer
            nn.ReLU(),                         # activation
        ]
        self.feature_extractor = nn.Sequential(*layers)
        self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(n_dropouts)])
        self.classifier = nn.Linear(hidden, num_classes)  # output layer

    def forward(self, images):
        """Return head outputs with all size-1 dimensions squeezed away.

        Note that ``squeeze()`` turns a batch of one sample with one class into
        a 0-d tensor; callers that iterate over the result must handle that.

        Raises:
            ValueError: if ``model_name`` matches none of the supported heads.
        """
        features = self.model(images)
        name = self.model_name

        if "efficientnetv2" in name or "eva02_small_patch14_336" in name:
            # Only efficientnetv2 registers a custom pooling module.
            pooling = getattr(self, "pooling", None)
            if pooling is not None:
                features = pooling(features).flatten(1)
            # Multi-sample dropout: average the linear head over all dropouts.
            # (The MLP-head code that used to follow here referenced modules
            # these models never create and has been removed.)
            output = torch.stack(
                [self.linear(dropout(features)) for dropout in self.dropouts]
            ).mean(dim=0)
        elif any(tag in name for tag in self._MLP_HEAD_TAGS):
            features = self.feature_extractor(features)
            output = torch.mean(torch.stack([dropout(features) for dropout in self.dropouts]), dim=0)
            output = self.classifier(output)
        else:
            raise ValueError(f"Unsupported model_name for ISICModel: {name!r}")

        return output.squeeze()




def get_latest_epoch_file(folder_path, target_fold):
    """Return the path of the highest-epoch checkpoint saved for ``target_fold``.

    Only file names beginning with ``fold_<fold>_epoch_<epoch>_score_<x.y>.pth``
    are considered; the older ``model_fold_<fold>_epoch_<epoch>.pth`` naming is
    not matched.

    Args:
        folder_path: Directory to scan (not recursive).
        target_fold: Fold number (int) to look for.

    Returns:
        ``os.path.join(folder_path, name)`` for the best match, or ``None``
        when the directory holds no checkpoint for that fold.
    """
    checkpoint_re = re.compile(r"fold_(\d+)_epoch_(\d+)_score_(\d+\.\d+)\.pth")

    best_name, best_epoch = None, -1
    for entry in os.listdir(folder_path):
        parsed = checkpoint_re.match(entry)
        if parsed is None:
            continue
        if int(parsed.group(1)) != target_fold:
            continue
        epoch_no = int(parsed.group(2))
        # Strict comparison: on equal epochs the first listed file wins.
        if epoch_no > best_epoch:
            best_name, best_epoch = entry, epoch_no

    return os.path.join(folder_path, best_name) if best_name else None

def load_models(folds, device):
    """Build one ``ISICModel`` per fold and load its latest checkpoint.

    Reads ``cfg.model_name`` and ``cfg.model_path`` from the module-level
    ``cfg``.  Every returned model is on ``device`` and in eval mode.

    Args:
        folds: Iterable of fold numbers to load.
        device: Target device; also used as ``map_location`` for ``torch.load``.

    Returns:
        List of models, in the order of ``folds``.

    Raises:
        FileNotFoundError: if no checkpoint for a fold exists in ``cfg.model_path``.
    """
    models = []
    for fold in folds:
        # Resolve the checkpoint first so a missing file is reported clearly
        # instead of surfacing as an obscure error from torch.load(None).
        model_w_path = get_latest_epoch_file(cfg.model_path, fold)
        if model_w_path is None:
            raise FileNotFoundError(
                f"No checkpoint for fold {fold} found in {cfg.model_path}"
            )

        model = ISICModel(cfg.model_name)
        model.to(device)
        model.load_state_dict(torch.load(model_w_path, map_location=device))
        model.eval()
        models.append(model)
    return models


@torch.no_grad()  # Apply no_grad to the entire function
def ensemble_predict(models, test_loader, device):
    """Average the outputs of ``models`` over every batch of ``test_loader``.

    The raw model outputs are averaged as they are; no sigmoid or softmax is
    applied here.

    Args:
        models: Iterable of callables mapping an input batch to a tensor.
        test_loader: Iterable yielding ``(inputs, _)`` pairs.
        device: Device the inputs are moved to before calling the models.

    Returns:
        Flat list with one entry per sample, in loader order.
    """
    all_predictions = []
    for inputs, _ in tqdm(test_loader, desc="Predicting"):
        inputs = inputs.to(device)
        fold_predictions = torch.stack([model(inputs) for model in models])
        avg_predictions = fold_predictions.mean(dim=0)
        # A model that squeezes its output returns a 0-d tensor for a batch of
        # one sample; atleast_1d keeps that batch iterable for extend().
        all_predictions.extend(np.atleast_1d(avg_predictions.cpu().numpy()))
    return all_predictions


# === Run image-model inference on the test data and merge the result into df_test
TEST_HDF5_FILE_PATH = '/kaggle/input/isic-2024-challenge/test-image.hdf5'

# Set up CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Folds whose checkpoints are ensembled
folds = cfg.folds

models = load_models(folds, device)

# Prepare the test dataset; no targets are passed, so each item carries a -1 placeholder.
test_dataset = ISICDataset(
    hdf5_file=TEST_HDF5_FILE_PATH,
    isic_ids=df_test.index.values,  # df_test is indexed by isic_id
    transform=base_transform,
)

# Create test data loader.
# shuffle=False keeps batches in dataset order, which is what allows the
# predictions to be paired positionally with df_test.index below.
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)

# Run predictions (averaged over the fold models)
predictions = ensemble_predict(models, test_loader, device)

# Create a new DataFrame with predictions, indexed like df_test
temp_df = pd.DataFrame({"image_predict": predictions}, index=df_test.index)

# Join the predictions to df_test.
# NOTE(review): this rebinds df_test; re-running the statement without
# reloading df_test will hit an overlapping "image_predict" column.
df_test = df_test.join(temp_df)

# Index-aligned assignment: both frames are indexed by isic_id.
df_subm["target"] = df_test["image_predict"]

# Persist per-run predictions; preprocess() later reads submission_<run_id>.csv.
# The values shown in the cell output are negative, i.e. they are not probabilities.
df_subm.to_csv(f'submission_{cfg.run_id}.csv')
df_subm.head()

Using device: cuda


Predicting: 100%|██████████| 1/1 [00:00<00:00,  5.42it/s]


Unnamed: 0_level_0,target
isic_id,Unnamed: 1_level_1
ISIC_0015657,-2.898454
ISIC_0015729,-4.748341
ISIC_0015740,-6.205088


# DATA PREPROCESS

In [17]:
# Competition input directory and the three CSV files read below.
# test_path, subm_path and id_col rebind names the image-model cells also define.
root = Path('/kaggle/input/isic-2024-challenge')

train_path = root / 'train-metadata.csv'
test_path = root / 'test-metadata.csv'
subm_path = root / 'sample_submission.csv'

id_col = 'isic_id'
target_col = 'target'
group_col = 'patient_id'

err = 1e-5             # small constant added to some denominators in read_data()
sampling_ratio = 0.01  # not referenced in the cells visible here
seed = 42              # not referenced in the cells visible here

# Raw numeric metadata columns used as features as-is.
# read_data() also derives a per-patient normalised copy of each of them.
num_cols = [
    'age_approx',                        # Approximate age of patient at time of imaging.
    'clin_size_long_diam_mm',            # Maximum diameter of the lesion (mm).+
    'tbp_lv_A',                          # A inside  lesion.+
    'tbp_lv_Aext',                       # A outside lesion.+
    'tbp_lv_B',                          # B inside  lesion.+
    'tbp_lv_Bext',                       # B outside lesion.+
    'tbp_lv_C',                          # Chroma inside  lesion.+
    'tbp_lv_Cext',                       # Chroma outside lesion.+
    'tbp_lv_H',                          # Hue inside the lesion; calculated as the angle of A* and B* in LAB* color space. Typical values range from 25 (red) to 75 (brown).+
    'tbp_lv_Hext',                       # Hue outside lesion.+
    'tbp_lv_L',                          # L inside lesion.+
    'tbp_lv_Lext',                       # L outside lesion.+
    'tbp_lv_areaMM2',                    # Area of lesion (mm^2).+
    'tbp_lv_area_perim_ratio',           # Border jaggedness, the ratio between lesions perimeter and area. Circular lesions will have low values; irregular shaped lesions will have higher values. Values range 0-10.+
    'tbp_lv_color_std_mean',             # Color irregularity, calculated as the variance of colors within the lesion's boundary.
    'tbp_lv_deltaA',                     # Average A contrast (inside vs. outside lesion).+
    'tbp_lv_deltaB',                     # Average B contrast (inside vs. outside lesion).+
    'tbp_lv_deltaL',                     # Average L contrast (inside vs. outside lesion).+
    'tbp_lv_deltaLB',                    #
    'tbp_lv_deltaLBnorm',                # Contrast between the lesion and its immediate surrounding skin. Low contrast lesions tend to be faintly visible such as freckles; high contrast lesions tend to be those with darker pigment. Calculated as the average delta LB of the lesion relative to its immediate background in LAB* color space. Typical values range from 5.5 to 25.+
    'tbp_lv_eccentricity',               # Eccentricity.+
    'tbp_lv_minorAxisMM',                # Smallest lesion diameter (mm).+
    'tbp_lv_nevi_confidence',            # Nevus confidence score (0-100 scale) is a convolutional neural network classifier estimated probability that the lesion is a nevus. The neural network was trained on approximately 57,000 lesions that were classified and labeled by a dermatologist.+,++
    'tbp_lv_norm_border',                # Border irregularity (0-10 scale); the normalized average of border jaggedness and asymmetry.+
    'tbp_lv_norm_color',                 # Color variation (0-10 scale); the normalized average of color asymmetry and color irregularity.+
    'tbp_lv_perimeterMM',                # Perimeter of lesion (mm).+
    'tbp_lv_radial_color_std_max',       # Color asymmetry, a measure of asymmetry of the spatial distribution of color within the lesion. This score is calculated by looking at the average standard deviation in LAB* color space within concentric rings originating from the lesion center. Values range 0-10.+
    'tbp_lv_stdL',                       # Standard deviation of L inside  lesion.+
    'tbp_lv_stdLExt',                    # Standard deviation of L outside lesion.+
    'tbp_lv_symm_2axis',                 # Border asymmetry; a measure of asymmetry of the lesion's contour about an axis perpendicular to the lesion's most symmetric axis. Lesions with two axes of symmetry will therefore have low scores (more symmetric), while lesions with only one or zero axes of symmetry will have higher scores (less symmetric). This score is calculated by comparing opposite halves of the lesion contour over many degrees of rotation. The angle where the halves are most similar identifies the principal axis of symmetry, while the second axis of symmetry is perpendicular to the principal axis. Border asymmetry is reported as the asymmetry value about this second axis. Values range 0-10.+
    'tbp_lv_symm_2axis_angle',           # Lesion border asymmetry angle.+
    'tbp_lv_x',                          # X-coordinate of the lesion on 3D TBP.+
    'tbp_lv_y',                          # Y-coordinate of the lesion on 3D TBP.+
    'tbp_lv_z',                          # Z-coordinate of the lesion on 3D TBP.+
]

# Engineered numeric features; each name must match a column created in
# read_data().  The trailing comments summarise the formula used there.
new_num_cols = [
    'lesion_size_ratio',             # tbp_lv_minorAxisMM / clin_size_long_diam_mm
    'lesion_shape_index',            # tbp_lv_areaMM2 / tbp_lv_perimeterMM ** 2
    'hue_contrast',                  # abs(tbp_lv_H - tbp_lv_Hext)
    'luminance_contrast',            # abs(tbp_lv_L - tbp_lv_Lext)
    'lesion_color_difference',       # sqrt(tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2)
    'border_complexity',             # tbp_lv_norm_border + tbp_lv_symm_2axis
    'color_uniformity',              # tbp_lv_color_std_mean / (tbp_lv_radial_color_std_max + err)

    'position_distance_3d',          # sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'perimeter_to_area_ratio',       # tbp_lv_perimeterMM / tbp_lv_areaMM2
    'area_to_perimeter_ratio',       # tbp_lv_areaMM2 / tbp_lv_perimeterMM
    'lesion_visibility_score',       # tbp_lv_deltaLBnorm + tbp_lv_norm_color
    'symmetry_border_consistency',   # tbp_lv_symm_2axis * tbp_lv_norm_border
    'consistency_symmetry_border',   # tbp_lv_symm_2axis * tbp_lv_norm_border / (tbp_lv_symm_2axis + tbp_lv_norm_border)

    'color_consistency',             # tbp_lv_stdL / tbp_lv_Lext
    'consistency_color',             # tbp_lv_stdL * tbp_lv_Lext / (tbp_lv_stdL + tbp_lv_Lext)
    'size_age_interaction',          # clin_size_long_diam_mm * age_approx
    'hue_color_std_interaction',     # tbp_lv_H * tbp_lv_color_std_mean
    'lesion_severity_index',         # (tbp_lv_norm_border + tbp_lv_norm_color + tbp_lv_eccentricity) / 3
    'shape_complexity_index',        # border_complexity + lesion_shape_index
    'color_contrast_index',          # tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL + tbp_lv_deltaLBnorm

    'log_lesion_area',               # log(tbp_lv_areaMM2 + 1)
    'normalized_lesion_size',        # clin_size_long_diam_mm / age_approx
    'mean_hue_difference',           # (tbp_lv_H + tbp_lv_Hext) / 2
    'std_dev_contrast',              # sqrt((tbp_lv_deltaA ** 2 + tbp_lv_deltaB ** 2 + tbp_lv_deltaL ** 2) / 3)
    'color_shape_composite_index',   # (tbp_lv_color_std_mean + tbp_lv_area_perim_ratio + tbp_lv_symm_2axis) / 3
    'lesion_orientation_3d',         # arctan2(tbp_lv_y, tbp_lv_x)
    'overall_color_difference',      # (tbp_lv_deltaA + tbp_lv_deltaB + tbp_lv_deltaL) / 3

    'symmetry_perimeter_interaction',# tbp_lv_symm_2axis * tbp_lv_perimeterMM
    'comprehensive_lesion_index',    # (tbp_lv_area_perim_ratio + tbp_lv_eccentricity + tbp_lv_norm_color + tbp_lv_symm_2axis) / 4
    'color_variance_ratio',          # tbp_lv_color_std_mean / tbp_lv_stdLExt
    'border_color_interaction',      # tbp_lv_norm_border * tbp_lv_norm_color
    'border_color_interaction_2',    # tbp_lv_norm_border * tbp_lv_norm_color / (tbp_lv_norm_border + tbp_lv_norm_color)
    'size_color_contrast_ratio',     # clin_size_long_diam_mm / tbp_lv_deltaLBnorm
    'age_normalized_nevi_confidence',# tbp_lv_nevi_confidence / age_approx
    'age_normalized_nevi_confidence_2',  # sqrt(clin_size_long_diam_mm ** 2 + age_approx ** 2) -- despite the name, nevi confidence is not used
    'color_asymmetry_index',         # tbp_lv_radial_color_std_max * tbp_lv_symm_2axis

    'volume_approximation_3d',       # tbp_lv_areaMM2 * sqrt(tbp_lv_x ** 2 + tbp_lv_y ** 2 + tbp_lv_z ** 2)
    'color_range',                   # abs(tbp_lv_L - tbp_lv_Lext) + abs(tbp_lv_A - tbp_lv_Aext) + abs(tbp_lv_B - tbp_lv_Bext)
    'shape_color_consistency',       # tbp_lv_eccentricity * tbp_lv_color_std_mean
    'border_length_ratio',           # tbp_lv_perimeterMM / (2 * pi * sqrt(tbp_lv_areaMM2 / pi))
    'age_size_symmetry_index',       # age_approx * clin_size_long_diam_mm * tbp_lv_symm_2axis
    'index_age_size_symmetry',       # age_approx * tbp_lv_areaMM2 * tbp_lv_symm_2axis
]

# Categorical metadata columns; preprocess() one-hot encodes them and then
# rebinds cat_cols to the generated onehot_* names.
cat_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution']
# Per-patient normalised counterpart of every numeric feature (see read_data()).
norm_cols = [f'{col}_patient_norm' for col in num_cols + new_num_cols]
special_cols = ['count_per_patient']
# Earlier image-feature selections, kept for reference:
# image_cols = ["target_3","target_effnetv1b0","target_eva02"]
#image_cols = ["target_3","target_effnetv1b0"]
# image_cols = ["target_3","target_effnetv1b0","target_eva02", "image_20240818021241"]
# image_cols = ["image_diff-20240823202308", "image_20240821002557"]
# Image-model prediction columns; the names follow the image_<run_id> columns
# that preprocess() adds to the frames.
image_cols = [
#     "image_20240824125307",
#     "image_20240825010806",
#     "image_20240826023642",
#     "image_20240827012429",
    "image_20240827234748",
    "image_20240830205516",
    "image_20240831025049",
    "image_20240902001446",
]

#norm_cols += image_cols
# NOTE(review): image_cols is not part of feature_cols at this point -- confirm
# it is added wherever the model's feature list is finalised.
# feature_cols is later mutated in place by preprocess().
feature_cols = num_cols + new_num_cols + cat_cols + norm_cols + special_cols

In [18]:
def read_data(path):
    """Read a metadata CSV and return a pandas frame with all engineered features.

    This single-argument definition replaces the two-argument ``read_data``
    defined in the image-model cells once this cell has run.

    Steps, in order (later steps use columns created by earlier ones):
      1. ``age_approx`` is cast to string, ``'NA'`` is replaced by NaN and the
         column is cast to Float64.
      2. NaN values in every Float64 column are filled with that column's
         median, computed on the file being read.
      3. The engineered columns listed in ``new_num_cols`` are created, plus
         ``combined_anatomical_site`` (which appears in none of the feature lists).
      4. Every column in ``num_cols + new_num_cols`` gets a per-patient
         standardised copy named ``<col>_patient_norm``.
      5. ``count_per_patient`` holds the number of rows per ``patient_id``.
      6. The columns in ``cat_cols`` are cast to Categorical.

    Reads the module-level ``err``, ``num_cols``, ``new_num_cols`` and
    ``cat_cols``.  Because ``preprocess()`` rebinds ``cat_cols`` to one-hot
    column names, this function has to be called before ``preprocess()``.

    Args:
        path: Location of the metadata CSV.

    Returns:
        pandas.DataFrame with a default index (``isic_id`` stays a column).
    """
    return (
        pl.read_csv(path)
        .with_columns(
            pl.col('age_approx').cast(pl.String).replace('NA', np.nan).cast(pl.Float64),
        )
        .with_columns(
            # NOTE(review): fill_nan targets NaN values; confirm that missing CSV
            # entries are represented as NaN (not null) in these columns.
            pl.col(pl.Float64).fill_nan(pl.col(pl.Float64).median()), # You may want to impute test data with train
        )
        .with_columns(
            lesion_size_ratio              = pl.col('tbp_lv_minorAxisMM') / pl.col('clin_size_long_diam_mm'),
            lesion_shape_index             = pl.col('tbp_lv_areaMM2') / (pl.col('tbp_lv_perimeterMM') ** 2),
            hue_contrast                   = (pl.col('tbp_lv_H') - pl.col('tbp_lv_Hext')).abs(),
            luminance_contrast             = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs(),
            lesion_color_difference        = (pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2).sqrt(),
            border_complexity              = pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_symm_2axis'),
            color_uniformity               = pl.col('tbp_lv_color_std_mean') / (pl.col('tbp_lv_radial_color_std_max') + err),
        )
        .with_columns(
            position_distance_3d           = (pl.col('tbp_lv_x') ** 2 + pl.col('tbp_lv_y') ** 2 + pl.col('tbp_lv_z') ** 2).sqrt(),
            perimeter_to_area_ratio        = pl.col('tbp_lv_perimeterMM') / pl.col('tbp_lv_areaMM2'),
            area_to_perimeter_ratio        = pl.col('tbp_lv_areaMM2') / pl.col('tbp_lv_perimeterMM'),
            lesion_visibility_score        = pl.col('tbp_lv_deltaLBnorm') + pl.col('tbp_lv_norm_color'),
            combined_anatomical_site       = pl.col('anatom_site_general') + '_' + pl.col('tbp_lv_location'),
            symmetry_border_consistency    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border'),
            consistency_symmetry_border    = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_norm_border') / (pl.col('tbp_lv_symm_2axis') + pl.col('tbp_lv_norm_border')),
        )
        .with_columns(
            color_consistency              = pl.col('tbp_lv_stdL') / pl.col('tbp_lv_Lext'),
            consistency_color              = pl.col('tbp_lv_stdL') * pl.col('tbp_lv_Lext') / (pl.col('tbp_lv_stdL') + pl.col('tbp_lv_Lext')),
            size_age_interaction           = pl.col('clin_size_long_diam_mm') * pl.col('age_approx'),
            hue_color_std_interaction      = pl.col('tbp_lv_H') * pl.col('tbp_lv_color_std_mean'),
            lesion_severity_index          = (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_eccentricity')) / 3,
            # Uses border_complexity and lesion_shape_index created two steps above.
            shape_complexity_index         = pl.col('border_complexity') + pl.col('lesion_shape_index'),
            color_contrast_index           = pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL') + pl.col('tbp_lv_deltaLBnorm'),
        )
        .with_columns(
            log_lesion_area                = (pl.col('tbp_lv_areaMM2') + 1).log(),
            normalized_lesion_size         = pl.col('clin_size_long_diam_mm') / pl.col('age_approx'),
            mean_hue_difference            = (pl.col('tbp_lv_H') + pl.col('tbp_lv_Hext')) / 2,
            std_dev_contrast               = ((pl.col('tbp_lv_deltaA') ** 2 + pl.col('tbp_lv_deltaB') ** 2 + pl.col('tbp_lv_deltaL') ** 2) / 3).sqrt(),
            color_shape_composite_index    = (pl.col('tbp_lv_color_std_mean') + pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_symm_2axis')) / 3,
            lesion_orientation_3d          = pl.arctan2(pl.col('tbp_lv_y'), pl.col('tbp_lv_x')),
            overall_color_difference       = (pl.col('tbp_lv_deltaA') + pl.col('tbp_lv_deltaB') + pl.col('tbp_lv_deltaL')) / 3,
        )
        .with_columns(
            symmetry_perimeter_interaction = pl.col('tbp_lv_symm_2axis') * pl.col('tbp_lv_perimeterMM'),
            comprehensive_lesion_index     = (pl.col('tbp_lv_area_perim_ratio') + pl.col('tbp_lv_eccentricity') + pl.col('tbp_lv_norm_color') + pl.col('tbp_lv_symm_2axis')) / 4,
            color_variance_ratio           = pl.col('tbp_lv_color_std_mean') / pl.col('tbp_lv_stdLExt'),
            border_color_interaction       = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color'),
            border_color_interaction_2     = pl.col('tbp_lv_norm_border') * pl.col('tbp_lv_norm_color') / (pl.col('tbp_lv_norm_border') + pl.col('tbp_lv_norm_color')),
            size_color_contrast_ratio      = pl.col('clin_size_long_diam_mm') / pl.col('tbp_lv_deltaLBnorm'),
            age_normalized_nevi_confidence = pl.col('tbp_lv_nevi_confidence') / pl.col('age_approx'),
            # Despite its name this is sqrt(size^2 + age^2); nevi confidence is not involved.
            age_normalized_nevi_confidence_2 = (pl.col('clin_size_long_diam_mm')**2 + pl.col('age_approx')**2).sqrt(),
            color_asymmetry_index          = pl.col('tbp_lv_radial_color_std_max') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            volume_approximation_3d        = pl.col('tbp_lv_areaMM2') * (pl.col('tbp_lv_x')**2 + pl.col('tbp_lv_y')**2 + pl.col('tbp_lv_z')**2).sqrt(),
            color_range                    = (pl.col('tbp_lv_L') - pl.col('tbp_lv_Lext')).abs() + (pl.col('tbp_lv_A') - pl.col('tbp_lv_Aext')).abs() + (pl.col('tbp_lv_B') - pl.col('tbp_lv_Bext')).abs(),
            shape_color_consistency        = pl.col('tbp_lv_eccentricity') * pl.col('tbp_lv_color_std_mean'),
            border_length_ratio            = pl.col('tbp_lv_perimeterMM') / (2 * np.pi * (pl.col('tbp_lv_areaMM2') / np.pi).sqrt()),
            age_size_symmetry_index        = pl.col('age_approx') * pl.col('clin_size_long_diam_mm') * pl.col('tbp_lv_symm_2axis'),
            index_age_size_symmetry        = pl.col('age_approx') * pl.col('tbp_lv_areaMM2') * pl.col('tbp_lv_symm_2axis'),
        )
        .with_columns(
            # Per-patient standardisation: (value - patient mean) / (patient std + err).
            ((pl.col(col) - pl.col(col).mean().over('patient_id')) / (pl.col(col).std().over('patient_id') + err)).alias(f'{col}_patient_norm') for col in (num_cols + new_num_cols)
        )
        .with_columns(
            count_per_patient = pl.col('isic_id').count().over('patient_id'),
        )
        .with_columns(
            pl.col(cat_cols).cast(pl.Categorical),
        )
        .to_pandas()
#         .set_index(id_col)
    )

In [19]:
def _attach_image_predictions(df_train, df_test, run_id):
    """Add the ``image_<run_id>`` column of one image model to both frames.

    Train values come from ``/kaggle/input/output-<run_id>/oof_predictions.csv``
    (column ``oof_prediction``); test values come from
    ``submission_<run_id>.csv`` in the working directory (column ``target``).

    Rows are paired by position after resetting every index.
    NOTE(review): this assumes both prediction files list rows in the same
    order as the corresponding metadata frame -- confirm against the code that
    writes them.
    (A per-fold MinMax scaling of the OOF predictions was tried at this point
    and is currently disabled.)
    """
    column = f"image_{run_id}"

    oof = pd.read_csv(f"/kaggle/input/output-{run_id}/oof_predictions.csv").reset_index(drop=True)
    df_train = df_train.reset_index(drop=True)
    df_train[column] = oof["oof_prediction"]

    test_pred = pd.read_csv(f"submission_{run_id}.csv").reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)
    df_test[column] = test_pred["target"]

    return df_train, df_test


def preprocess(df_train, df_test, image_run_ids=(
        "20240827234748",
        "20240830205516",
        "20240831025049",
        "20240902001446",
)):
    """One-hot encode the categorical columns and attach image-model predictions.

    Side effects on module-level state (the function is therefore not
    idempotent -- call it once per freshly built frames and column lists):
      * ``feature_cols`` is mutated in place: the original ``cat_cols`` are
        removed and the generated ``onehot_<i>`` names appended.
      * ``cat_cols`` is rebound to the ``onehot_<i>`` names.
    The passed frames also receive the one-hot columns in place.

    Args:
        df_train: Training frame; the encoder is fitted on it.
        df_test: Test frame; categories unseen in training are ignored.
        image_run_ids: Run ids of the image models whose predictions are added
            as ``image_<run_id>`` columns, in this order.

    Returns:
        ``(df_train, df_test)`` with fresh default indices.
    """
    global cat_cols

    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
    encoder.fit(df_train[cat_cols])

    new_cat_cols = [f'onehot_{i}' for i in range(len(encoder.get_feature_names_out()))]

    df_train[new_cat_cols] = encoder.transform(df_train[cat_cols])
    df_train[new_cat_cols] = df_train[new_cat_cols].astype('category')

    df_test[new_cat_cols] = encoder.transform(df_test[cat_cols])
    df_test[new_cat_cols] = df_test[new_cat_cols].astype('category')

    # One pass per image model instead of a pasted block per run id.
    for run_id in image_run_ids:
        df_train, df_test = _attach_image_predictions(df_train, df_test, run_id)

    # Swap the raw categorical names for their one-hot replacements.
    for col in cat_cols:
        feature_cols.remove(col)

    feature_cols.extend(new_cat_cols)
    cat_cols = new_cat_cols

    return df_train, df_test

In [20]:
def custom_metric(y_hat, y_true):
    """Partial AUC above a minimum true-positive rate of 0.80.

    Labels and scores are both flipped so that the "TPR >= min_tpr" region
    becomes the "FPR <= 1 - min_tpr" region that ``roc_auc_score`` can
    integrate through its ``max_fpr`` argument. The standardized value it
    returns is then mapped back to an un-normalized area.

    Args:
        y_hat: iterable of predicted positive-class scores.
        y_true: array-like of 0/1 labels supporting ``y_true - 1``
            (e.g. ``numpy.ndarray`` or ``pandas.Series``).

    Returns:
        The un-normalized partial AUC as a float.
    """
    min_tpr = 0.80
    max_fpr = abs(1 - min_tpr)

    # Swap the classes: positives become 0, negatives become 1.
    flipped_truth = abs(y_true - 1)
    # Mirror the scores so they still rank the flipped labels correctly.
    flipped_scores = np.array([1.0 - score for score in y_hat])

    standardized_auc = roc_auc_score(flipped_truth, flipped_scores, max_fpr=max_fpr)

    # Undo the standardization applied by roc_auc_score for max_fpr:
    # 0.5 -> lowest_area (chance level), 1.0 -> max_fpr (perfect ranking).
    lowest_area = 0.5 * max_fpr**2
    area_span = max_fpr - lowest_area
    return lowest_area + area_span / (1.0 - 0.5) * (standardized_auc - 0.5)

In [21]:
# Submission template, indexed by the id column.
df_subm = pd.read_csv(subm_path, index_col=id_col)

# Load both splits and run the shared preprocessing on them together.
df_train, df_test = preprocess(read_data(train_path), read_data(test_path))

# Treat an empty iddx_2 string as missing so that .notna() checks work on it.
df_train["iddx_2"] = df_train["iddx_2"].replace({"": np.nan})

In [22]:
# Attach the precomputed fold assignment to every training row.
df_fold = pd.read_csv("/kaggle/input/df-fold/df_fold.csv")
df_train = df_train.merge(df_fold, on="isic_id", how="left")

# Summarise how many distinct patients ended up in each fold.
total_patients = df_train["patient_id"].nunique()
fold_summary = df_train.groupby("fold")["patient_id"].nunique().to_dict()

print("Fold Summary (patients per fold):")
for fold, count in fold_summary.items():
    if fold == -1:
        # -1 is the initialization value, not a real fold.
        continue
    print(f"Fold {fold}: {count} patients")
print(f"Total patients: {total_patients}")

Fold Summary (patients per fold):
Fold 0: 206 patients
Fold 1: 209 patients
Fold 2: 208 patients
Fold 3: 209 patients
Fold 4: 210 patients
Total patients: 1042


In [23]:
# Additional filter: collect target=0 rows whose diagnosis is ambiguous
# (iddx_1 == "Indeterminate", or any iddx_2 present). The resulting ids are
# stored in `exclude_isic_ids` for later cells to filter on.
print("-"*20)
print("Indeterminate の数")
print(df_train[df_train["iddx_1"]=="Indeterminate"].groupby("fold")["target"].value_counts())

print("-"*20)
print("iddx_2が存在する数")
print(df_train[df_train["iddx_2"].notna()].groupby("fold")["target"].value_counts())

# Build one boolean mask instead of reusing a variable named `filter`, which
# shadowed the Python builtin for every later cell of the notebook.
is_negative = df_train["target"] == 0
is_ambiguous = (df_train["iddx_1"] == "Indeterminate") | df_train["iddx_2"].notna()
# .unique() keeps first-seen order, so the list is deterministic across runs
# (list(set(...)) was not).
exclude_isic_ids = df_train.loc[is_negative & is_ambiguous, "isic_id"].unique().tolist()

# Sanity checks on the rows that remain after the exclusion.
df_train_kept = df_train[~df_train["isic_id"].isin(exclude_isic_ids)]

print("-"*20)
print("処理の妥当性を確認（target=0におけるIndeterminateが消えていることを確認）")
print(df_train_kept.groupby(["fold", "target"])["iddx_1"].value_counts())

print("-"*20)
print("処理の妥当性を確認（target=0におけるIddx_2が消えていることを確認）")
print(df_train_kept.groupby(["fold", "target"])["iddx_2"].value_counts())

--------------------
Indeterminate の数
fold  target
0     0         17
1     0         32
2     0         12
3     0         25
4     0         28
Name: count, dtype: int64
--------------------
iddx_2が存在する数
fold  target
0     0         128
      1          77
1     0         142
      1          78
2     0         130
      1          80
3     0         141
      1          80
4     0         134
      1          78
Name: count, dtype: int64
--------------------
処理の妥当性を確認（target=0におけるIndeterminateが消えていることを確認）
fold  target  iddx_1   
0     0       Benign       80009
      1       Malignant       77
1     0       Benign       79992
      1       Malignant       78
2     0       Benign       80001
      1       Malignant       80
3     0       Benign       79990
      1       Malignant       80
4     0       Benign       79999
      1       Malignant       78
Name: count, dtype: int64
--------------------
処理の妥当性を確認（target=0におけるIddx_2が消えていることを確認）
fold  target  iddx_2                        

In [24]:
# Preview the first training rows after preprocessing and the fold merge.
df_train.head()

Unnamed: 0,isic_id,target,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,...,onehot_42,onehot_43,onehot_44,onehot_45,onehot_46,image_20240827234748,image_20240830205516,image_20240831025049,image_20240902001446,fold
0,ISIC_0015670,0,IP_1235828,60.0,male,lower extremity,3.04,TBP tile: close-up,3D: white,20.244422,...,0,0,1,0,0,-1.826628,-4.128582,-3.712779,-2.799356,3
1,ISIC_0015845,0,IP_8170065,60.0,male,head/neck,1.1,TBP tile: close-up,3D: white,31.71257,...,0,0,1,0,0,-3.171355,-0.40982,-0.458478,0.351795,0
2,ISIC_0015864,0,IP_6724798,60.0,male,posterior torso,3.4,TBP tile: close-up,3D: XP,22.57583,...,0,0,1,0,0,-4.405538,-5.672004,-5.391024,-6.028082,4
3,ISIC_0015902,0,IP_4111386,65.0,male,anterior torso,3.22,TBP tile: close-up,3D: XP,14.242329,...,0,0,0,0,0,-3.528047,-3.939052,-2.01771,-2.314598,1
4,ISIC_0024200,0,IP_8313778,55.0,male,anterior torso,2.73,TBP tile: close-up,3D: white,24.72552,...,0,0,1,0,0,-1.711912,-4.667692,-1.609399,-2.456065,0


In [25]:
# Preview the first test rows after preprocessing.
df_test.head()

Unnamed: 0,isic_id,patient_id,age_approx,sex,anatom_site_general,clin_size_long_diam_mm,image_type,tbp_tile_type,tbp_lv_A,tbp_lv_Aext,...,onehot_41,onehot_42,onehot_43,onehot_44,onehot_45,onehot_46,image_20240827234748,image_20240830205516,image_20240831025049,image_20240902001446
0,ISIC_0015657,IP_6074337,45.0,male,posterior torso,2.7,TBP tile: close-up,3D: XP,22.80433,20.00727,...,0,0,0,1,0,0,-0.501967,-2.714859,-1.637331,-2.898454
1,ISIC_0015729,IP_1664139,35.0,female,lower extremity,2.52,TBP tile: close-up,3D: XP,16.64867,9.657964,...,0,0,1,0,0,0,-4.002757,-5.223221,-3.406027,-4.748341
2,ISIC_0015740,IP_7142616,65.0,male,posterior torso,3.16,TBP tile: close-up,3D: XP,24.25384,19.93738,...,0,0,0,0,0,0,-3.307847,-4.123434,-4.912541,-6.205088


In [26]:
# One-hot columns flagged as least important on the first run.
least_important_features = ['onehot_32', 'onehot_6', 'onehot_33', 'onehot_30', 'onehot_26', 'onehot_22', 'onehot_36', 'onehot_4']
# A second batch was detected after removing the first one and also improved
# the CV score, but it is currently disabled:
#least_important_features_2 = ['onehot_17', 'onehot_42', 'onehot_29', 'onehot_13', 'onehot_25']
#least_important_features += least_important_features_2

# errors="ignore" plus the filtering below keep this cell idempotent:
# re-running it no longer fails on columns that are already gone.
df_train = df_train.drop(columns=least_important_features, errors="ignore")
for _column_list in (cat_cols, feature_cols):
    # Slice assignment mutates the existing list objects in place, as the
    # original .remove() calls did, so any alias of these lists stays in sync.
    _column_list[:] = [col for col in _column_list if col not in least_important_features]

# MODEL INITIALIZATION

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
import copy

# Tabular-only feature list, used by the pipelines that have a SelectColumns step.
# Filtering (instead of copying) keeps this cell idempotent: on a re-run the
# image columns already appended to feature_cols are not copied into this list.
feature_cols_without_image_cols = [col for col in feature_cols if col not in image_cols]
# Rebuild feature_cols in place as "tabular + image" so image columns are
# appended exactly once no matter how many times the cell is executed.
feature_cols[:] = feature_cols_without_image_cols + list(image_cols)

class SelectColumns(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that keeps only a given set of columns."""

    def __init__(self, columns):
        # Stored under the constructor argument's name, unmodified.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured columns."""
        selected = X[self.columns]
        return selected

In [28]:
# Shared resampling / reproducibility settings. They are assigned before
# lgb_params because the dict reads `seed`; the original order only worked if
# `seed` had already been defined by an earlier cell.
seed = 42
sampling_ratio = 0.01

lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           200,
    'boosting_type':    'gbdt',
    'random_state':     seed,
    'lambda_l1':        0.08758718919397321,
    'lambda_l2':        0.0039689175176025465,
    'learning_rate':    0.03231007103195577,
    'max_depth':        4,
    'num_leaves':       103,
    'colsample_bytree': 0.8329551585827726,
    'colsample_bynode': 0.4025961355653304,
    'bagging_fraction': 0.7738954452473223,
    'bagging_freq':     4,
    'min_data_in_leaf': 85,
    'scale_pos_weight': 2.7984184778875543,
}

# LightGBM on tabular features only (image-model columns are filtered out).
# Resampling: oversample the minority class to a 0.003 minority/majority
# ratio, then undersample the majority class down to a `sampling_ratio` ratio.
lgb_model1 = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('filter', SelectColumns(feature_cols_without_image_cols)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

In [29]:
# Shared resampling / reproducibility settings. They are assigned before
# lgb_params because the dict reads `seed`; with the original order this cell
# depended on `seed` leaking in from a previously executed cell.
seed = 42
sampling_ratio = 0.01

lgb_params = {
    'objective':        'binary',
    'verbosity':        -1,
    'n_iter':           200,
    'boosting_type':    'gbdt',
    'random_state':     seed,
    'lambda_l1':        0.08758718919397321,
    'lambda_l2':        0.0039689175176025465,
    'learning_rate':    0.03231007103195577,
    'max_depth':        4,
    'num_leaves':       103,
    'colsample_bytree': 0.8329551585827726,
    'colsample_bynode': 0.4025961355653304,
    'bagging_fraction': 0.7738954452473223,
    'bagging_freq':     4,
    'min_data_in_leaf': 85,
    'scale_pos_weight': 2.7984184778875543,
}

# LightGBM on whatever columns it is given (no column filter step).
# Resampling: oversample the minority class to a 0.003 minority/majority
# ratio, then undersample the majority class down to a `sampling_ratio` ratio.
lgb_model2 = Pipeline([
    ('sampler_1', RandomOverSampler(sampling_strategy= 0.003 , random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', lgb.LGBMClassifier(**lgb_params)),
])

In [30]:
# CatBoost hyper-parameters (shared shape with the second CatBoost pipeline).
cb_params = dict(
    loss_function='Logloss',
    iterations=250,
    verbose=False,
    random_state=seed,
    max_depth=7,
    learning_rate=0.06936242010150652,
    scale_pos_weight=2.6149345838209532,
    l2_leaf_reg=6.216113851699493,
    subsample=0.6249261779711819,
    min_data_in_leaf=24,
    cat_features=cat_cols,
)

# CatBoost on tabular features only: resample, drop the image-model columns,
# then classify.
cb_model1 = Pipeline(steps=[
    ('sampler_1', RandomOverSampler(sampling_strategy=0.003, random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('filter', SelectColumns(feature_cols_without_image_cols)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

In [31]:
# CatBoost hyper-parameters for the pipeline without a column filter.
cb_params = dict(
    loss_function='Logloss',
    iterations=250,
    verbose=False,
    random_state=seed,
    max_depth=7,
    learning_rate=0.06936242010150652,
    scale_pos_weight=2.6149345838209532,
    l2_leaf_reg=6.216113851699493,
    subsample=0.6249261779711819,
    min_data_in_leaf=24,
    cat_features=cat_cols,
)

# CatBoost on every column it is given: resample, then classify.
cb_model2 = Pipeline(steps=[
    ('sampler_1', RandomOverSampler(sampling_strategy=0.003, random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', cb.CatBoostClassifier(**cb_params)),
])

In [32]:
# XGBoost hyper-parameters (shared shape with the second XGBoost pipeline).
xgb_params = dict(
    enable_categorical=True,
    tree_method='hist',
    random_state=seed,
    learning_rate=0.08501257473292347,
    alpha=0.6779926606782505,
    max_depth=6,
    subsample=0.6012681388711075,
    colsample_bytree=0.8437772277074493,
    colsample_bylevel=0.5476090898823716,
    colsample_bynode=0.9928601203635129,
    scale_pos_weight=3.29440313334688,
)
# 'lambda' is a Python keyword, so it cannot be passed to dict() by name.
xgb_params['lambda'] = 8.879624125465703

# XGBoost on tabular features only: resample, drop the image-model columns,
# then classify.
xgb_model1 = Pipeline(steps=[
    ('sampler_1', RandomOverSampler(sampling_strategy=0.003, random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('filter', SelectColumns(feature_cols_without_image_cols)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

In [33]:
# XGBoost hyper-parameters for the pipeline without a column filter.
xgb_params = dict(
    enable_categorical=True,
    tree_method='hist',
    random_state=seed,
    learning_rate=0.08501257473292347,
    alpha=0.6779926606782505,
    max_depth=6,
    subsample=0.6012681388711075,
    colsample_bytree=0.8437772277074493,
    colsample_bylevel=0.5476090898823716,
    colsample_bynode=0.9928601203635129,
    scale_pos_weight=3.29440313334688,
)
# 'lambda' is a Python keyword, so it cannot be passed to dict() by name.
xgb_params['lambda'] = 8.879624125465703

# XGBoost on every column it is given: resample, then classify.
xgb_model2 = Pipeline(steps=[
    ('sampler_1', RandomOverSampler(sampling_strategy=0.003, random_state=seed)),
    ('sampler_2', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
    ('classifier', xgb.XGBClassifier(**xgb_params)),
])

# CROSS VALIDATION

In [38]:
# Display the full feature list (tabular columns followed by the image-model columns).
feature_cols

['age_approx',
 'clin_size_long_diam_mm',
 'tbp_lv_A',
 'tbp_lv_Aext',
 'tbp_lv_B',
 'tbp_lv_Bext',
 'tbp_lv_C',
 'tbp_lv_Cext',
 'tbp_lv_H',
 'tbp_lv_Hext',
 'tbp_lv_L',
 'tbp_lv_Lext',
 'tbp_lv_areaMM2',
 'tbp_lv_area_perim_ratio',
 'tbp_lv_color_std_mean',
 'tbp_lv_deltaA',
 'tbp_lv_deltaB',
 'tbp_lv_deltaL',
 'tbp_lv_deltaLB',
 'tbp_lv_deltaLBnorm',
 'tbp_lv_eccentricity',
 'tbp_lv_minorAxisMM',
 'tbp_lv_nevi_confidence',
 'tbp_lv_norm_border',
 'tbp_lv_norm_color',
 'tbp_lv_perimeterMM',
 'tbp_lv_radial_color_std_max',
 'tbp_lv_stdL',
 'tbp_lv_stdLExt',
 'tbp_lv_symm_2axis',
 'tbp_lv_symm_2axis_angle',
 'tbp_lv_x',
 'tbp_lv_y',
 'tbp_lv_z',
 'lesion_size_ratio',
 'lesion_shape_index',
 'hue_contrast',
 'luminance_contrast',
 'lesion_color_difference',
 'border_complexity',
 'color_uniformity',
 'position_distance_3d',
 'perimeter_to_area_ratio',
 'area_to_perimeter_ratio',
 'lesion_visibility_score',
 'symmetry_border_consistency',
 'consistency_symmetry_border',
 'color_consisten

In [39]:
# Per-fold bookkeeping: scores on the filtered / unfiltered validation rows
# and the fitted ensemble of each fold.
scores_filtered = []
scores = []
estimators = []

all_val_targets, all_val_outputs = [], []
oof_predictions = np.zeros(len(df_train))

# Rows that survive the "ambiguous negatives" exclusion; same for every fold.
exclude_filter = ~df_train["isic_id"].isin(exclude_isic_ids)

for fold in cfg_global.train_folds:

    # Soft-voting ensemble over the resampling pipelines defined above.
    estimator = VotingClassifier(
        estimators=[
            ('lgb1', lgb_model1),
            ('lgb2', lgb_model2),
            # ('cb1', cb_model1),
            ('cb2', cb_model2),
            # ('xgb1', xgb_model1),
            ('xgb2', xgb_model2),
        ],
        voting='soft',
    )

    # Split data for the current fold (excluded ids removed on both sides).
    train_rows = exclude_filter & (df_train["fold"] != fold)
    valid_rows = exclude_filter & (df_train["fold"] == fold)
    X_train = df_train.loc[train_rows, feature_cols]
    y_train = df_train.loc[train_rows, target_col]
    X_test = df_train.loc[valid_rows, feature_cols]
    y_test = df_train.loc[valid_rows, target_col]

    # Train the ensemble.
    estimator.fit(X_train, y_train)

    # Evaluate on the filtered validation rows.
    y_hat = estimator.predict_proba(X_test)[:, 1]
    score_filtered = custom_metric(y_hat, y_test.values)

    # Recompute the out-of-fold predictions on every row of the fold,
    # including the rows that were excluded from training.
    fold_rows = df_train["fold"] == fold
    X_test = df_train.loc[fold_rows, feature_cols]
    y_test = df_train.loc[fold_rows, target_col]
    y_hat = estimator.predict_proba(X_test)[:, 1]
    score = custom_metric(y_hat, y_test.values)
    # NOTE(review): index labels are used as positions here — assumes df_train
    # has a default 0..n-1 index; confirm.
    oof_predictions[y_test.index] = y_hat
    all_val_targets.extend(y_test)
    all_val_outputs.extend(y_hat)

    # Keep the scores and the fitted estimator of this fold.
    scores_filtered.append(score_filtered)
    scores.append(score)
    estimators.append(estimator)

    # First line: filtered validation rows; second line: all rows of the fold.
    print(f"Fold {fold} score: {score_filtered}")
    print(f"Fold {fold} score: {score}")

all_val_outputs = np.array(all_val_outputs)
all_val_targets = np.array(all_val_targets)
oof_predictions = np.array(oof_predictions)
# print(custom_metric(all_val_outputs, all_val_targets))



Fold 0 score: 0.19238429832487994
Fold 0 score: 0.1922283414328287




Fold 1 score: 0.18112400983688107
Fold 1 score: 0.18089984532318618




Fold 2 score: 0.17752934338320767
Fold 2 score: 0.17732962274275868




Fold 3 score: 0.1848868608576072
Fold 3 score: 0.18467883840211652




Fold 4 score: 0.18054818633950864
Fold 4 score: 0.18035864733854323


In [40]:
# Out-of-fold prediction table: id, label and fold of every training row plus
# the prediction made by the ensemble that did not train on that row's fold.
oof_df = df_train[["isic_id", "target", "fold"]].assign(oof_prediction=oof_predictions)

In [41]:
# Rows that were not excluded as ambiguous negatives.
kept_rows = ~oof_df["isic_id"].isin(exclude_isic_ids)

# Metric over the pooled OOF predictions, filtered rows only.
print("-"*20)
print("OOF filtered score")
y_hat = oof_df.loc[kept_rows, "oof_prediction"]
y_true = oof_df.loc[kept_rows, "target"]
print(custom_metric(y_hat,y_true))

# Metric over the pooled OOF predictions, all rows.
print("-"*20)
print("OOF all score")
y_hat = oof_df["oof_prediction"]
y_true = oof_df["target"]
print(custom_metric(y_hat, y_true))

# Averages of the per-fold scores collected during cross validation.
for heading, fold_scores in (
    ("OOF filtered score folds mean", scores_filtered),
    ("OOF score folds mean", scores),
):
    print("-"*20)
    print(heading)
    print(np.mean(fold_scores))

--------------------
OOF filtered score
0.1833008201972076
--------------------
OOF all score
0.1831055452976138
--------------------
OOF filtered score folds mean
0.1832945397484169
--------------------
OOF score folds mean
0.18309905904788665


# HYPERPARAMETER TUNING

In [42]:
# Set to True to run the Optuna studies in the cells below (100 trials each).
DO_TUNING = False

In [43]:
def cb_objective(trial):
    """Optuna objective for CatBoost: mean cross-validated partial AUC.

    Samples a CatBoost configuration from ``trial``, wraps it in an
    under-sampling pipeline and scores it with 5-fold StratifiedGroupKFold
    on ``df_train`` (grouped by ``group_col``).

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean of the per-fold ``custom_metric`` values (to be maximized).
    """
    params = {
        'loss_function':     'Logloss',
        'iterations':        200,
        'verbose':           False,
        'random_state':      seed,
        'learning_rate':     trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth':         trial.suggest_int('max_depth', 4, 8),
        'l2_leaf_reg':       trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'subsample':         trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 1.0),
        'min_data_in_leaf':  trial.suggest_int('min_data_in_leaf', 5, 100),
        'scale_pos_weight':  trial.suggest_float('scale_pos_weight', 0.8, 4.0),
        # CatBoost documents `subsample` as usable only with the Poisson,
        # Bernoulli or MVS bootstrap types; 'Bayesian' cannot be combined
        # with the tuned `subsample` above.
        'bootstrap_type':    'Bernoulli',
    }

    # Seed the sampler and the splitter so every trial is evaluated on the
    # same data and the study is reproducible.
    estimator = Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
        ('classifier', cb.CatBoostClassifier(**params)),
    ])

    X = df_train[feature_cols]
    y = df_train[target_col]
    groups = df_train[group_col]
    cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

    def _pauc_scorer(fitted_estimator, X_val, y_val):
        # cross_val_score calls a callable scorer as scorer(estimator, X, y),
        # whereas custom_metric expects (y_hat, y_true); adapt between the two.
        y_hat = fitted_estimator.predict_proba(X_val)[:, 1]
        return custom_metric(y_hat, np.asarray(y_val))

    val_score = cross_val_score(
        estimator=estimator,
        X=X, y=y,
        cv=cv,
        groups=groups,
        scoring=_pauc_scorer,
    )

    return np.mean(val_score)

In [44]:
def lgb_objective(trial):
    """Optuna objective for LightGBM: mean cross-validated partial AUC.

    Samples a LightGBM configuration from ``trial``, wraps it in an
    under-sampling pipeline and scores it with 5-fold StratifiedGroupKFold
    on ``df_train`` (grouped by ``group_col``).

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean of the per-fold ``custom_metric`` values (to be maximized).
    """
    params = {
        'objective':         'binary',
        'verbosity':         -1,
        'n_iter': 200,
        'boosting_type':  'gbdt',
        # Seeded like the CatBoost / XGBoost objectives so trials are comparable.
        'random_state':      seed,
        'lambda_l1':         trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
        'lambda_l2':         trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
        'learning_rate':     trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth':         trial.suggest_int('max_depth', 4, 8),
        'num_leaves':        trial.suggest_int('num_leaves', 16, 256),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bynode':  trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'bagging_fraction':  trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq':      trial.suggest_int('bagging_freq', 1, 7),
        'min_data_in_leaf':  trial.suggest_int('min_data_in_leaf', 5, 100),
        'scale_pos_weight' : trial.suggest_float('scale_pos_weight', 0.8, 4.0),
    }

    # Seed the sampler and the splitter so every trial is evaluated on the
    # same data and the study is reproducible.
    estimator = Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
        ('classifier', lgb.LGBMClassifier(**params)),
    ])

    X = df_train[feature_cols]
    y = df_train[target_col]
    groups = df_train[group_col]
    cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

    def _pauc_scorer(fitted_estimator, X_val, y_val):
        # cross_val_score calls a callable scorer as scorer(estimator, X, y),
        # whereas custom_metric expects (y_hat, y_true); adapt between the two.
        y_hat = fitted_estimator.predict_proba(X_val)[:, 1]
        return custom_metric(y_hat, np.asarray(y_val))

    val_score = cross_val_score(
        estimator=estimator,
        X=X, y=y,
        cv=cv,
        groups=groups,
        scoring=_pauc_scorer,
    )

    return np.mean(val_score)

In [45]:
def xgb_objective(trial):
    """Optuna objective for XGBoost: mean cross-validated partial AUC.

    Samples an XGBoost configuration from ``trial``, wraps it in an
    under-sampling pipeline and scores it with 5-fold StratifiedGroupKFold
    on ``df_train`` (grouped by ``group_col``).

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters.

    Returns:
        Mean of the per-fold ``custom_metric`` values (to be maximized).
    """
    params = {
        'objective':          'binary:logistic',
        'n_estimators':       200,
        'tree_method':        'hist',
        'random_state':       seed,
        'learning_rate':      trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'max_depth':          trial.suggest_int('max_depth', 4, 8),
        'lambda':             trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha':              trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'subsample':          trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree':   trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bynode':   trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'scale_pos_weight':   trial.suggest_float('scale_pos_weight', 0.8, 4.0),
    }

    # Seed the sampler and the splitter so every trial is evaluated on the
    # same data and the study is reproducible.
    estimator = Pipeline([
        ('sampler', RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=seed)),
        ('classifier', xgb.XGBClassifier(**params)),
    ])

    X = df_train[feature_cols]
    y = df_train[target_col]
    groups = df_train[group_col]
    cv = StratifiedGroupKFold(5, shuffle=True, random_state=seed)

    def _pauc_scorer(fitted_estimator, X_val, y_val):
        # cross_val_score calls a callable scorer as scorer(estimator, X, y),
        # whereas custom_metric expects (y_hat, y_true); adapt between the two.
        y_hat = fitted_estimator.predict_proba(X_val)[:, 1]
        return custom_metric(y_hat, np.asarray(y_val))

    val_score = cross_val_score(
        estimator=estimator,
        X=X, y=y,
        cv=cv,
        groups=groups,
        scoring=_pauc_scorer,
    )

    return np.mean(val_score)

In [46]:
if DO_TUNING:
    # LightGBM: run a seeded TPE study and report the wall-clock time.
    start_time = time.time()
    study_lgb = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed))
    # The objective is defined above as `lgb_objective`; the previous name
    # `objective_lgb` does not exist and raised NameError.
    study_lgb.optimize(lgb_objective, n_trials=100)
    end_time = time.time()
    elapsed_time_lgb = end_time - start_time
    print(f"LightGBM tuning took {elapsed_time_lgb:.2f} seconds.")


In [47]:
if DO_TUNING:
    # CatBoost: run a seeded TPE study and report the wall-clock time.
    start_time = time.time()
    study_cb = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed))
    # The objective is defined above as `cb_objective`; the previous name
    # `objective_cb` does not exist and raised NameError.
    study_cb.optimize(cb_objective, n_trials=100)
    end_time = time.time()
    elapsed_time_cb = end_time - start_time
    print(f"CatBoost tuning took {elapsed_time_cb:.2f} seconds.")

In [48]:
if DO_TUNING:
    # XGBoost: run a seeded TPE study and report the wall-clock time.
    start_time = time.time()
    study_xgb = optuna.create_study(direction='maximize', sampler=TPESampler(seed=seed))
    # The objective is defined above as `xgb_objective`; the previous name
    # `objective_xgb` does not exist and raised NameError.
    study_xgb.optimize(xgb_objective, n_trials=100)
    end_time = time.time()
    elapsed_time_xgb = end_time - start_time
    print(f"XGBoost tuning took {elapsed_time_xgb:.2f} seconds.")

In [49]:
if DO_TUNING:
    # Report the best trial found by each of the three studies.
    best_trial_reports = (
        ("Best LGBM trial:", study_lgb),
        ("Best CatBoost trial:", study_cb),
        ("Best XGBoost trial:", study_xgb),
    )
    for label, study in best_trial_reports:
        print(label, study.best_trial)

# TRAINING

In [50]:
# Re-train the ensemble on the whole training set for the submission.
# NOTE(review): unlike the cross-validation loop, the `exclude_isic_ids`
# filter is not applied here — confirm this difference is intended.
X = df_train[feature_cols]
y = df_train[target_col]

final_estimator = VotingClassifier(
    estimators=[
        ('lgb1', lgb_model1),
        ('lgb2', lgb_model2),
        # ('cb1', cb_model1),
        ('cb2', cb_model2),
        # ('xgb1', xgb_model1),
        ('xgb2', xgb_model2),
    ],
    voting='soft',
)

final_estimator.fit(X, y)



# LOOKING FOR FEATURE IMPORTANCE (lgb + xgb)

In [51]:
# Set to True to run the model-based feature-importance cells below.
DO_FEATURE_IMPORTANCE_MODELS = False

In [52]:
if DO_FEATURE_IMPORTANCE_MODELS:
    # The ensemble members are registered as 'lgb1', 'lgb2', 'cb2' and 'xgb2';
    # the previous keys 'lgb' / 'xgb' do not exist and raised KeyError.
    # The '*2' pipelines have no column filter, so their classifiers are
    # trained on exactly `feature_cols`.
    # NOTE(review): `estimator` is the ensemble left over from the last CV
    # fold, not `final_estimator` — confirm which one is intended.
    lgb_model = estimator.named_estimators_['lgb2'].named_steps['classifier']
    lgb_feature_importance = lgb_model.booster_.feature_importance(importance_type='gain')
    lgb_feature_importance_df = pd.DataFrame({
        'feature': list(feature_cols),
        'importance': lgb_feature_importance
    }).sort_values(by='importance', ascending=False)


    xgb_model = estimator.named_estimators_['xgb2'].named_steps['classifier']
    # get_score returns a {feature name: importance} mapping.
    xgb_feature_importance = xgb_model.get_booster().get_score(importance_type='weight')
    xgb_feature_importance_df = pd.DataFrame({
        'feature': list(xgb_feature_importance.keys()),
        'importance': list(xgb_feature_importance.values())
    }).sort_values(by='importance', ascending=False)


In [53]:
if DO_FEATURE_IMPORTANCE_MODELS:
    # Show the LightGBM table first, then the XGBoost one.
    for importance_table in (lgb_feature_importance_df, xgb_feature_importance_df):
        print(importance_table)

# LEAST IMPORTANT FEATURES

In [54]:
if DO_FEATURE_IMPORTANCE_MODELS:

    # The 24 lowest-importance rows of the LightGBM importance table.
    lgb_ascending = lgb_feature_importance_df.sort_values(by='importance')
    least_important_lgb = lgb_ascending.head(24)

    print("Least Important Features in LightGBM:")
    print(least_important_lgb)

    # The 6 lowest-importance rows of the XGBoost importance table.
    xgb_ascending = xgb_feature_importance_df.sort_values(by="importance")
    least_important_xgb = xgb_ascending.head(6)

    print("\nLeast Important Features in XGBoost:")
    print(least_important_xgb)

In [55]:
if DO_FEATURE_IMPORTANCE_MODELS:

    # Feature names of the weakest rows of each model's importance table.
    least_important_lgb_features = least_important_lgb['feature'].tolist()
    least_important_xgb_features = least_important_xgb['feature'].tolist()

    # Features that appear in both weakest lists.
    lgb_weak = set(least_important_lgb_features)
    xgb_weak = set(least_important_xgb_features)
    common_least_important_features = list(lgb_weak.intersection(xgb_weak))

    print("Common Least Important Features in Both LightGBM and XGBoost:")
    print(common_least_important_features)


# LOOKING FOR FEATURE IMPORTANCE (Tests)

In [56]:
# Set to True to run the statistical feature-selection cell below.
DO_FEATURE_IMPORTANCE_TEST = False

In [57]:
if DO_FEATURE_IMPORTANCE_TEST:
    # Work on an explicit copy: the imputation below writes into X, and
    # assigning into a bare slice of df_train triggers SettingWithCopyWarning.
    X = df_train[feature_cols].copy()
    y = df_train[target_col]

    # Separate continuous and categorical features
    continuous_features = num_cols + norm_cols + new_num_cols
    # Fill null values of continuous features with their median values
    X[continuous_features] = X[continuous_features].apply(lambda x: x.fillna(x.median()))


    # Correlation Matrix for continuous features
    # (computed on df_train, i.e. on the values before the median imputation)
    corr_matrix = df_train[continuous_features + ['target']].corr()
    threshold = 0.01
    relevant_features_corr = corr_matrix[abs(corr_matrix['target']) > threshold].index
    selected_features_corr = relevant_features_corr.drop('target')
    print("Selected continuous features based on correlation threshold:")
    print(selected_features_corr)
    print(len(selected_features_corr))

    # Chi-Square Test for categorical features
    # Only the fitted support mask is needed, so fit() replaces fit_transform().
    chi2_selector = SelectKBest(chi2, k=15)
    chi2_selector.fit(X[cat_cols], y)
    selected_features_chi2 = X[cat_cols].columns[chi2_selector.get_support()]
    print("Selected categorical features based on Chi-Square Test:")
    print(selected_features_chi2)

    # Mutual Information for all features
    mi_selector = SelectKBest(mutual_info_classif, k=15)
    mi_selector.fit(X, y)
    selected_features_mi = X.columns[mi_selector.get_support()]
    print("Selected features based on Mutual Information:")
    print(selected_features_mi)

    # Variance Threshold for continuous features
    threshold = 0.05
    var_threshold = VarianceThreshold(threshold=threshold)
    var_threshold.fit(X[continuous_features])
    selected_features_var = X[continuous_features].columns[var_threshold.get_support()]
    print("Selected continuous features based on Variance Threshold:")
    print(selected_features_var)
    print(len(selected_features_var))


    # Combine all selected features
    selected_features_all = set(selected_features_corr) | set(selected_features_chi2) | set(selected_features_mi) | set(selected_features_var)

    # Original features
    original_features = set(X.columns)

    # Find features not selected by any method
    least_selected_features = original_features - selected_features_all

    # Placeholder for features flagged by the boosting models (currently none).
    boosting_selected_features = set()

    least_selected_features_list = list(least_selected_features | boosting_selected_features)

    print(least_selected_features)

# TEST PREDICTION

In [60]:
# Predict with the ensemble that was re-trained on the full training set.
test_features = df_test[feature_cols]
test_proba = final_estimator.predict_proba(test_features)

# Column 1 of predict_proba is taken as the positive-class score.
# NOTE(review): the assignment is positional — it assumes df_test rows are in
# the same order as the df_subm index; confirm.
df_subm['target'] = test_proba[:, 1]

df_subm.to_csv('submission.csv')
df_subm.head()

Unnamed: 0_level_0,target
isic_id,Unnamed: 1_level_1
ISIC_0015657,0.274951
ISIC_0015729,0.218071
ISIC_0015740,0.249403
