In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated '<directory>:<url-encoded download URL>' entries consumed by the download loop.
# NOTE(review): the URL embeds a signature with X-Goog-Date=20240914 and X-Goog-Expires=259200
# (seconds), so it has presumably expired; the download loop reports that case instead of failing.
DATA_SOURCE_MAPPING = 'isic-2024-challenge:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F63056%2F9094797%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240914%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240914T183249Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D80c1f5bad0ed7dba60502afe139e18d6291799baf89bd24477bfb10726867710c0867d0f5ff3a78fa03ecceb07c48dee859738f2c541a7e89cc837155839e0b3cc27ddbc1b7a65f68a133e1ac808be5febb5a67fcd634150cbec92d70b19d87adf59636b3155f9de99e882d534a2b93ecd21c5ff0d49263d25bfacfc33444e4e6b50b53917a4fb66a48ae154d6ec04e9c76b06aa573869eb19465ac57728ae685bb7d85e9bb3285b47180cc85e45145aaea82fc2e9e4667a984aa5c32cdf479032210ebccbab2b9911f6051cb9d40d1062f46e7c2b94e3cd39639dc4d58a14703afe55ea208af1e0875b8e9a43acc88a1b11c1a37e8cdadd0402239bd811a012'

# Directory layout that mirrors Kaggle's runtime paths.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): KAGGLE_SYMLINK is not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# Start from an empty input directory: unmount whatever is mounted there (errors are
# discarded), delete leftovers, then recreate the input and working directories.
!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working as well; an already existing
# link is left untouched.
try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and unpack it.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is '<directory>:<url-encoded URL>'; split on the first ':' only so a
    # stray ':' in the encoded URL cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            content_length = fileres.headers['content-length']
            # The header may be absent (e.g. chunked transfer); fall back to 0 and
            # simply show an empty progress bar instead of raising on int(None).
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch re-opens the file by name, so buffered bytes must be
            # on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would rebind
                # the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


### Packages

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import os
import optuna
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer,MissingIndicator,KNNImputer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.ensemble import VotingClassifier
from tqdm.notebook import tqdm
from sklearn.metrics import roc_curve, auc,accuracy_score,f1_score,precision_score,recall_score,confusion_matrix
import random

### Global

In [None]:
# Notebook-wide configuration.
# MODE: the hyper-parameter search only runs when this is "train".
MODE = "submit"
# SEED: shared by the random generators, the Optuna sampler and XGBoost.
SEED = 42
# Locations of the competition metadata tables.
TEST_PATH = "/kaggle/input/isic-2024-challenge/test-metadata.csv"
TRAIN_PATH = "/kaggle/input/isic-2024-challenge/train-metadata.csv"

In [None]:
# Seed the stdlib generator first, then NumPy's global generator, with the same value.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

### Utils

In [None]:
def create_preprocessors(X : pd.DataFrame, y : pd.DataFrame) -> list[Pipeline]:
    """Build the four unfitted preprocessing pipelines compared in this notebook.

    Numeric columns of ``X`` are median-imputed, optionally with missing-value
    indicator columns appended; object columns are ordinal- or one-hot-encoded.
    Any other column is passed through unchanged.

    Returned order (the index is what the ``preprocessing`` hyper-parameter selects):
        0. median imputation + ordinal encoding
        1. median imputation with missing indicators + ordinal encoding
        2. median imputation + one-hot encoding
        3. median imputation with missing indicators + one-hot encoding

    ``y`` is accepted but not used.
    """
    numeric_cols = X.select_dtypes(include=np.number).columns
    categorical_cols = X.select_dtypes(include=object).columns

    # Factories rather than shared instances: every pipeline gets its own estimators.
    def median_only():
        return SimpleImputer(strategy='median')

    def median_with_indicator():
        return FeatureUnion([
            ('simple-imputer', SimpleImputer(strategy='median')),
            ('missing-indicator', MissingIndicator(sparse=False)),
        ])

    def ordinal():
        return OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1,
            encoded_missing_value=-2,
        )

    def one_hot():
        return OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop='first')

    def assemble(imputer, encoder):
        transformer = ColumnTransformer(
            [
                ('imputing', imputer, numeric_cols),
                ('encoding', encoder, categorical_cols),
            ],
            remainder='passthrough',
        )
        return Pipeline([('preprocessing', transformer)])

    recipes = [
        (median_only, ordinal),
        (median_with_indicator, ordinal),
        (median_only, one_hot),
        (median_with_indicator, one_hot),
    ]

    return [assemble(make_imputer(), make_encoder()) for make_imputer, make_encoder in recipes]

In [None]:
def feature_engineering(df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with 38 derived lesion features.

    The input frame is left untouched: all new columns are written to a copy, so
    the function is safe to call on a slice of another frame and safe to re-run.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the raw columns read below (the ``tbp_lv_*`` measurements,
        ``clin_size_long_diam_mm``, ``age_approx``, ``anatom_site_general`` and
        ``tbp_lv_location``).

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns followed by the derived ones.

    Notes
    -----
    Most ratios add ``eps`` to the denominator. ``border_length_ratio`` and
    ``normalized_lesion_size`` do not, so a zero denominator there yields inf/NaN.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    eps = 1e-6

    df["color_variance_ratio"]           = df["tbp_lv_color_std_mean"] / (df["tbp_lv_stdLExt"] + eps)
    df["border_color_interaction"]       = df["tbp_lv_norm_border"] * df["tbp_lv_norm_color"]
    df["size_color_contrast_ratio"]      = df["clin_size_long_diam_mm"] / (df["tbp_lv_deltaLBnorm"] + eps)
    df["age_normalized_nevi_confidence"] = df["tbp_lv_nevi_confidence"] / (df["age_approx"] + eps)
    df["color_asymmetry_index"]          = df["tbp_lv_radial_color_std_max"] * df["tbp_lv_symm_2axis"]
    df["3d_volume_approximation"]        = df["tbp_lv_areaMM2"] * np.sqrt(df["tbp_lv_x"]**2 + df["tbp_lv_y"]**2 + df["tbp_lv_z"]**2)
    df["color_range"]                    = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs() + (df["tbp_lv_A"] - df["tbp_lv_Aext"]).abs() + (df["tbp_lv_B"] - df["tbp_lv_Bext"]).abs()
    df["shape_color_consistency"]        = df["tbp_lv_eccentricity"] * df["tbp_lv_color_std_mean"]
    # Perimeter divided by the circumference of a circle with the same area.
    df["border_length_ratio"]            = df["tbp_lv_perimeterMM"] / (2 * np.pi * np.sqrt(df["tbp_lv_areaMM2"] / np.pi))
    df["age_size_symmetry_index"]        = df["age_approx"] * df["clin_size_long_diam_mm"] * df["tbp_lv_symm_2axis"]
    df["age_size_symmetry_index2"]       = df["age_approx"] * df["tbp_lv_areaMM2"] * df["tbp_lv_symm_2axis"]

    df["lesion_size_ratio"]              = df["tbp_lv_minorAxisMM"] / (df["clin_size_long_diam_mm"] + eps)
    df["lesion_shape_index"]             = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2 + eps)
    df["hue_contrast"]                   = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"]             = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"]        = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"]              = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["color_uniformity"]               = df["tbp_lv_color_std_mean"] / (df["tbp_lv_radial_color_std_max"] + eps)
    df["3d_position_distance"]           = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["perimeter_to_area_ratio"]        = df["tbp_lv_perimeterMM"] / (df["tbp_lv_areaMM2"] + eps)
    df["lesion_visibility_score"]        = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    # The only non-numeric derived feature: string concatenation of two site labels.
    df["combined_anatomical_site"]       = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"]    = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"]              = df["tbp_lv_stdL"] / (df["tbp_lv_Lext"] + eps)

    df["size_age_interaction"]           = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"]      = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"]          = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    # Built from two features derived above, so it must stay after them.
    df["shape_complexity_index"]         = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"]           = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"]                = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"]         = df["clin_size_long_diam_mm"] / df["age_approx"]
    # Despite the name this is the mean of the two hue values, not a difference.
    df["mean_hue_difference"]            = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"]               = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"]    = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"]          = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"]       = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"]     = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    return df

In [None]:
def preprare_data(df : pd.DataFrame, n_splits : int = 5) -> tuple[dict,list]:
    """Engineer features, fit every preprocessor and build patient-grouped CV folds.

    Parameters
    ----------
    df : pd.DataFrame
        Raw training frame containing the model columns plus ``target`` and
        ``patient_id``.
    n_splits : int, default 5
        Number of ``GroupKFold`` folds (groups are ``patient_id`` values).

    Returns
    -------
    tuple[dict, list]
        ``data`` maps ``'preprocessing-<i>'`` to a list with one entry per fold,
        each ``{'train': (X, y), 'val': (X, y)}`` holding the arrays produced by
        preprocessor ``i``; ``preprocessors`` is the list of fitted pipelines.

    Notes
    -----
    NOTE(review): the preprocessors are fitted once on the full frame, before the
    folds are cut, so imputation medians and encoder categories also see the
    validation rows. This is kept as-is because the same fitted pipelines are
    returned for later use.
    """
    target_col = 'target'
    group_col = 'patient_id'

    df = feature_engineering(df)

    X = df.drop(columns=[target_col, group_col])
    y = df[target_col]

    preprocessors = create_preprocessors(X, y)

    for p in preprocessors:
        p.fit(X, y)

    data = {f'preprocessing-{i}': [] for i in range(len(preprocessors))}

    gfk = GroupKFold(n_splits=n_splits)

    for train_idx, val_idx in tqdm(gfk.split(df, groups=df[group_col]), total=n_splits):

        # GroupKFold yields positional indices, so select with .iloc; .loc would
        # only be correct when the frame carries a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        for i, p in enumerate(preprocessors):
            data[f'preprocessing-{i}'].append({
                'train' : (p.transform(X_train), y_train.values),
                'val' : (p.transform(X_val), y_val.values)
            })

    return data,preprocessors

In [None]:
def get_xgb_data(data : dict):
    """Wrap every fold of every preprocessing variant in ``xgb.DMatrix`` objects.

    ``data`` maps a dataset name to a list of folds shaped like
    ``{'train': (X, y), 'val': (X, y)}``. The result has the same keys, each
    mapped to a list of ``(dtrain, dval)`` tuples in fold order.
    """

    def to_dmatrix_pair(fold):
        features_train, labels_train = fold['train']
        features_val, labels_val = fold['val']
        train_matrix = xgb.DMatrix(features_train, labels_train)
        val_matrix = xgb.DMatrix(features_val, labels_val)
        return (train_matrix, val_matrix)

    return {
        name: [to_dmatrix_pair(fold) for fold in folds]
        for name, folds in data.items()
    }

In [None]:
def train_xgb(data : list, params : dict, model_type=lgb.LGBMClassifier) -> list:
    """Train one XGBoost booster per fold.

    Parameters
    ----------
    data : list
        ``(dtrain, dval)`` tuples, one per fold; only ``dtrain`` is used.
    params : dict
        XGBoost parameters. An ``n_estimators`` entry is forwarded as
        ``num_boost_round`` (the native ``xgb.train`` API does not read it from
        ``params``); a ``preprocessing`` entry is dropped. The dict itself is
        not modified.
    model_type
        Unused; kept so existing call sites keep working.

    Returns
    -------
    list
        The trained boosters, in fold order.
    """
    train_params = dict(params)
    train_params.pop("preprocessing", None)
    # 10 is xgb.train's own default number of boosting rounds.
    num_boost_round = train_params.pop("n_estimators", 10)

    models = []

    # Iterate over the folds actually supplied instead of assuming exactly five.
    for dtrain, _dval in tqdm(data, total=len(data)):
        models.append(xgb.train(train_params, dtrain, num_boost_round=num_boost_round))

    return models

In [None]:
def predict(data : list, models : list) -> list:
    """Predict each fold's validation matrix with that fold's model.

    Parameters
    ----------
    data : list
        ``(dtrain, dval)`` tuples, one per fold; only ``dval`` is used.
    models : list
        Fitted models aligned with ``data``; each must expose ``predict``.

    Returns
    -------
    list
        One prediction result per fold, in fold order.

    Notes
    -----
    A second ``predict`` with a different signature is defined further down in
    this notebook and replaces this one once its cell has run.
    """
    preds = []

    fold_count = min(len(data), len(models))

    for (_dtrain, dval), model in tqdm(zip(data, models), total=fold_count):
        # Store the fold's prediction (the original appended the list to itself).
        preds.append(model.predict(dval))

    return preds

In [None]:
def pauc(solution: np.array, submission: np.array, min_tpr : float = 0.8) -> float:
    """Partial area under the ROC curve above a minimum true-positive rate.

    The labels are inverted and the scores negated, which turns the "TPR at
    least ``min_tpr``" region into the "FPR at most ``1 - min_tpr``" region of
    the mirrored curve; the area of that region is returned.

    Raises ``ValueError`` when ``1 - min_tpr`` falls outside ``(0, 1]``.
    """
    inverted_labels = abs(np.asarray(solution) - 1)
    negated_scores = np.asarray(submission) * -1.0

    fpr_limit = abs(1 - min_tpr)

    fpr, tpr, _ = roc_curve(inverted_labels, negated_scores, sample_weight=None)

    # A limit of exactly 1 means the whole curve is wanted.
    if fpr_limit == 1:
        return auc(fpr, tpr)
    if fpr_limit <= 0 or fpr_limit > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)

    # Cut the curve at fpr_limit, interpolating linearly between the two
    # neighbouring ROC points to obtain the TPR at the cut.
    cut = np.searchsorted(fpr, fpr_limit, "right")
    tpr_at_limit = np.interp(
        fpr_limit,
        [fpr[cut - 1], fpr[cut]],
        [tpr[cut - 1], tpr[cut]],
    )
    clipped_tpr = np.append(tpr[:cut], tpr_at_limit)
    clipped_fpr = np.append(fpr[:cut], fpr_limit)

    return auc(clipped_fpr, clipped_tpr)

### Data preparation

In [None]:
# Columns read from the metadata tables: the model inputs followed by the
# label ('target') and the cross-validation grouping key ('patient_id').
isic_cols = [
    'age_approx',
    'sex',
    'anatom_site_general',
    'clin_size_long_diam_mm',
    'tbp_lv_A',
    'tbp_lv_Aext',
    'tbp_lv_B',
    'tbp_lv_Bext',
    'tbp_lv_C',
    'tbp_lv_Cext',
    'tbp_lv_H',
    'tbp_lv_Hext',
    'tbp_lv_L',
    'tbp_lv_Lext',
    'tbp_lv_areaMM2',
    'tbp_lv_area_perim_ratio',
    'tbp_lv_color_std_mean',
    'tbp_lv_deltaA',
    'tbp_lv_deltaB',
    'tbp_lv_deltaL',
    'tbp_lv_deltaLB',
    'tbp_lv_deltaLBnorm',
    'tbp_lv_eccentricity',
    'tbp_lv_location',
    'tbp_lv_minorAxisMM',
    'tbp_lv_nevi_confidence',
    'tbp_lv_norm_border',
    'tbp_lv_norm_color',
    'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max',
    'tbp_lv_stdL',
    'tbp_lv_stdLExt',
    'tbp_lv_symm_2axis',
    'tbp_lv_symm_2axis_angle',
    'tbp_lv_x',
    'tbp_lv_y',
    'tbp_lv_z',
    'tbp_lv_location_simple',
    'target',
    'patient_id',
]

In [None]:
# Read the training metadata and keep only the modelling columns, in `isic_cols` order.
df = pd.read_csv(TRAIN_PATH)[isic_cols]

In [None]:
# Per-fold arrays for every preprocessing variant, plus the fitted pipelines
# (reused later to transform the test set).
(data, preorocessors) = preprare_data(df)

In [None]:
# Convert every fold into (dtrain, dval) DMatrix pairs, keyed by preprocessing variant.
xgb_data = get_xgb_data(data=data)

### Training

In [None]:
# Number of best configurations retained during the search.
k = 5
# Maps mean CV score -> parameter dict for the best `k` trials seen so far.
top_k_xgb_models = dict()

In [None]:
def xgb_objective(trial : optuna.Trial):
    """Optuna objective: mean partial AUC of an XGBoost model across the CV folds.

    Samples the booster hyper-parameters and the preprocessing variant (0-3),
    trains one booster per fold of that variant and scores its validation
    predictions with ``pauc``.

    Side effect: the sampled configuration (including ``n_estimators`` and
    ``preprocessing``) is stored in the module-level ``top_k_xgb_models`` dict,
    keyed by its mean score, as long as it ranks among the best ``k`` seen.

    Returns
    -------
    float
        Mean partial AUC over the folds (to be maximised).
    """
    params = {
        "seed" : SEED,
        "device" : "gpu",
        "verbosity" : 0,
        "n_estimators" : trial.suggest_int("n_estimators", 100, 2000),
        "objective" : "binary:logistic",
        "eval_metric" : "logloss",
        "eta" : trial.suggest_float("eta", 0.01, 0.3),
        "max_depth" : trial.suggest_int("max_depth", 3, 16),
        "subsample" : trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree" : trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "scale_pos_weight" : trial.suggest_float("scale_pos_weight", 1.0, 1200.0),
        "min_child_weight" : trial.suggest_int("min_child_weight", 1, 500),
        "gamma" : trial.suggest_float("gamma", 0.0, 10.0),
        "max_delta_step" : trial.suggest_int("max_delta_step", 0, 10),
        "lambda" : trial.suggest_float("lambda", 0.0, 10.0),
    }

    preprocessing = trial.suggest_int("preprocessing", 0, 3)

    # The native xgb.train API takes the number of trees as `num_boost_round`;
    # an `n_estimators` entry inside `params` is not used and training would
    # silently stop after the default 10 rounds.
    num_boost_round = params["n_estimators"]
    train_params = {name: value for name, value in params.items() if name != "n_estimators"}

    scores = []

    for dtrain, dval in xgb_data[f'preprocessing-{preprocessing}']:

        model = xgb.train(train_params, dtrain, num_boost_round=num_boost_round)
        y_hat = model.predict(dval)

        scores.append(pauc(dval.get_label(), y_hat))

    mean = np.mean(scores)

    # Record the preprocessing choice with the configuration that is kept.
    params["preprocessing"] = preprocessing

    if len(top_k_xgb_models) < k:
        top_k_xgb_models[mean] = params
    else:
        # Replace the weakest retained configuration when this trial beats it.
        min_score = min(top_k_xgb_models.keys())
        if mean > min_score:
            del top_k_xgb_models[min_score]
            top_k_xgb_models[mean] = params

    return mean

In [None]:
# Seeded TPE sampler so the search proposes the same sequence on every run.
tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(direction="maximize", sampler=tpe_sampler)

In [None]:
# The hyper-parameter search only runs in "train" mode; otherwise the
# hard-coded configurations below are used.
if MODE == "train":
    study.optimize(
        xgb_objective,
        n_trials=600,
        show_progress_bar=True,
    )

In [None]:
# Hard-coded XGBoost configurations used to train the submission models.
# Each dict has the keys assembled in `xgb_objective`, including 'preprocessing',
# the index of the preprocessing pipeline the configuration is paired with.
# NOTE(review): presumably copied from `top_k_xgb_models` after an earlier
# MODE == "train" run — confirm before re-tuning.
params = [{'seed': 42,
  'device': 'gpu',
  'verbosity': 0,
  'n_estimators': 157,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'eta': 0.25476128425854655,
  'max_depth': 5,
  'subsample': 0.6787811038410864,
  'colsample_bytree': 0.5155976119504401,
  'scale_pos_weight': 288.95074112607983,
  'min_child_weight': 363,
  'gamma': 1.0647069022702935,
  'max_delta_step': 5,
  'lambda': 1.8799956112165466,
  'preprocessing': 3},
 {'seed': 42,
  'device': 'gpu',
  'verbosity': 0,
  'n_estimators': 328,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'eta': 0.2567867140407475,
  'max_depth': 5,
  'subsample': 0.702344584414856,
  'colsample_bytree': 0.5121079529423151,
  'scale_pos_weight': 237.6148071259882,
  'min_child_weight': 346,
  'gamma': 0.6633876786270412,
  'max_delta_step': 6,
  'lambda': 1.7573365079227086,
  'preprocessing': 3},
 {'seed': 42,
  'device': 'gpu',
  'verbosity': 0,
  'n_estimators': 280,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'eta': 0.2610742654280035,
  'max_depth': 5,
  'subsample': 0.725405910803676,
  'colsample_bytree': 0.4963538314984176,
  'scale_pos_weight': 221.56384344083796,
  'min_child_weight': 332,
  'gamma': 0.19317214348960077,
  'max_delta_step': 6,
  'lambda': 2.0035140473205955,
  'preprocessing': 3},
 {'seed': 42,
  'device': 'gpu',
  'verbosity': 0,
  'n_estimators': 175,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'eta': 0.25976109114341067,
  'max_depth': 5,
  'subsample': 0.7077302661810733,
  'colsample_bytree': 0.4989437136265675,
  'scale_pos_weight': 224.38554946447817,
  'min_child_weight': 339,
  'gamma': 0.02363121329124465,
  'max_delta_step': 6,
  'lambda': 1.7077803662151292,
  'preprocessing': 3},
 {'seed': 42,
  'device': 'gpu',
  'verbosity': 0,
  'n_estimators': 1551,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'eta': 0.2544223456389429,
  'max_depth': 5,
  'subsample': 0.7162855240498472,
  'colsample_bytree': 0.505492977416504,
  'scale_pos_weight': 194.58222305828355,
  'min_child_weight': 347,
  'gamma': 0.941582009184013,
  'max_delta_step': 6,
  'lambda': 1.5043238438058508,
  'preprocessing': 3}]

In [None]:
def create_models(
    params : list[dict],
    data : dict[str,list[tuple[xgb.DMatrix,xgb.DMatrix]]]
):
    """Train one booster per CV fold for every parameter set.

    Parameters
    ----------
    params : list[dict]
        XGBoost configurations. Each must contain ``'preprocessing'`` (which
        fold set in ``data`` to train on) and may contain ``'n_estimators'``.
        The dicts are read only and never modified.
    data : dict
        Maps ``'preprocessing-<i>'`` to a list of ``(dtrain, dval)`` tuples.

    Returns
    -------
    list[list[xgb.Booster]]
        ``models[p][f]`` is the booster trained with ``params[p]`` on fold ``f``.

    Notes
    -----
    ``n_estimators`` is forwarded as ``num_boost_round``: the native
    ``xgb.train`` API does not read it from the parameter dict and would
    otherwise train its default 10 rounds.
    NOTE(review): configurations tuned while ``n_estimators`` was being ignored
    may need re-tuning now that it takes effect.
    """
    # Keys that drive this function rather than the booster itself.
    control_keys = ('preprocessing', 'n_estimators')

    models = []

    for param in tqdm(params):

        # Build a filtered copy instead of deleting/restoring keys on the
        # caller's dict, which left it corrupted if training raised.
        train_params = {name: value for name, value in param.items() if name not in control_keys}
        num_boost_round = param.get('n_estimators', 10)

        folds = data[f"preprocessing-{param['preprocessing']}"]

        models.append([
            xgb.train(train_params, dtrain, num_boost_round=num_boost_round)
            for dtrain, _dval in folds
        ])

    return models

In [None]:
# models[p][f]: booster for parameter set p trained on fold f.
models = create_models(params=params, data=xgb_data)

In [None]:
def predict(
    params : list[dict],
    models : list[list[xgb.Booster]],
    data : pd.DataFrame,
    preprocessors : list[Pipeline]
):
    """Predict ``data`` with every fold model of every parameter set.

    Parameters
    ----------
    params : list[dict]
        Configurations; only ``param['preprocessing']`` is read, to pick the
        fitted pipeline that matches the models of that configuration.
    models : list[list[xgb.Booster]]
        ``models[i]`` holds the fold boosters trained with ``params[i]``.
    data : pd.DataFrame
        Feature frame accepted by the fitted pipelines.
    preprocessors : list[Pipeline]
        Fitted preprocessing pipelines, indexed by the ``preprocessing`` value.

    Returns
    -------
    list
        One prediction array per (parameter set, fold) pair, ordered by
        parameter set and then fold.
    """
    preds = []

    # The input is identical for every parameter set, so transform it once per
    # distinct preprocessing pipeline instead of once per parameter set.
    matrices = {}

    for i,param in enumerate(params):

        preprocessing = param['preprocessing']

        if preprocessing not in matrices:
            transformed = preprocessors[preprocessing].transform(data)
            matrices[preprocessing] = xgb.DMatrix(transformed)

        X = matrices[preprocessing]

        for model in models[i]:
            preds.append(model.predict(X))

    return preds

### Submission

In [None]:
# The test table is selected with `isic_cols`, so the grouping key is dropped
# from the list. Guarded so that re-running the cell is a no-op instead of a ValueError.
if 'patient_id' in isic_cols:
    isic_cols.remove('patient_id')

In [None]:
# The test table has no label, so 'target' is dropped from the column list.
# Guarded so that re-running the cell is a no-op instead of a ValueError.
if 'target' in isic_cols:
    isic_cols.remove('target')

In [None]:
def is_number(x):
    """Return True when ``x`` is a Python ``int``/``float`` or a NumPy numeric scalar.

    ``bool`` values count as numbers because ``bool`` is a subclass of ``int``.
    """
    numeric_types = (int, float, np.number)
    return isinstance(x, numeric_types)

def adapt(test : pd.DataFrame, train : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``test`` whose column dtypes follow those of ``train``.

    Columns that are numeric in ``train`` are cast to ``float`` in the copy; any
    value that is not already a number (see ``is_number``) becomes NaN first.
    Columns that are ``object`` in ``train`` are cast to ``object``. ``test``
    itself is not modified.
    """
    numeric_names = train.select_dtypes(np.number).columns
    object_names = train.select_dtypes(object).columns

    def number_or_nan(value):
        if is_number(value):
            return value
        return np.nan

    aligned = test.copy()

    for name in numeric_names:
        cleaned = aligned[name].map(number_or_nan)
        aligned[name] = cleaned.astype(float)

    for name in object_names:
        aligned[name] = aligned[name].astype(object)

    return aligned

In [None]:
# Raw test metadata, plus the training columns it shares (used only as the
# dtype reference for `adapt`).
test_df = pd.read_csv(TEST_PATH)
train_df = pd.read_csv(TRAIN_PATH)[test_df.columns]

# Align the test dtypes with training, then split off the submission ids and
# keep just the model input columns.
test_df = adapt(test_df, train_df)
isic_id, test_df = test_df['isic_id'], test_df[isic_cols]

In [None]:
# Add the engineered features, then make every value finite:
# +inf -> 1e6, -inf -> -1e6, NaN -> 0.0 (in that order).
test_df = (
    feature_engineering(test_df)
    .replace(np.inf, 1e6)
    .replace(-np.inf, -1e6)
    .replace(np.nan, 0.0)
)

In [None]:
# One prediction vector per (parameter set, fold) model.
preds = predict(
    params,
    models,
    test_df,
    preorocessors,
)

In [None]:
# Average the per-model predictions into one score per test row.
# The ndim guard keeps the cell idempotent: once `preds` is 1-D, re-running the
# cell leaves it untouched instead of collapsing it to a single value.
preds = np.asarray(preds)
if preds.ndim == 2:
    preds = preds.mean(axis=0)

In [None]:
# Submission table: one averaged score per test image id.
df = pd.DataFrame(dict(isic_id=isic_id, target=preds))

In [None]:
# Preview the first five submission rows.
df.head(n=5)

In [None]:
# Write the submission file without the DataFrame index column.
df.to_csv(
    '/kaggle/working/submission.csv',
    index=False,
)

In [None]:
# Shell check: print the first lines of the file that was just written.
!head /kaggle/working/submission.csv