In [None]:
# !pip install timm==1.0.9
# !pip install albumentations==1.4.14
# !pip install torcheval==0.0.7
# !pip install pandas==2.2.2
# !pip install numpy==1.26.4

In [None]:
import sys, os, time, copy, gc
import torch
from torch import nn
from torch.utils.data import DataLoader
from pathlib import Path

import numpy as np
import pandas as pd
import albumentations as A
from albumentations.pytorch import ToTensorV2
import multiprocessing as mp

from torcheval.metrics.functional import binary_auroc, multiclass_auroc

import hashlib
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split

from PIL import Image
import torch.optim as optim

from collections import defaultdict




sys.path.append('../src')
from utils import set_seed, visualize_augmentations_positive, print_trainable_parameters
from models import setup_model
from training import fetch_scheduler, train_one_epoch, valid_one_epoch
from models import ISICModel, ISICModelEdgnet
from datasets import ISICDatasetSamplerW, ISICDatasetSampler, ISICDatasetSimple, ISICDatasetSamplerMulticlass
from augmentations import get_augmentations

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# On a GPU machine, also report the first card's name and how many cards are visible.
if device.type == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")

In [None]:
# Output location for the pretrained weights: the directory is created up front,
# and `str_model_name` is the file name the final state dict is saved under.
str_model_name = "ema_small_pretrained_medium"
model_dir = "../models/pretraining"
os.makedirs(name=model_dir, exist_ok=True)

In [None]:
# Run-wide hyperparameters. This notebook reads 'seed', 'epochs', 'img_size',
# the two batch sizes, 'learning_rate', 'weight_decay' and 'device' directly;
# the whole dict is also handed to project helpers (get_augmentations,
# fetch_scheduler, train_one_epoch), which may read the remaining keys.
CONFIG = dict(
    seed=42,  # 33 was also tried
    epochs=500,
    img_size=336,
    train_batch_size=32,
    valid_batch_size=64,
    learning_rate=1e-4,
    scheduler='CosineAnnealingLR',
    min_lr=1e-6,
    T_max=2000,
    weight_decay=1e-6,
    fold=0,
    n_fold=5,
    n_accumulate=1,
    group_col='patient_id',
    device=device,
)

# Backbone identifier passed to setup_model; no starting checkpoint is used.
model_name = "eva02_small_patch14_336.mim_in22k_ft_in1k"
checkpoint_path = None

In [None]:
# Competition data root, kept both as a plain string and as a Path object.
original_data_path = "../data/original"
original_root = Path(original_data_path)

# Folder for derived artifacts; created if it does not exist yet.
data_artifacts = "../data/artifacts"
os.makedirs(name=data_artifacts, exist_ok=True)

In [None]:
# Locations of the competition image archive and metadata table.
TRAIN_HDF5_FILE_PATH = original_root / 'train-image.hdf5'
train_path = original_root / 'train-metadata.csv'

# Load the metadata and attach the on-disk JPEG path of every image.
df_train = pd.read_csv(train_path)
df_train["path"] = '../data/original/train-image/image/' + df_train['isic_id'] + ".jpg"

# Class balance of the binary `target` column.
original_total_cases = len(df_train)
original_positive_cases = df_train['target'].sum()
original_positive_ratio = original_positive_cases / original_total_cases

print(f"Number of positive cases: {original_positive_cases}")
print(f"Number of negative cases: {original_total_cases - original_positive_cases}")
print(f"Ratio of negative to positive cases: {(original_total_cases - original_positive_cases) / original_positive_cases:.2f}:1")

In [None]:
# Build the augmentation pipelines from CONFIG via the project helper; later
# cells index the result with the 'train' and 'valid' keys.
data_transforms = get_augmentations(CONFIG)

In [None]:
# Minimal pipeline: resize to the configured square size, normalize with
# fixed per-channel mean/std, and convert to a tensor.
aug_transform_base = A.Compose([
    A.Resize(CONFIG['img_size'], CONFIG['img_size']),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

# Light augmentation pipeline: random 90-degree rotations, flips and a mild
# brightness/contrast jitter, followed by the same resize/normalize/tensor steps.
# In this notebook it is only referenced by the commented-out visualization call below.
aug_transform = A.Compose([
    A.RandomRotate90(),
    A.Flip(),
    A.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.1, p=0.5),
    A.Resize(CONFIG['img_size'], CONFIG['img_size']),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])


# Dataset over the competition metadata using the project's training transforms;
# intended for eyeballing augmented samples.
augtest_dataset = ISICDatasetSampler(
    meta_df=df_train,
    # transforms=aug_transform_base,
    do_augmentations=True,
    transforms=data_transforms['train'] # looks too extreme sometimes, but works quite well
)

# visualize_augmentations_positive(augtest_dataset, transforms=aug_transform,)

In [None]:
# Load the metadata table of the external image collection.
metadata_df = pd.read_csv("../images/metadata.csv")

# Collapse the raw diagnosis strings into three coarse labels; diagnoses that
# are not listed here map to NaN. (The original literal listed 'lentigo NOS'
# twice; the redundant duplicate key has been removed.)
# NOTE(review): 'basal cell carcinoma' is grouped under 'bkl' — confirm this is intended.
metadata_df['diagnosis_pr'] = metadata_df.diagnosis.map({
    'nevus': 'nevus',
    'melanoma': 'melanoma',
    'basal cell carcinoma': 'bkl',
    'seborrheic keratosis': 'bkl',
    'solar lentigo': 'bkl',
    'lentigo NOS': 'bkl',
})

# Every benign row that is not 'bkl' is relabelled 'nevus'. This includes
# benign rows whose diagnosis was unmapped (NaN), because NaN != 'bkl' is True.
mask = (metadata_df.benign_malignant == 'benign') & (metadata_df.diagnosis_pr != 'bkl')
metadata_df.loc[mask, 'diagnosis_pr'] = 'nevus'

# Path of each external image on disk.
metadata_df["path"] = "../images/" + metadata_df['isic_id'] + ".jpg"

In [None]:
def get_hash(file_name):
    """Return the MD5 hex digest of an image's decoded pixel bytes.

    The digest is computed over ``Image.tobytes()``, i.e. the decoded raster
    rather than the file contents. The image is opened in a context manager so
    the underlying file handle is released deterministically, which matters
    when hashing many files in a worker process.

    Args:
        file_name: Path of the image file to hash.

    Returns:
        The hexadecimal MD5 digest as a ``str``.
    """
    with Image.open(file_name) as image_tmp:
        md5hash = hashlib.md5(image_tmp.tobytes()).hexdigest()
    return str(md5hash)

def get_has_df(df):
    """Hash every image listed in ``df.path``.

    Args:
        df: DataFrame with a ``path`` column of image file paths.

    Returns:
        A DataFrame with columns ``path`` (taken from ``df``) and
        ``image_hash`` (the ``get_hash`` digest of each path, in row order).
    """
    digests = [get_hash(image_path) for image_path in df.path]
    return pd.DataFrame({"path": df.path, "image_hash": digests})

In [None]:
def resize_image(image, resize=512):
    """Resize ``image`` so its shorter side becomes ``resize`` pixels.

    The longer side is scaled by the same factor and then rounded down to a
    multiple of 8. When both sides are equal, the width is treated as the
    side that is set to ``resize``.

    Args:
        image: Object exposing ``.size`` as ``(width, height)`` and a
            ``.resize((width, height))`` method (e.g. a PIL image).
        resize: Target length of the shorter side.

    Returns:
        Whatever ``image.resize`` returns for the computed ``(width, height)``.
    """
    width, height = image.size
    landscape = height < width

    short_side = height if landscape else width
    long_side = width if landscape else height
    # Scale the long side, then floor it to a multiple of 8.
    scaled_long = int(resize / short_side * long_side // 8 * 8)

    target_size = (scaled_long, resize) if landscape else (resize, scaled_long)
    return image.resize(target_size)

def resize_images(df, path, size_thr = 512):
    """Write a PNG copy of every image in ``df`` into ``path``.

    Images whose shorter side exceeds ``size_thr`` are shrunk with
    ``resize_image`` first; smaller images are saved at their original size.
    Each source file is opened in a context manager so its handle is closed
    as soon as the copy is written, instead of staying open for the rest of
    the loop.

    Args:
        df: DataFrame with ``path`` (source image file) and ``isic_id``
            (used as the output file stem) columns.
        path: Existing directory that receives ``<isic_id>.png`` files.
        size_thr: Shorter-side threshold and resize target, in pixels.

    Returns:
        None.
    """
    for _, row in df.iterrows():
        with Image.open(row.path) as source_img:
            w, h = source_img.size

            output_img = source_img
            if min(w, h) > size_thr:
                output_img = resize_image(source_img, resize=size_thr)
            # Save while the source file is still open; it may not be fully decoded yet.
            output_img.save(os.path.join(path, row.isic_id + ".png"))

In [None]:
# Hash every external image in parallel. The frame is cut into positional
# chunks by splitting an index array and using .iloc, rather than calling
# np.array_split on the DataFrame itself: the latter goes through
# DataFrame.swapaxes, which is deprecated in pandas 2.x. Chunk sizes and order
# are the same as before.
hash_df = Parallel(n_jobs=mp.cpu_count())(
    delayed(get_has_df)(metadata_df.iloc[chunk_idx])
    for chunk_idx in np.array_split(np.arange(len(metadata_df)), mp.cpu_count() * 2)
)
hash_df = pd.concat(hash_df).reset_index(drop=True)

# Attach each image's hash to its metadata row.
metadata_df = metadata_df.merge(
    hash_df, how="left", on=["path"]
)
# Keep one row per distinct pixel hash. Note that GroupBy.first() takes the
# first non-null value of each column within a group, so a surviving row can
# combine fields from several duplicate rows. The hash column becomes the
# index and is discarded by reset_index(drop=True).
metadata_df = metadata_df.groupby('image_hash').first().reset_index(drop=True)

# Integer class ids for the three coarse labels; rows without a coarse label
# are dropped, and the id column is then exposed as `target`.
metadata_df["diagnosis_pr_target"] = metadata_df.diagnosis_pr.map({
    "nevus": 0,
    "bkl": 1,
    "melanoma": 2
})
metadata_df = metadata_df[~metadata_df.diagnosis_pr.isna()].reset_index(drop=True)
metadata_df = metadata_df.rename(columns={
    'diagnosis_pr_target': 'target'
})

In [None]:
# Directory that receives the PNG copies of the external images.
resized_path = "../external_images_resized"
os.makedirs(resized_path, exist_ok=True)

# Resize and convert in parallel. Chunks are built from positional index arrays
# instead of np.array_split(DataFrame), which relies on the deprecated
# DataFrame.swapaxes. The trailing ';' suppresses the list of None results.
Parallel(n_jobs=mp.cpu_count())(
    delayed(resize_images)(metadata_df.iloc[chunk_idx], resized_path)
    for chunk_idx in np.array_split(np.arange(len(metadata_df)), mp.cpu_count() * 2)
);

In [None]:
# Point `path` at the resized PNG copies, then keep only the rows whose file
# actually exists on disk.
metadata_df['path'] = resized_path + '/' + metadata_df['isic_id'] + '.png'
metadata_df = (
    metadata_df
    .loc[metadata_df['path'].apply(os.path.exists)]
    .reset_index(drop=True)
)

In [None]:
# Display (rows, columns) of the external metadata after the filtering above.
metadata_df.shape

In [None]:
# Seeded 80/20 split of the external images, stratified on the 3-class target.
train_pretrain_df, val_pretrain_df = train_test_split(
    metadata_df,
    test_size=0.2,
    random_state=CONFIG['seed'],
    shuffle=True,
    stratify=metadata_df.target,
)

In [None]:
# Training split wrapped in the project's multiclass sampler dataset,
# using the training augmentations.
train_dataset = ISICDatasetSamplerMulticlass(
    train_pretrain_df,
    transforms=data_transforms["train"],
    process_target=True,
    n_classes=3,
)

# Validation split wrapped in the plain dataset with the validation transforms.
valid_dataset = ISICDatasetSimple(
    val_pretrain_df,
    transforms=data_transforms["valid"],
    process_target=True,
    n_classes=3,
)

# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG['train_batch_size'],
    shuffle=True,
    drop_last=True,
    num_workers=10,
    pin_memory=True,
)

# Validation batches keep dataset order and include the final partial batch.
valid_loader = DataLoader(
    valid_dataset,
    batch_size=CONFIG['valid_batch_size'],
    shuffle=False,
    num_workers=10,
    pin_memory=True,
)

In [None]:
# Build the backbone named by `model_name` with a 3-class head via the project
# helper, then print its trainable-parameter summary.
model = setup_model(model_name, num_classes=3, device=device)
print_trainable_parameters(model)

In [None]:
# Adam over all model parameters, with learning rate and weight decay taken from CONFIG.
optimizer = optim.Adam(
    params=model.parameters(),
    weight_decay=CONFIG['weight_decay'],
    lr=CONFIG['learning_rate'],
)

# Learning-rate scheduler built by the project helper from CONFIG.
scheduler = fetch_scheduler(optimizer, CONFIG)

In [None]:
def criterion_mc(outputs, targets):
    """Multiclass loss: cross-entropy between ``outputs`` and ``targets``.

    A fresh ``nn.CrossEntropyLoss`` with default settings is created on each call.
    """
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, targets)

def get_nth_test_step(x):
    """Return the validation interval for epoch ``x``; always 1 (validate every epoch)."""
    return 1

def run_training_pretrain(
        train_loader, valid_loader, model, optimizer, scheduler, device, num_epochs,
        model_folder=None, model_name="", seed=42, tolerance_max=15, criterion=criterion_mc, test_every_nth_step=get_nth_test_step):
    """Train ``model`` with early stopping on validation AUROC.

    Each epoch runs ``train_one_epoch``; on epochs selected by
    ``test_every_nth_step`` it also runs ``valid_one_epoch``, records metrics,
    and keeps a copy of the weights whenever the validation AUROC is at least
    as good as the best seen so far. Training stops once the number of
    consecutive non-improving validations exceeds ``tolerance_max``.

    Args:
        train_loader: DataLoader passed to ``train_one_epoch``.
        valid_loader: DataLoader passed to ``valid_one_epoch``.
        model: Model to train; the best weights are loaded back into it on return.
        optimizer: Optimizer forwarded to both epoch helpers.
        scheduler: LR scheduler forwarded to ``train_one_epoch``.
        device: Device forwarded to both epoch helpers.
        num_epochs: Maximum number of epochs.
        model_folder: If not None, the best weights are also saved to
            ``os.path.join(model_folder, model_name)`` on every improvement.
        model_name: File name used together with ``model_folder``.
        seed: Seed passed to ``set_seed`` before training starts.
        tolerance_max: Early-stopping patience, in validation rounds.
        criterion: Loss callable forwarded to both epoch helpers.
        test_every_nth_step: Callable ``epoch -> n`` (validate when
            ``epoch % n == 0``), or a plain integer ``n``.

    Returns:
        ``(model, history)`` where ``history`` maps 'Train Loss', 'Valid Loss',
        'Train AUROC', 'Valid AUROC', 'Valid Kaggle metric' and 'lr' to lists
        with one entry per validated epoch.
    """
    set_seed(seed)
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_score = -np.inf
    history = defaultdict(list)
    tolerance = 0
    for epoch in range(1, num_epochs + 1):
        if tolerance > tolerance_max:
            break

        # Honour the caller-supplied schedule. Previously the parameter was
        # overwritten by a call to the module-level get_nth_test_step, so any
        # custom value was silently ignored.
        if callable(test_every_nth_step):
            validate_every = test_every_nth_step(epoch)
        else:
            validate_every = int(test_every_nth_step)

        gc.collect()
        train_epoch_loss, train_epoch_auroc = train_one_epoch(
            model,
            optimizer,
            scheduler,
            dataloader=train_loader,
            device=device,  # use the argument rather than the global CONFIG['device']
            CONFIG=CONFIG,
            epoch=epoch,
            criterion=criterion,
            metric_function=multiclass_auroc,
            num_classes=3)

        if epoch % validate_every == 0:
            val_epoch_loss, val_epoch_auroc, val_epoch_custom_metric = valid_one_epoch(
                model,
                valid_loader,
                device=device,
                epoch=epoch,
                optimizer=optimizer,
                criterion=criterion,
                use_custom_score=False,
                metric_function=multiclass_auroc,
                num_classes=3)

            history['Train Loss'].append(train_epoch_loss)
            history['Valid Loss'].append(val_epoch_loss)
            history['Train AUROC'].append(train_epoch_auroc)
            history['Valid AUROC'].append(val_epoch_auroc)
            history['Valid Kaggle metric'].append(val_epoch_custom_metric)
            # Read the learning rate currently in effect from the optimizer;
            # scheduler.get_lr() is not meant to be called outside step().
            history['lr'].append(optimizer.param_groups[0]['lr'])

            if best_epoch_score <= val_epoch_auroc:
                tolerance = 0
                print(f"Validation AUROC Improved ({best_epoch_score} ---> {val_epoch_auroc})")
                best_epoch_score = val_epoch_auroc
                best_model_wts = copy.deepcopy(model.state_dict())
                if model_folder is not None:
                    torch.save(model.state_dict(), os.path.join(model_folder, model_name))
            else:
                tolerance += 1

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best AUROC: {:.4f}".format(best_epoch_score))
    # Restore the best-scoring weights before handing the model back.
    model.load_state_dict(best_model_wts)
    return model, history

In [None]:
# Pretrain on the external 3-class data. No model_folder is given, so
# intermediate checkpoints are not written during training.
model, history = run_training_pretrain(
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    device=CONFIG['device'],
    num_epochs=CONFIG['epochs'],
    criterion=criterion_mc,
)

In [None]:
# Persist the weights returned by run_training_pretrain under model_dir/str_model_name.
torch.save(model.state_dict(), os.path.join(model_dir, str_model_name))

In [None]:
# Wrap the competition metadata for inference with the pretrained model:
# validation transforms, and unshuffled batches so row order is preserved.
df_train_dataset = ISICDatasetSimple(
    df_train,
    transforms=data_transforms["valid"],
    process_target=True,
    n_classes=3,
)
df_train_loader = DataLoader(
    df_train_dataset,
    batch_size=CONFIG['valid_batch_size'],
    shuffle=False,
    num_workers=5,
    pin_memory=True,
)

In [None]:
def criterion(outputs, targets):
    """Binary cross-entropy between ``outputs`` and ``targets``.

    A fresh ``nn.BCELoss`` with default settings is created on each call.
    """
    loss_fn = nn.BCELoss()
    return loss_fn(outputs, targets)

In [None]:
# Run the pretrained model over the full competition set and collect its
# per-sample predictions along with the usual epoch metrics.
(
    val_epoch_loss,
    val_epoch_auroc,
    val_epoch_custom_metric,
    tmp_predictions_all,
    tmp_targets_all,
) = valid_one_epoch(
    model,
    df_train_loader,
    device=CONFIG['device'],
    epoch=1,
    optimizer=optimizer,
    criterion=criterion,
    use_custom_score=False,
    metric_function=multiclass_auroc,
    num_classes=3,
    return_preds=True,
)

# One column per pretraining class (prediction columns 0, 1 and 2).
df_train['old_set_0'] = tmp_predictions_all[:, 0]
df_train['old_set_1'] = tmp_predictions_all[:, 1]
df_train['old_set_2'] = tmp_predictions_all[:, 2]

In [None]:
# Store the image id together with the three prediction columns as a parquet artifact.
df_train[['isic_id', 'old_set_0', 'old_set_1', 'old_set_2']].to_parquet('../data/artifacts/old_data_model_forecast_large.parquet')