In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# config
config = {
    "data_path": "../",
    "model": {
        "encoder_name": "timm-resnest26d",
        "loss_smooth": 1.0,
        "optimizer_params": {"lr": 0.003, "weight_decay": 0.0},
        "scheduler": {
            "name": "CosineAnnealingLR",
            "params": {
                "CosineAnnealingLR": {"T_max": 500, "eta_min": 1e-06, "last_epoch": -1},
                "ReduceLROnPlateau": {
                    "factor": 0.316,
                    "mode": "min",
                    "patience": 4,
                    "verbose": True,
                },
            },
        },
        "seg_model": "Unet",
    },
    "output_dir": "models",
    "progress_bar_refresh_rate": 50,
    "seed": 42,
    "train_bs": 128,
    "trainer": {
        "enable_progress_bar": True,
        "max_epochs": 30,
        "min_epochs": 10,
        "accelerator": "mps",
        "devices": 1,
    },
    "valid_bs": 128,
    "workers": 0,
    "device": "mps",
    "folds": {
        "n_splits": 4,
        "random_state": 42,
        "train_folds": [0, 1, 2, 3]
    }
}

import torch
import numpy as np
import torchvision.transforms as T

class ContrailsDataset(torch.utils.data.Dataset):
    def __init__(self, df, image_size=256, train=True):

        self.df = df
        self.trn = train
        self.normalize_image = T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        self.image_size = image_size
        if image_size != 256:
            self.resize_image = T.transforms.Resize(image_size)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        con_path = row.path
        con = np.load(str(con_path))

        img = con[..., :-1]
        label = con[..., -1]

        label = torch.tensor(label)

        img = torch.tensor(np.reshape(img, (256, 256, 3))).to(torch.float32).permute(2, 0, 1)

        if self.image_size != 256:
            img = self.resize_image(img)

        img = self.normalize_image(img)

        return img.float(), label.float()

    def __len__(self):
        return len(self.df)

In [5]:
import warnings
import gc

warnings.filterwarnings("ignore")

import os
import torch
import pandas as pd
import lightning.pytorch as pl
from pprint import pprint
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from torch import mps

contrails = os.path.join(config["data_path"], "contrails/")
data_path = os.path.join(config["data_path"], "data_df.csv")

data_df = pd.read_csv(data_path)

kf = KFold(n_splits=config["folds"]["n_splits"], shuffle=True, random_state=config["seed"])

for fold, (train_index, valid_index) in enumerate(kf.split(data_df)):
    train_df = data_df.iloc[train_index]
    valid_df = data_df.iloc[valid_index]

    dataset_train = ContrailsDataset(train_df, train=True)
    dataset_validation = ContrailsDataset(valid_df, train=False)

    data_loader_train = DataLoader(
        dataset_train, batch_size=config["train_bs"], shuffle=True, num_workers=config["workers"]
    )
    data_loader_validation = DataLoader(
        dataset_validation, batch_size=config["valid_bs"], shuffle=False, num_workers=config["workers"]
    )

    pl.seed_everything(config["seed"])

    filename = f"model_fold_{fold}"

    checkpoint_callback = ModelCheckpoint(
        monitor="val_iou",
        dirpath=config["output_dir"],
        mode="max",
        filename=filename,
        save_top_k=1,
        verbose=1,
    )

    progress_bar_callback = TQDMProgressBar(refresh_rate=config["progress_bar_refresh_rate"])

    early_stop_callback = EarlyStopping(monitor="val_loss", mode="min", patience=5, verbose=1)

    trainer = pl.Trainer(
        callbacks=[checkpoint_callback, early_stop_callback, progress_bar_callback], logger=None, **config["trainer"]
    )

    model = LightningModule(config["model"])

    trainer.fit(model, data_loader_train, data_loader_validation)
    torch.save(model.state_dict(), os.path.join(config["output_dir"], f"{filename}.pth"))
    model = model.to('cpu')
    mps.empty_cache()
    del (
        dataset_train,
        dataset_validation,
        train_df,
        valid_df,
        data_loader_train,
        data_loader_validation,
        model,
        trainer,
        checkpoint_callback,
        progress_bar_callback,
        early_stop_callback,
    )
    mps.empty_cache()
    gc.collect()

Global seed set to 42
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type     | Params
-----------------------------------------
0 | model       | Unet     | 24.0 M
1 | loss_module | DiceLoss | 0     
-----------------------------------------
24.0 M    Trainable params
0         Non-trainable params
24.0 M    Total params
96.134    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.446
Epoch 0, global step 132: 'val_iou' reached 0.39079 (best 0.39079), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_0-v11.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.027 >= min_delta = 0.0. New best score: 0.419
Epoch 1, global step 264: 'val_iou' reached 0.41666 (best 0.41666), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_0-v11.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.040 >= min_delta = 0.0. New best score: 0.379
Epoch 2, global step 396: 'val_iou' reached 0.45625 (best 0.45625), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_0-v11.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.373
Epoch 3, global step 528: 'val_iou' reached 0.46259 (best 0.46259), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_0-v11.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 660: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 792: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 924: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1056: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.373. Signaling Trainer to stop.
Epoch 8, global step 1188: 'val_iou' was not in top 1
Trainer was signaled to stop but the required `min_epochs=10` or `min_steps=None` has not been met. Training will continue...


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.010 >= min_delta = 0.0. New best score: 0.363
Epoch 9, global step 1320: 'val_iou' reached 0.47186 (best 0.47186), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_0-v11.ckpt' as top 1
Global seed set to 42
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type     | Params
-----------------------------------------
0 | model       | Unet     | 24.0 M
1 | loss_module | DiceLoss | 0     
-----------------------------------------
24.0 M    Trainable params
0         Non-trainable params
24.0 M    Total params
96.134    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.459
Epoch 0, global step 132: 'val_iou' reached 0.38010 (best 0.38010), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_1-v3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.046 >= min_delta = 0.0. New best score: 0.413
Epoch 1, global step 264: 'val_iou' reached 0.42148 (best 0.42148), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_1-v3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.034 >= min_delta = 0.0. New best score: 0.379
Epoch 2, global step 396: 'val_iou' reached 0.45538 (best 0.45538), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_1-v3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: 0.374
Epoch 3, global step 528: 'val_iou' reached 0.46179 (best 0.46179), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_1-v3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 660: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 792: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 924: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1056: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.374. Signaling Trainer to stop.
Epoch 8, global step 1188: 'val_iou' was not in top 1
Trainer was signaled to stop but the required `min_epochs=10` or `min_steps=None` has not been met. Training will continue...


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.369
Epoch 9, global step 1320: 'val_iou' reached 0.46701 (best 0.46701), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_1-v3.ckpt' as top 1
Global seed set to 42
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type     | Params
-----------------------------------------
0 | model       | Unet     | 24.0 M
1 | loss_module | DiceLoss | 0     
-----------------------------------------
24.0 M    Trainable params
0         Non-trainable params
24.0 M    Total params
96.134    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.452
Epoch 0, global step 132: 'val_iou' reached 0.38837 (best 0.38837), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.054 >= min_delta = 0.0. New best score: 0.398
Epoch 1, global step 264: 'val_iou' reached 0.43768 (best 0.43768), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.020 >= min_delta = 0.0. New best score: 0.378
Epoch 2, global step 396: 'val_iou' reached 0.45712 (best 0.45712), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.0. New best score: 0.374
Epoch 3, global step 528: 'val_iou' reached 0.46127 (best 0.46127), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_2.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 660: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 792: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 924: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1056: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.374. Signaling Trainer to stop.
Epoch 8, global step 1188: 'val_iou' was not in top 1
Trainer was signaled to stop but the required `min_epochs=10` or `min_steps=None` has not been met. Training will continue...


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.012 >= min_delta = 0.0. New best score: 0.362
Epoch 9, global step 1320: 'val_iou' reached 0.47313 (best 0.47313), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_2.ckpt' as top 1
Global seed set to 42
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name        | Type     | Params
-----------------------------------------
0 | model       | Unet     | 24.0 M
1 | loss_module | DiceLoss | 0     
-----------------------------------------
24.0 M    Trainable params
0         Non-trainable params
24.0 M    Total params
96.134    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.672
Epoch 0, global step 132: 'val_iou' reached 0.20355 (best 0.20355), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.257 >= min_delta = 0.0. New best score: 0.415
Epoch 1, global step 264: 'val_iou' reached 0.42084 (best 0.42084), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.032 >= min_delta = 0.0. New best score: 0.383
Epoch 2, global step 396: 'val_iou' reached 0.45298 (best 0.45298), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: 0.377
Epoch 3, global step 528: 'val_iou' reached 0.45847 (best 0.45847), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_3.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 660: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 792: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 924: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 1056: 'val_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.377. Signaling Trainer to stop.
Epoch 8, global step 1188: 'val_iou' was not in top 1
Trainer was signaled to stop but the required `min_epochs=10` or `min_steps=None` has not been met. Training will continue...


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.013 >= min_delta = 0.0. New best score: 0.364
Epoch 9, global step 1320: 'val_iou' reached 0.47118 (best 0.47118), saving model to '/Users/johnny/Library/CloudStorage/OneDrive-Personal/py/Kaggle/contrails/notebooks/models/model_fold_3.ckpt' as top 1


In [6]:
# save the model
torch.save(model.state_dict(), 'models/model.pt')

NameError: name 'model' is not defined

In [None]:
data_df = pd.concat([train_df, valid_df])
data_df = data_df.reset_index(drop=True)
data_df.to_csv("../data_df.csv", index=False)

In [None]:
torch.mps.driver_allocated_memory()