# Semantic Segmentation to Identify Cloud Cover in Satellite Imagery
## PyTorch Implementation with Inception V4 Backbone Model

---
### Step 1: Import packages
---

In [1]:
%%capture
!pip install pandas_path pytorch_lightning cloudpathlib loguru typer wandb albumentations# added wandb
%load_ext autoreload
%autoreload 2

In [2]:
# Loading packages according to above may be problematic, reloading them below with an alternative method
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/
import sys
!{sys.executable} -m pip install pandas_path pytorch_lightning cloudpathlib loguru typer wandb albumentations



In [3]:
# Installing wandb can be difficult, you may need to 'pip install wandb' from the terminal 
!pip install wandb -qqq

In [None]:
import csv
import torch
import wandb
import pyproj
import shutil
import warnings
import rasterio
import rioxarray

import numpy as np
import pandas as pd
import albumentations as A
import pytorch_lightning as pl
import matplotlib.pyplot as plt
import xrspatial.multispectral as ms
import segmentation_models_pytorch as smp

from PIL import Image
from pathlib import Path
from pandas_path import path
from typing import Optional, List

from helper_functions import *

<h2><font color='red'>Important: change the <code>project</code> parameter of the Weights &amp; Biases logger (<code>WandbLogger</code>) below before every run</font></h2>

#### Step 1a (Optional): Log training to wandb.ai

In [6]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer

wandb_logger = WandbLogger(project="inception_v4", entity="w207-clouds")  # Change this project parameter before every run

---
### Step 2: Define working directories and global variables
---

In [7]:
DATA_DIR = Path("/driven-data/cloud-cover")
TRAIN_FEATURES = DATA_DIR / "train_features"
TRAIN_LABELS = DATA_DIR / "train_labels"

assert TRAIN_FEATURES.exists()

In [1]:
BANDS = ["B02", "B03", "B04", "B08"] # Identifiers of the satellite image bands (one TIF per band per chip) used as model input channels

In [9]:
train_meta = pd.read_csv(DATA_DIR / "train_metadata.csv")
display(train_meta)

Unnamed: 0,chip_id,location,datetime,cloudpath
0,adwp,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/adwp
1,adwu,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/adwu
2,adwz,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/adwz
3,adxp,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/adxp
4,aeaj,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/aeaj
...,...,...,...,...
11743,zxwv,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxwv
11744,zxxo,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxxo
11745,zxym,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxym
11746,zxza,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxza


#### Step 2a: Define Classes

In [10]:
# create benchmark_src folder
submission_dir = Path("benchmark_src")
if submission_dir.exists():
    shutil.rmtree(submission_dir)

submission_dir.mkdir(parents=True)

Added `Albumentations` transforms in CloudDataset

In [11]:
%%file {submission_dir}/cloud_dataset.py
import numpy as np
import pandas as pd
import rasterio
import torch
from typing import Optional, List
import albumentations as A


class CloudDataset(torch.utils.data.Dataset):
    """Reads in images, transforms pixel values, and serves a
    dictionary containing chip ids, image tensors, and
    label masks (where available).

    Rows are addressed by position, so ``x_paths`` and ``y_paths`` must list the
    chips in the same order; their index labels are irrelevant.
    """

    def __init__(
        self,
        x_paths: pd.DataFrame, # : syntax specifies datatype for each parameter for CloudDataset objects
        bands: List[str],
        y_paths: Optional[pd.DataFrame] = None,
        transforms: Optional[list] = None,
    ):
        """
        Instantiate the CloudDataset class.

        Args:
            x_paths (pd.DataFrame): a dataframe with a row for each chip. There must be a column for chip_id,
                and a column with the path to the TIF for each of bands
            bands (list[str]): list of the bands included in the data
            y_paths (pd.DataFrame, optional): a dataframe with a row for each chip and columns for chip_id
                and the path to the label TIF with ground truth cloud cover
            transforms (list, optional): transforms to apply to the feature data (eg augmentations).
                In practice this is an ``A.Compose`` pipeline: it is called as
                ``transforms(image=...)`` and iterated to pick out its spatial members.
        """
        self.data = x_paths
        self.label = y_paths
        self.transforms = transforms
        self.bands = bands

    def __len__(self):
        """Number of chips in the dataset."""
        return len(self.data)

    @staticmethod
    def _spatial_transform_types():
        """Albumentations transform classes that are valid to apply to a label mask.

        Spatial transforms are valid transforms to apply to label (unlike pixel transforms).
        The list is only built when label augmentation is actually requested.
        """
        return [A.augmentations.geometric.transforms.Affine,
                A.augmentations.crops.transforms.CenterCrop,
                A.augmentations.transforms.CoarseDropout,
                A.augmentations.crops.transforms.Crop,
                A.augmentations.crops.transforms.CropAndPad,
                A.augmentations.crops.transforms.CropNonEmptyMaskIfExists,
                A.augmentations.geometric.transforms.ElasticTransform,
                A.augmentations.transforms.Flip,
                A.augmentations.transforms.GridDistortion,
                A.augmentations.transforms.GridDropout,
                A.augmentations.transforms.HorizontalFlip,
                A.augmentations.transforms.Lambda,
                A.augmentations.geometric.resize.LongestMaxSize,
                A.augmentations.transforms.MaskDropout,
                A.augmentations.transforms.NoOp,
                A.augmentations.transforms.OpticalDistortion,
                A.augmentations.transforms.PadIfNeeded,
                A.augmentations.geometric.transforms.Perspective,
                A.augmentations.geometric.transforms.PiecewiseAffine,
                # A.augmentations.transforms.PixelDropout, # doesn't match docs for some reason
                A.augmentations.crops.transforms.RandomCrop,
                A.augmentations.crops.transforms.RandomCropNearBBox,
                A.augmentations.transforms.RandomGridShuffle,
                A.augmentations.crops.transforms.RandomResizedCrop,
                A.augmentations.geometric.rotate.RandomRotate90,
                A.augmentations.geometric.resize.RandomScale,
                A.augmentations.crops.transforms.RandomSizedBBoxSafeCrop,
                A.augmentations.crops.transforms.RandomSizedCrop,
                A.augmentations.geometric.resize.Resize,
                A.augmentations.geometric.rotate.Rotate,
                A.augmentations.geometric.rotate.SafeRotate,
                A.augmentations.geometric.transforms.ShiftScaleRotate,
                A.augmentations.geometric.resize.SmallestMaxSize,
                A.augmentations.transforms.Transpose,
                A.augmentations.transforms.VerticalFlip]

    # Similar to pop, helps iterate through dataset
    def __getitem__(self, idx: int):
        """Load one chip (and its label, if available) by position.

        Args:
            idx (int): zero-based position of the chip, as supplied by a DataLoader sampler.

        Returns:
            dict: ``{"chip_id": ..., "chip": ndarray (bands, H, W)}`` plus ``"label"``
            (ndarray (H, W)) when ``y_paths`` was provided.
        """
        # Positional lookup: a DataLoader hands out 0..len-1, which only matches
        # index *labels* when the dataframe index happens to be a fresh RangeIndex.
        img = self.data.iloc[idx]

        # Loads an n-channel image from a chip-level dataframe
        band_arrs = []
        for band in self.bands:
            with rasterio.open(img[f"{band}_path"]) as b:
                band_arrs.append(b.read(1).astype("float32"))
        x_arr = np.stack(band_arrs, axis=-1) # (H, W, n_bands)

        # Apply augmentations, if provided
        if self.transforms:
            # Each band is transformed on its own as a single-channel image and the
            # results are re-stacked, for however many bands were requested.
            # NOTE(review): every call samples its random parameters independently, so
            # randomised *spatial* transforms may move bands (and the label below)
            # differently -- confirm before enabling such transforms.
            x_arr = np.dstack(
                [
                    self.transforms(image=x_arr[:, :, band_idx])["image"]
                    for band_idx in range(x_arr.shape[-1])
                ]
            )

        # re-orders array to match expected format needed for model
        x_arr = np.transpose(x_arr, [2, 0, 1])

        # Prepare dictionary for item
        item = {"chip_id": img.chip_id, "chip": x_arr}

        # Load label if available
        if self.label is not None:
            label_path = self.label.iloc[idx].label_path
            with rasterio.open(label_path) as lp:
                y_arr = lp.read(1).astype("float32")

            # Apply data augmentations to the label - ONLY SPATIAL TRANSFORMS CAN BE APPLIED TO LABEL
            if self.transforms:
                spatial_types = self._spatial_transform_types()
                valid_label_transforms = [
                    transform for transform in self.transforms if type(transform) in spatial_types
                ]

                # Keep the label pipeline in a local variable: assigning it to
                # self.transforms would strip the pixel-level augmentations from
                # every sample loaded after this one.
                label_transforms = A.Compose(valid_label_transforms)
                y_arr = label_transforms(image=y_arr)["image"]

            item["label"] = y_arr

        return item

Writing benchmark_src/cloud_dataset.py


Added way to activate transforms in call to CloudDataset

In [12]:
%%file {submission_dir}/cloud_model.py
from typing import Optional, List
import albumentations as A
import pandas as pd
import pytorch_lightning as pl
import segmentation_models_pytorch as smp
import torch

try:
    from cloud_dataset import CloudDataset
    from losses import intersection_over_union
except ImportError:
    from benchmark_src.cloud_dataset import CloudDataset
    from benchmark_src.losses import intersection_over_union


class CloudModel(pl.LightningModule):
    """U-Net cloud segmentation model wrapped as a LightningModule.

    Owns the training/validation ``CloudDataset`` objects, the segmentation
    network, the optimizer configuration and the per-step loss / IoU logging.
    """

    def __init__(
        self,
        bands: List[str],
        x_train: Optional[pd.DataFrame] = None,
        y_train: Optional[pd.DataFrame] = None,
        x_val: Optional[pd.DataFrame] = None,
        y_val: Optional[pd.DataFrame] = None,
        hparams: dict = {},
    ):
        """
        Instantiate the CloudModel class based on the pl.LightningModule
        (https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html).

        Args:
            bands (list[str]): Names of the bands provided for each chip
            x_train (pd.DataFrame, optional): a dataframe of the training features with a row for each chip.
                There must be a column for chip_id, and a column with the path to the TIF for each of bands.
                Required for model training
            y_train (pd.DataFrame, optional): a dataframe of the training labels with a row for each chip
                and columns for chip_id and the path to the label TIF with ground truth cloud cover.
                Required for model training
            x_val (pd.DataFrame, optional): a dataframe of the validation features with a row for each chip.
                There must be a column for chip_id, and a column with the path to the TIF for each of bands.
                Required for model training
            y_val (pd.DataFrame, optional): a dataframe of the validation labels with a row for each chip
                and columns for chip_id and the path to the label TIF with ground truth cloud cover.
                Required for model training
            hparams (dict, optional): Dictionary of additional modeling parameters. Recognised keys:
                "backbone", "weights", "lr", "patience", "num_workers", "batch_size", "gpu".
                The default dict is only read, never mutated.
        """
        super().__init__()
        self.hparams.update(hparams)
        self.save_hyperparameters()

        # required
        self.bands = bands

        # optional modeling params
        self.backbone = self.hparams.get("backbone", "inceptionv4")
        self.weights = self.hparams.get("weights", "imagenet")
        self.learning_rate = self.hparams.get("lr", 1e-3)
        self.patience = self.hparams.get("patience", 8)
        self.num_workers = self.hparams.get("num_workers", 2)
        self.batch_size = self.hparams.get("batch_size", 32)
        self.gpu = self.hparams.get("gpu", False)

        # Leave self.transform = None in order to NOT apply augmentations in
        # call to CloudDataset
        self.transform = None

        # Uncomment line below and edit transforms in call to A.Compose to apply
        # augmentations from Albumentations
        # self.transform = A.Compose([A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2, always_apply=False, p=0.5)])

        # Instantiate datasets, model, and trainer params if provided
        self.train_dataset = CloudDataset(
            x_paths=x_train,
            bands=self.bands,
            y_paths=y_train,
            transforms=self.transform, # Will apply transforms if self.transform = A.Compose() is uncommented
        )
        # Validation data is never augmented so the metric stays comparable across epochs
        self.val_dataset = CloudDataset(
            x_paths=x_val,
            bands=self.bands,
            y_paths=y_val,
            transforms=None,
        )
        self.model = self._prepare_model()

    ## Required LightningModule methods ##

    def forward(self, image: torch.Tensor):
        """Run the segmentation network on a batch of chips and return its raw outputs."""
        # Forward pass
        return self.model(image)

    def training_step(self, batch: dict, batch_idx: int):
        """
        Training step.

        Args:
            batch (dict): dictionary of items from CloudDataset of the form
                {'chip_id': list[str], 'chip': list[torch.Tensor], 'label': list[torch.Tensor]}
            batch_idx (int): batch number

        Returns:
            torch.Tensor: mean per-pixel cross-entropy loss for the batch

        Raises:
            ValueError: if the model was instantiated without training data
        """
        if self.train_dataset.data is None:
            raise ValueError(
                "x_train and y_train must be specified when CloudModel is instantiated to run training"
            )

        # Switch on training mode
        self.model.train()
        torch.set_grad_enabled(True)

        # Load images and labels
        x = batch["chip"]
        y = batch["label"].long()
        if self.gpu:
            x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True)

        # Forward pass
        preds = self.forward(x)

        # Log batch loss
        loss = torch.nn.CrossEntropyLoss(reduction="none")(preds, y).mean()
        self.log(
            "loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch: dict, batch_idx: int):
        """
        Validation step.

        Args:
            batch (dict): dictionary of items from CloudDataset of the form
                {'chip_id': list[str], 'chip': list[torch.Tensor], 'label': list[torch.Tensor]}
            batch_idx (int): batch number

        Returns:
            intersection-over-union of the thresholded predictions for the batch

        Raises:
            ValueError: if the model was instantiated without validation data
        """
        if self.val_dataset.data is None:
            raise ValueError(
                "x_val and y_val must be specified when CloudModel is instantiated to run validation"
            )

        # Switch on validation mode
        self.model.eval()
        torch.set_grad_enabled(False)

        # Load images and labels
        x = batch["chip"]
        y = batch["label"].long()
        if self.gpu:
            x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True)

        # Forward pass & softmax
        preds = self.forward(x)
        # Keep the probability of class index 1 and threshold it at 0.5
        preds = torch.softmax(preds, dim=1)[:, 1]
        preds = (preds > 0.5) * 1  # convert to int

        # Log batch IOU
        batch_iou = intersection_over_union(preds, y)
        self.log(
            "iou", batch_iou, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        return batch_iou

    def train_dataloader(self):
        """Shuffled DataLoader over the training dataset."""
        # DataLoader class for training
        return torch.utils.data.DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=True,
            pin_memory=True,
        )

    def val_dataloader(self):
        """Ordered, single-process DataLoader over the validation dataset."""
        # DataLoader class for validation
        return torch.utils.data.DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=0,
            shuffle=False,
            pin_memory=True,
        )

    def configure_optimizers(self):
        """Adam optimizer with a cosine-annealing learning-rate schedule (T_max=10)."""
        opt = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        sch = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=10)
        return [opt], [sch]

    ## Convenience Methods ##

    def _prepare_model(self):
        """Build the U-Net with the configured encoder, sized to the requested bands."""
        # Instantiate U-Net model
        unet_model = smp.Unet(
            encoder_name=self.backbone,
            encoder_weights=self.weights,
            # One input channel per band: CloudDataset stacks len(bands) arrays per chip
            in_channels=len(self.bands),
            classes=2,
        )
        if self.gpu:
            unet_model.cuda()

        return unet_model

Writing benchmark_src/cloud_model.py


#### Step 2b: Define Functions

In [13]:
def add_paths(df, feature_dir, label_dir=None, bands=BANDS):
    """
    Given dataframe with a column for chip_id, returns a dataframe with a column
    added indicating the path to each band's TIF image as "{band}_path", eg "B02_path".
    A column is also added to the dataframe with paths to the label TIF, if the
    path to the labels directory is provided.

    The columns are added to ``df`` in place and the same object is returned.

    Args:
        df (pd.DataFrame): chip-level dataframe with a "chip_id" column
        feature_dir (Path): directory holding one sub-directory of band TIFs per chip
        label_dir (Path, optional): directory holding one "<chip_id>.tif" label per chip
        bands (list[str]): band names to add path columns for

    Raises:
        AssertionError: if any path in a random spot-check sample does not exist
    """
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so spot-check at most 40 rows but never more than the frame holds.
    n_check = min(40, len(df))

    for band in bands:
        df[f"{band}_path"] = feature_dir / df["chip_id"] / f"{band}.tif"
        # make sure a random sample of paths exist
        assert df.sample(n=n_check, random_state=5)[f"{band}_path"].path.exists().all(), (
            f"Some {band} feature paths do not exist under {feature_dir}"
        )
    if label_dir is not None:
        df["label_path"] = label_dir / (df["chip_id"] + ".tif")
        # make sure a random sample of paths exist
        assert df.sample(n=n_check, random_state=5)["label_path"].path.exists().all(), (
            f"Some label paths do not exist under {label_dir}"
        )

    return df

In [14]:
%%file {submission_dir}/losses.py
import numpy as np

# Evaluation metric (used for validation logging, not as a training loss)
def intersection_over_union(pred, true, empty_score=1.0):
    """
    Calculates the intersection-over-union (Jaccard index) for a batch of images.

    Pixels whose label equals 255 are excluded from both tensors before scoring.

    Args:
        pred (torch.Tensor): a tensor of predictions (non-zero = positive)
        true (torch.Tensor): a tensor of labels, same shape as ``pred``
        empty_score (float): value returned when the union is empty, i.e. neither
            tensor has a positive valid pixel. Defaults to 1.0 (two empty sets agree
            perfectly); without this guard the result would be 0/0 = NaN, which
            corrupts any epoch-level average of the metric.

    Returns:
        torch.Tensor: scalar CPU tensor holding total intersection / total union
    """
    import torch  # local import: the surrounding module only imports numpy

    valid_pixel_mask = true.ne(255)  # valid pixel mask
    true = true.masked_select(valid_pixel_mask).to("cpu")
    pred = pred.masked_select(valid_pixel_mask).to("cpu")

    # Intersection and union totals
    intersection_total = torch.logical_and(true, pred).sum()
    union_total = torch.logical_or(true, pred).sum()
    if union_total.item() == 0:
        return torch.tensor(float(empty_score))
    return intersection_total / union_total

Writing benchmark_src/losses.py


In [15]:
# Import the cloud model we just saved.
from benchmark_src.cloud_model import CloudModel

---
### Step 3: Preprocessing
---

#### Step 3a: Establish a standard collection of chips for model testing--remove bad chips before starting

Upon visual inspection, we discovered that some chips whose labels indicated "no cloud" actually had clearly definable clouds in them, and some chips whose labels indicated "100% cloud" were actually not completely covered by clouds.

While it was not possible to inspect every pixel of every chip in the dataset, we felt that the most egregious labelling errors from the total universe of chips under consideration should be removed before they were split into training, validation, and test sets.

In [16]:
# Add the per-band feature paths and the label path for every chip.
train_meta = add_paths(train_meta, TRAIN_FEATURES, TRAIN_LABELS)
# The row count at this point is the original size of our universe of chips.
print(f"Our initial universe consists of {train_meta.shape[0]} chips.")

Our initial universe consists of 11748 chips.


In [17]:
# We create our own copy of the data called "full_set" for subsequent use
feature_cols = ["chip_id"] + [f"{band}_path" for band in BANDS]
full_set = train_meta.copy()
full_set_x=full_set[feature_cols].copy()
full_set_y=full_set[["chip_id", "label_path"]].copy()

In [18]:
# This function allows you to display a chip by name (useful for debugging)

# display_true_color_label_pixel_count('byka', full_set)

##### Collecting lists of mislabelled chips and put them in csv files...

In [20]:
# Our focus was on removing chips that were incorrectly labelled "no cloud" or "fully cloud":

# -Some chips were labelled as having 0 cloud pixels when they clearly had clouds upon visual inspection

# -Some chips were labelled as if every pixel was a cloud, even though the chip did not have full cloud cover

# # This cell takes about 10-12 minutes to run--leave this cell and the ones below commented out unless you want to
# # recount the list of chips whose cells are either 0 clouds or all clouds. Since these outputs are written to csv files
# # below, they do not need to be rerun each time, the information can be pulled directly from the csv files.

# list_of_no_cloud_chips = []
# list_of_all_cloud_chips = []
# for index, row in full_set_y.iterrows():
#     index_id = index
#     chip_id = row[0]
#     label_path = row[1]
#     with Image.open(label_path) as im:
#         label_arr = np.array(im)
#         ones_count = np.count_nonzero(label_arr)
#         if ones_count == 0:
#             list_of_no_cloud_chips.append(chip_id)
#         if ones_count == 512*512:
#             list_of_all_cloud_chips.append(chip_id)

In [21]:
# # Prints a list of chips labelled "NO CLOUD"
# print('list of chips labelled "NO CLOUD:"')
# print(list_of_no_cloud_chips)

# # Prints a list of chips labelled "ALL CLOUD"
# print('\nlist of chips labelled "ALL CLOUD:"')
# print(list_of_all_cloud_chips)

In [22]:
# # Writing lists of 'no cloud' and 'all cloud' chips to csv files (leave this section commented out after data cleaned)...

# # Store chips with no clouds in 'no_cloud_chips.csv'

# list_of_no_cloud_chips_df = pd.DataFrame(list_of_no_cloud_chips, columns=['chip_id'])
# list_of_no_cloud_chips_df.to_csv('no_cloud_chips.csv', index=False, header=False)
# with open('no_cloud_chips.csv', 'w+', newline='') as file:     
#     write = csv.writer(file, delimiter='\n', lineterminator='\n') 
#     write.writerow(list_of_no_cloud_chips) 


# # Store chips labelled as all cloud in 'all_cloud_chips.csv'

# list_of_all_cloud_chips_df = pd.DataFrame(list_of_all_cloud_chips, columns=['chip_id'])
# list_of_all_cloud_chips_df.to_csv('all_cloud_chips.csv', index=False, header=False)
# with open('all_cloud_chips.csv', 'w+', newline='') as file:     
#     write = csv.writer(file, delimiter='\n', lineterminator='\n') 
#     write.writerow(list_of_all_cloud_chips) 

In [23]:
## Visually inspect chips whose labels have 0 cloud pixels and manually record mislabelled chips in a separate csv file (leave this section commented out after data cleaned)

# # Select chips from csv files for display
# with open('no_cloud_chips.csv', newline='') as f:
#     reader = csv.reader(f)
#     no_cloud_list = list(reader)

# # flatten list
# flat_no_cloud_list = [item for sublist in no_cloud_list for item in sublist]
# print(f"There are {len(flat_no_cloud_list)} chips out of the original {full_set.shape[0]} whose labels indicate 0 cloud pixels.") 
# # print(flat_no_cloud_list)   # Uncomment this line to see a listing of chips whose labels have 0 cloud pixels

In [24]:
# # subset a dataframe of just those chips
# inspect_no_clouds_set = full_set.loc[full_set['chip_id'].isin(flat_no_cloud_list)]

In [25]:
# # visually inspect these chips in groups of 100, and record (in a spreadsheet) which images clearly have clouds--
# # these are mislabelled and may throw off our models (it may take a minute or two to display)

# inspect_no_clouds_subset = inspect_no_clouds_set[0:100]  # adjust this slice index to display [100:200], [200:300], etc.

# fig, axs = plt.subplots(20, 5, figsize=(20,80))
# fig.tight_layout()
# counter = 0
# for i in range(20):
#     for j in range(5):
#         example_chip = inspect_no_clouds_subset.iloc[counter]
#         im = true_color_img(example_chip.chip_id)
#         axs[i,j].imshow(im)
#         axs[i,j].tick_params(left=False, labelleft=False, right=False)
#         axs[i,j].patch.set_visible(False)
#         axs[i,j].set_title(example_chip.chip_id, fontsize=20)
#         axs[i,j].axis('off')
#         counter += 1
# plt.show()

In [26]:
## Visually inspect chips whose labels have 100% (512 x 512 = 262,144) cloud pixels and manually record mislabelled chips in a separate csv file 

# # Select chips from csv files for display
# with open('all_cloud_chips.csv', newline='') as f:
#     reader = csv.reader(f)
#     all_cloud_list = list(reader)

# # flatten list
# flat_all_cloud_list = [item for sublist in all_cloud_list for item in sublist]
# print(f"There are {len(flat_all_cloud_list)} chips out of the original {full_set.shape[0]} whose labels indicate every pixel is a cloud.") 
# # print(flat_no_cloud_list)   # Uncomment this line to see a listing of chips whose labels have 0 cloud pixels

In [27]:
# # subset a dataframe of just those chips
# inspect_all_clouds_set = full_set.loc[full_set['chip_id'].isin(flat_all_cloud_list)]

In [28]:
# # visually inspect these chips in groups of 100, and record (in a spreadsheet) which images are clearly not fully covered by clouds--
# # these are mislabelled and may throw off our models

# inspect_all_clouds_subset = inspect_all_clouds_set[0:100]  # adjust this slice index to see [100:200], [200:300], etc.

# fig, axs = plt.subplots(20, 5, figsize=(20,80))
# fig.tight_layout()
# counter = 0
# for i in range(20):
#     for j in range(5):
#         example_chip = inspect_all_clouds_subset.iloc[counter]
#         im = true_color_img(example_chip.chip_id)
#         axs[i,j].imshow(im)
#         axs[i,j].tick_params(left=False, labelleft=False, right=False)
#         axs[i,j].patch.set_visible(False)
#         axs[i,j].set_title(example_chip.chip_id, fontsize=20)
#         axs[i,j].axis('off')
#         counter += 1
# plt.show()

In [29]:
## Visually inspect the chips to be rejected
# Note: indexes in the rejection_list_set[0:205] were rejected because they were labelled "no clouds" when they had clouds.  Higher indexes were rejected because they were labelled "100% clouds" when clearly not every pixel was a cloud.

# # Select chips from csv files for display
# with open('incorrectly_labeled_chips.csv', newline='') as f:
#     reader = csv.reader(f)
#     all_cloud_list = list(reader)

# # flatten list
# rejection_list = [item for sublist in all_cloud_list for item in sublist]
# print(f"There are {len(rejection_list)} chips out of the original {full_set.shape[0]} whose labels indicate every pixel is a cloud.") 
# # print(flat_no_cloud_list)   # Uncomment this line to see a listing of chips whose labels have 0 cloud pixels

In [30]:
# # subset a dataframe of just those chips
# rejection_list_set = full_set.loc[full_set['chip_id'].isin(rejection_list)]

In [31]:
# # visually inspect these chips in groups of 100, and record (in a spreadsheet) which images clearly have clouds--
# # these are mislabelled and may throw off our models

# rejection_list_subset = rejection_list_set[0:100]  # adjust this slice index to see [100:200], [200:300], etc.

# fig, axs = plt.subplots(20, 5, figsize=(20,80))
# fig.tight_layout()
# counter = 0
# for i in range(20):
#     for j in range(5):
#         example_chip = rejection_list_subset.iloc[counter]
#         im = true_color_img(example_chip.chip_id)
#         axs[i,j].imshow(im)
#         axs[i,j].tick_params(left=False, labelleft=False, right=False)
#         axs[i,j].patch.set_visible(False)
#         axs[i,j].set_title(example_chip.chip_id, fontsize=20)
#         axs[i,j].axis('off')
#         counter += 1
# plt.show()

#### Step 3b:  Remove the mislabeled chips from the model

In [32]:
full_set_cleaned = remove_chips(full_set, 'incorrectly_labeled_chips.csv') # train_meta renamed to this smaller group
display(full_set_cleaned)
print(f"A total of {full_set.shape[0]-full_set_cleaned.shape[0]} chips that were deemed mislabled have been removed the original {full_set.shape[0]} chips.  Our full_set dataset now contains {full_set_cleaned.shape[0]} chips.") 

Unnamed: 0,chip_id,location,datetime,cloudpath,B02_path,B03_path,B04_path,B08_path,label_path
0,adwp,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/adwp,/driven-data/cloud-cover/train_features/adwp/B...,/driven-data/cloud-cover/train_features/adwp/B...,/driven-data/cloud-cover/train_features/adwp/B...,/driven-data/cloud-cover/train_features/adwp/B...,/driven-data/cloud-cover/train_labels/adwp.tif
1,adwu,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/adwu,/driven-data/cloud-cover/train_features/adwu/B...,/driven-data/cloud-cover/train_features/adwu/B...,/driven-data/cloud-cover/train_features/adwu/B...,/driven-data/cloud-cover/train_features/adwu/B...,/driven-data/cloud-cover/train_labels/adwu.tif
3,adxp,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/adxp,/driven-data/cloud-cover/train_features/adxp/B...,/driven-data/cloud-cover/train_features/adxp/B...,/driven-data/cloud-cover/train_features/adxp/B...,/driven-data/cloud-cover/train_features/adxp/B...,/driven-data/cloud-cover/train_labels/adxp.tif
4,aeaj,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/aeaj,/driven-data/cloud-cover/train_features/aeaj/B...,/driven-data/cloud-cover/train_features/aeaj/B...,/driven-data/cloud-cover/train_features/aeaj/B...,/driven-data/cloud-cover/train_features/aeaj/B...,/driven-data/cloud-cover/train_labels/aeaj.tif
5,aeap,Chifunfu,2020-04-29T08:20:47Z,az://./train_features/aeap,/driven-data/cloud-cover/train_features/aeap/B...,/driven-data/cloud-cover/train_features/aeap/B...,/driven-data/cloud-cover/train_features/aeap/B...,/driven-data/cloud-cover/train_features/aeap/B...,/driven-data/cloud-cover/train_labels/aeap.tif
...,...,...,...,...,...,...,...,...,...
11741,zxuw,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxuw,/driven-data/cloud-cover/train_features/zxuw/B...,/driven-data/cloud-cover/train_features/zxuw/B...,/driven-data/cloud-cover/train_features/zxuw/B...,/driven-data/cloud-cover/train_features/zxuw/B...,/driven-data/cloud-cover/train_labels/zxuw.tif
11742,zxvi,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxvi,/driven-data/cloud-cover/train_features/zxvi/B...,/driven-data/cloud-cover/train_features/zxvi/B...,/driven-data/cloud-cover/train_features/zxvi/B...,/driven-data/cloud-cover/train_features/zxvi/B...,/driven-data/cloud-cover/train_labels/zxvi.tif
11744,zxxo,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxxo,/driven-data/cloud-cover/train_features/zxxo/B...,/driven-data/cloud-cover/train_features/zxxo/B...,/driven-data/cloud-cover/train_features/zxxo/B...,/driven-data/cloud-cover/train_features/zxxo/B...,/driven-data/cloud-cover/train_labels/zxxo.tif
11745,zxym,Launceston,2020-09-06T00:08:20Z,az://./train_features/zxym,/driven-data/cloud-cover/train_features/zxym/B...,/driven-data/cloud-cover/train_features/zxym/B...,/driven-data/cloud-cover/train_features/zxym/B...,/driven-data/cloud-cover/train_features/zxym/B...,/driven-data/cloud-cover/train_labels/zxym.tif


A total of 762 chips that were deemed mislabled have been removed the original 11748 chips.  Our full_set dataset now contains 10986 chips.


<a id='split-data'></a>

#### Step 3c: Split the data  

To train our model, we want to separate the data into a "training", "validation", and "test" set. That way we'll have a portion of labelled data that was not used in model training, which can give us a more accurate sense of how our model will perform on the competition test data. Remember, none of the test set locations are in the competition training data, so your model's performance will ultimately be measured on unseen locations.

For each geography, we split our chips randomly into 60% training, 20% validation and 20% test.

In [33]:
train, val, test = train_val_test_split(full_set_cleaned, column='location', pct_train=0.6, pct_val=0.2, pct_test=0.2, random_state=42)   

In [34]:
full_set_cleaned.shape, train.shape, val.shape, test.shape

((10986, 9), (6556, 9), (2198, 9), (2232, 9))

In [35]:
# Carve every split into a feature frame (chip id + one path column per band)
# and a label frame (chip id + label path); copies keep them independent of
# the source split frames
feature_cols = ["chip_id", *(f"{band}_path" for band in BANDS)]
label_cols = ["chip_id", "label_path"]

train_x, train_y = train[feature_cols].copy(), train[label_cols].copy()
val_x, val_y = val[feature_cols].copy(), val[label_cols].copy()
test_x, test_y = test[feature_cols].copy(), test[label_cols].copy()

---
### Step 4: Model Training
---

In [36]:
warnings.filterwarnings("ignore")

In [37]:
## Build the model, its callbacks and the pytorch_lightning.Trainer object
cloud_model = CloudModel(
    bands=BANDS,
    x_train=train_x,
    y_train=train_y,
    x_val=val_x,
    y_val=val_y,
    hparams=dict(num_workers=7, batch_size=16),
)

# Both callbacks watch the same metric, where larger values are better
monitor_settings = dict(monitor="iou_epoch", mode="max", verbose=True)

# Writes a checkpoint whenever the monitored metric sets a new best
checkpoint_callback = pl.callbacks.ModelCheckpoint(**monitor_settings)

# Ends training once the metric has stopped improving; the patience value
# is read from the model itself
early_stopping_callback = pl.callbacks.early_stopping.EarlyStopping(
    patience=cloud_model.patience,
    **monitor_settings,
)

trainer = pl.Trainer(
    logger=wandb_logger,
    callbacks=[checkpoint_callback, early_stopping_callback],
    gpus=1,
    fast_dev_run=False,
)

Downloading: "http://data.lip6.fr/cadene/pretrainedmodels/inceptionv4-8e4777a0.pth" to /home/jovyan/.cache/torch/hub/checkpoints/inceptionv4-8e4777a0.pth


  0%|          | 0.00/163M [00:00<?, ?B/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [38]:
# Run the training loop; the callbacks registered on the trainer handle
# checkpointing and early stopping
trainer.fit(cloud_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mw207-clouds[0m (use `wandb login --relogin` to force relogin)



  | Name  | Type | Params
-------------------------------
0 | model | Unet | 48.8 M
-------------------------------
48.8 M    Trainable params
0         Non-trainable params
48.8 M    Total params
195.169   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric iou_epoch improved. New best score: 0.869
Epoch 0, global step 409: iou_epoch reached 0.86902 (best 0.86902), saving model to "/home/jovyan/ucb_mids_w207_final_project/notebooks/inception_v4/1r4fxc54/checkpoints/epoch=0-step=409.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Metric iou_epoch improved by 0.020 >= min_delta = 0.0. New best score: 0.889
Epoch 1, global step 819: iou_epoch reached 0.88887 (best 0.88887), saving model to "/home/jovyan/ucb_mids_w207_final_project/notebooks/inception_v4/1r4fxc54/checkpoints/epoch=1-step=819.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 1229: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Metric iou_epoch improved by 0.004 >= min_delta = 0.0. New best score: 0.893
Epoch 3, global step 1639: iou_epoch reached 0.89276 (best 0.89276), saving model to "/home/jovyan/ucb_mids_w207_final_project/notebooks/inception_v4/1r4fxc54/checkpoints/epoch=3-step=1639.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 2049: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Metric iou_epoch improved by 0.003 >= min_delta = 0.0. New best score: 0.896
Epoch 5, global step 2459: iou_epoch reached 0.89575 (best 0.89575), saving model to "/home/jovyan/ucb_mids_w207_final_project/notebooks/inception_v4/1r4fxc54/checkpoints/epoch=5-step=2459.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Metric iou_epoch improved by 0.007 >= min_delta = 0.0. New best score: 0.903
Epoch 6, global step 2869: iou_epoch reached 0.90319 (best 0.90319), saving model to "/home/jovyan/ucb_mids_w207_final_project/notebooks/inception_v4/1r4fxc54/checkpoints/epoch=6-step=2869.ckpt" as top 1
wandb: Network error (ReadTimeout), entering retry loop.


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 3279: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Metric iou_epoch improved by 0.003 >= min_delta = 0.0. New best score: 0.906
Epoch 8, global step 3689: iou_epoch reached 0.90578 (best 0.90578), saving model to "/home/jovyan/ucb_mids_w207_final_project/notebooks/inception_v4/1r4fxc54/checkpoints/epoch=8-step=3689.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Metric iou_epoch improved by 0.001 >= min_delta = 0.0. New best score: 0.907
Epoch 9, global step 4099: iou_epoch reached 0.90687 (best 0.90687), saving model to "/home/jovyan/ucb_mids_w207_final_project/notebooks/inception_v4/1r4fxc54/checkpoints/epoch=9-step=4099.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 4509: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 4919: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 5329: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 13, global step 5739: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 14, global step 6149: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 15, global step 6559: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 16, global step 6969: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 17, global step 7379: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 18, global step 7789: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 19, global step 8199: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 20, global step 8609: iou_epoch was not in top 1


Validating: 0it [00:00, ?it/s]

Monitored metric iou_epoch did not improve in the last 12 records. Best score: 0.907. Signaling Trainer to stop.
Epoch 21, global step 9019: iou_epoch was not in top 1


# Best score: 0.907

In [39]:
# Save the model weights into the submission's assets directory.
#
# When `trainer.fit` returns, `cloud_model` still holds the weights of the
# LAST epoch that ran. With early stopping that is several epochs past the
# best one, so exporting it directly would not match the best monitored score.
# Restore the best checkpoint recorded by `checkpoint_callback` before saving.
submission_assets_dir = submission_dir / "assets"
submission_assets_dir.mkdir(parents=True, exist_ok=True)

best_checkpoint_path = checkpoint_callback.best_model_path
if best_checkpoint_path:
    # Lightning checkpoints keep the module weights under the "state_dict" key;
    # load onto CPU first so this works regardless of where training ran
    best_checkpoint = torch.load(best_checkpoint_path, map_location="cpu")
    cloud_model.load_state_dict(best_checkpoint["state_dict"])
# If no checkpoint was written (empty path), fall through and export the
# weights currently held by `cloud_model`, as before.

model_weight_path = submission_assets_dir / "cloud_model.pt"
torch.save(cloud_model.state_dict(), model_weight_path)