# d7046e ANN course project, Group 6

The goal of the project is to implement a classifier that labels the sky as clear or cloudy based on satellite image data.

**Download and some checks**

In [None]:
# Downloads and extracts the data from https://drive.google.com/drive/folders/1lRCIcQo9CqFRDhUd3aZRAA46k8nLL49J

import gdown
import zipfile
import os

# Google Drive file id of the zipped dataset and the endpoint gdown downloads it from.
CLOUD_FILE_ID = "19MBh9JIJTxYIPAeO7G5RML5_ddjJ1Cpa"
CLOUD_DOWN_ENDPOINT = "https://drive.google.com/uc?id="

base_dir = os.path.abspath("../data/")
zip_path = os.path.join(base_dir, "data.zip")
extract_path = base_dir  # extract next to the zip so that no extra sub folders are created

os.makedirs(extract_path, exist_ok=True)

print(f"downloading dataset to: {zip_path}")
downloaded = gdown.download(f"{CLOUD_DOWN_ENDPOINT}{CLOUD_FILE_ID}", zip_path, quiet=False)

# Some gdown versions signal a failed download by returning None instead of raising,
# so stop here with a clear message rather than failing later inside zipfile.
if downloaded is None or not os.path.isfile(zip_path):
    raise RuntimeError(f"download failed, no file was written to: {zip_path}")

print(f"extracting to: {extract_path}")
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
finally:
    # Remove the archive even when extraction fails so that a broken zip is not left behind.
    os.remove(zip_path)
    print(f"deleted zip file: {zip_path}")

print("download and extraction completed.")
print(f"extracted files in: {extract_path}")
print(f"extracted content: {os.listdir(extract_path)}")


Trying to figure out how the data is structured.

In [None]:
import numpy as np

# Print the number of labels (gts) and file names per split so that a mismatch
# between the two arrays is visible before the data is used.
train_gts = np.load("../data/skogsstyrelsen-data/skogs_gts_train.npy")
train_names = np.load("../data/skogsstyrelsen-data/skogs_names_train.npy")
print(f"length of train_gts:{len(train_gts)} , length of train_names:{len(train_names)}")

val_gts = np.load("../data/skogsstyrelsen-data/skogs_gts_val.npy")
val_names = np.load("../data/skogsstyrelsen-data/skogs_names_val.npy")
# The validation counts were previously printed with the "train" labels.
print(f"length of val_gts:{len(val_gts)} , length of val_names:{len(val_names)}")


Sample one of the objects in the .nc files

In [None]:
# print a .nc array 
import xarray as xar
# Open the file in a context manager so the dataset is closed once the inspection is done.
with xar.open_dataset("../data/skogsstyrelsen-data/2A-netcdfs-cropped-from-nuria/skgs_0b5101fb-44c7-ed11-9174-005056a6f472.nc") as ds: # first file from the download folder
    print(ds)

    scl_value = ds['scl'].values
    print(f"\nscl_value:\n{scl_value}")

    bands = []

    # print some of the object attributes, familiarize with the structure of the data
    for band in ds.data_vars:
        data_array = ds[band]
        values = data_array.values  # materialised as a numpy array while the file is still open
        bands.append(values)

        print(f"Variable name: {band}")
        print(f"Shape: {values.shape}")
        print(f"Dtype: {values.dtype}\n")

# `bands` holds plain numpy arrays, so it stays usable after the dataset is closed.
print(f"bands array:\n{bands}")


**Setup Cell**

In [None]:
import torch
import xarray as xar
import numpy as np
from torch.utils.data import Dataset, DataLoader

# Folder that holds the .npy name/label files and the netCDF sub folder.
DATA_PATH = "../data/skogsstyrelsen-data/"
# Names of the data variables that are read from every .nc file, in stacking order.
BANDS = ['b01', 'b02', 'b03', 'b04', 'b05', 'b06', 'b07', 'b08', 'b8a', 'b09', 'b11', 'b12']
IMAGE_SIZE = 20 # side length every band is cropped to; the bands printed above seem to have shape 20X20, 2D
EPOCHS = 50  # presumably the number of training epochs; not used in the cells shown here
BATCH_SIZE = 32  # batch size passed to the DataLoaders


# Prefer the GPU when torch can see one, otherwise fall back to the CPU and explain how to fix it.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE == "cpu":
    print(
        "Cuda does not seem to be available in your environment, try installing a torch package compiled with cuda enabled if u have cuda installed in ur os.\n"
        "Uninstall torch, torchaudio, torchvision, and then run: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121\n"
    )

print(f"Running with: DEVICE = {DEVICE}")

**Utils functions to load the data from skogs**

In [None]:

def get_skogs_paths(split_suffix: str):
    """Build the paths of the names file and the labels (gts) file for one split.

    Args:
        split_suffix: split name inserted into the file names, e.g. "train".

    Returns:
        Tuple ``(names_path, labels_path)``, both located directly under DATA_PATH.
    """
    names_file, labels_file = (
        os.path.join(DATA_PATH, f"skogs_{kind}_{split_suffix}.npy")
        for kind in ("names", "gts")
    )
    return names_file, labels_file

def _rescale_band(raw):
    """Squeeze one band array and apply the (value - 1000) / 10000 rescaling."""
    band = raw.squeeze()
    if np.issubdtype(band.dtype, np.integer):
        # Integer arrays would do the subtraction in their own dtype, which can wrap
        # around (e.g. unsigned values below 1000). Going through float64 gives the
        # same result as integer true division wherever no wrap-around occurred.
        band = band.astype(np.float64)
    return (band - 1000) / 10000


def _fit_band(band, size):
    """Return the top-left size x size crop of a band as float32, or zeros if the band is smaller."""
    if band.shape[0] >= size and band.shape[1] >= size:
        # slice the top left part of the image consistently,
        # mby need to change the slice if important features are lost
        return np.asarray(band[:size, :size], dtype=np.float32)
    return np.zeros((size, size), dtype=np.float32)


def load_skogs_data(split_suffix: str):
    """Loads a dataset split into numpy arrays for images and binary labels.

    Args:
        split_suffix: split name used in the ``skogs_names_<split>.npy`` and
            ``skogs_gts_<split>.npy`` file names, e.g. "train".

    Returns:
        Tuple ``(x, y)``. ``x`` holds one float32 array of shape
        (len(BANDS), IMAGE_SIZE, IMAGE_SIZE) per entry of the names file and ``y``
        holds the labels from the gts file as float32, in the same order.

    A sample whose .nc file is missing is reported with a printed message and
    represented by an all-zero image; its label is still kept.
    """
    names_path, labels_path = get_skogs_paths(split_suffix)
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
    # these files must come from a trusted source.
    names = np.load(names_path, allow_pickle=True)  # list of paths to nc files
    labels = np.load(labels_path, allow_pickle=True).astype(np.float32)  # binary values from the gts files

    nc_dir = os.path.join(DATA_PATH, "2A-netcdfs-cropped-from-nuria")

    x_list = []
    y_list = []

    for name, label in zip(names, labels):
        # Only the file name of the stored path is used; the folder comes from DATA_PATH
        # (could not get the path to work with just the path from the names file).
        nc_file = os.path.normpath(os.path.join(nc_dir, os.path.basename(name)))  # normalize path for different OS

        if os.path.exists(nc_file):
            with xar.open_dataset(nc_file, engine="netcdf4") as ds:
                band_arrays = [_rescale_band(ds[band].values) for band in BANDS]

            # Stack once per file, after every band has been cropped to a common shape:
            # the result has shape (len(BANDS), IMAGE_SIZE, IMAGE_SIZE).
            x = np.stack([_fit_band(band, IMAGE_SIZE) for band in band_arrays])
        else:
            print(f"File not found: {nc_file}")
            x = np.zeros((len(BANDS), IMAGE_SIZE, IMAGE_SIZE), dtype=np.float32)

        x_list.append(x)
        y_list.append(label)

    return np.array(x_list), np.array(y_list)

class SkogsBinaryDataset(Dataset):
    """Map-style PyTorch dataset pairing each stacked band array (X) with its label (y)."""

    def __init__(self, x_data, y_data):
        # Indexable containers: x_data[i] is the image of sample i, y_data[i] is its label.
        self.x_data, self.y_data = x_data, y_data

    def __len__(self):
        # The number of samples is taken from the image container.
        return len(self.x_data)

    def __getitem__(self, idx):
        # Convert on access: the image first, then the label, both as float32 tensors.
        image, label = (
            torch.tensor(source[idx], dtype=torch.float32)
            for source in (self.x_data, self.y_data)
        )
        return image, label


**Set up data loaders**

In [None]:
def check_batch_shapes(ds: DataLoader):
    """Print the shapes of the x and y tensors of every batch yielded by ``ds``."""
    for batch in ds:
        features, targets = batch
        # x is expected as (batch_size, channels, height, width) and y as (batch_size),
        # i.e. one label per input image.
        print(
            f"x_batch shape: {features.shape}",
            f"y_batch shape: {targets.shape}",
            sep="\n",
        )

# Load the three splits, in the order train -> val -> test, as (images, labels) numpy arrays.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    load_skogs_data(split) for split in ("train", "val", "test")
)

# Wrap every split in a dataset that hands out float32 tensors.
train_ds, val_ds, test_ds = (
    SkogsBinaryDataset(images, labels)
    for images, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

# Only the training loader shuffles; validation and test keep the file order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(split_ds, shuffle=False, batch_size=BATCH_SIZE)
    for split_ds in (val_ds, test_ds)
)

# Shape check on the validation loader; pass train_loader or test_loader
# instead to inspect the other splits.
check_batch_shapes(val_loader)


**Model architecture 1**