# End-to-end Solar Out-of-Distribution (SOoD) pipeline

This notebook presents an end-to-end training and prediction pipeline for anomaly detection on solar images using a context-encoding variational autoencoder (ceVAE).

The model is based on the following paper: Zimmerer, David, et al. "Context-encoding variational autoencoder for unsupervised anomaly detection." arXiv preprint arXiv:1812.05941 (2018).

## Download Code & Data


In [1]:
!pip install wandb -qqq
import wandb

In [2]:
# Log in to your Weights & Biases (W&B) account.
# In the recorded run the session was already authenticated and the call returned True.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmariusgiger[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Data Acquisition

Make sure to run the setup and install `sdo-cli` first (`make setup` and `make install`).

In [22]:
# The dataset is available on the NAS under /nas08-data02/astroml_data

In [3]:
# Print which `python` executable the shell resolves first on PATH.
# NOTE(review): this is the shell's interpreter; the training cell below calls
# ../.venv/bin/sdo-cli explicitly, so the two may differ - confirm this is intended.
!which python

/usr/bin/python


## Training


In [4]:
# training a ceVAE model for the downloaded images (note that a much larger dataset should be used in practice)
import os

data_dir = "/nas08-data02/astroml_data/sdomlv2_small/sdomlv2_small.zarr"
output_dir = "./output/train-sdo-ml"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

!../.venv/bin/sdo-cli sood ce_vae train \
    --target-size=256 \
    --data-dir=$data_dir \
    -o "./output/train-sdo-ml"\
    --ce-factor 0.5 \
    --print-every-iter 1 \
    --score-mode "combi" \
    --n-epochs 10 \
    --batch-size 32 \
    --num-data-loader-workers 8 \
    --dataset "SDOMLDatasetV2"

2022-05-27 12:32:50,190 torch.distributed.nn.jit.instantiator INFO Created a temporary directory at /tmp/tmp3m0_0ydv
2022-05-27 12:32:50,192 torch.distributed.nn.jit.instantiator INFO Writing /tmp/tmp3m0_0ydv/_remote_module_non_sriptable.py
discovered the following zarr directory structure
/
 └── 2010
     ├── 131A (6135, 512, 512) float32
     ├── 1600A (6136, 512, 512) float32
     ├── 1700A (6135, 512, 512) float32
     ├── 171A (6135, 512, 512) float32
     ├── 193A (6135, 512, 512) float32
     ├── 211A (6136, 512, 512) float32
     ├── 304A (6134, 512, 512) float32
     ├── 335A (6135, 512, 512) float32
     └── 94A (6136, 512, 512) float32
found 6135 images
discovered the following zarr directory structure
/
 └── 2010
     ├── 131A (6135, 512, 512) float32
     ├── 1600A (6136, 512, 512) float32
     ├── 1700A (6135, 512, 512) float32
     ├── 171A (6135, 512, 512) float32
     ├── 193A (6135, 512, 512) float32
     ├── 211A (6136, 512, 512) float32
     ├── 304A (6134, 512, 512

## Predict

In [None]:
import os, glob

def find_newest_dir(pattern):
    """Return the most recently modified path that matches a glob pattern.

    Args:
        pattern: Glob pattern to expand, e.g. ``'./output/pred/*/'``. A trailing
            path separator restricts the matches to directories.

    Returns:
        The matching path with the latest modification time, as the string
        produced by ``glob.glob``. The path is also printed.

    Raises:
        ValueError: If nothing matches ``pattern``.
    """
    candidates = glob.glob(pattern)
    if not candidates:
        # Without this guard max() fails with an opaque "empty sequence" error.
        raise ValueError(
            f"no paths match pattern {pattern!r}; "
            "run the step that creates them first"
        )
    newest_dir = max(candidates, key=os.path.getmtime)
    print(newest_dir)
    return newest_dir

In [None]:
# pixel-level predictions

import os 

train_path = find_newest_dir('./output/train/*/')
load_path = train_path / Path('checkpoint')

pred_output_dir = "./output/pred"
if not os.path.exists(pred_output_dir):
    os.makedirs(pred_output_dir)
    
!sdo-cli sood ce_vae predict \
    --target-size=256 \
    --data-dir='./data/aia_171_2012_256' \
    --test-dir='./data/aia_171_2012_full_disk_flare_256' \
    --load-path={load_path} \
    -o './output/pred' \
    --logger "file" \
    --ce-factor 0.5 \
    --score-mode combi \
    --mode="pixel"

In [None]:
import os

from sdo.sood.algorithms.ce_vae import ceVAE

# Checkpoint to load. The default is a user-specific absolute path, so it can be
# overridden with the SOOD_CHECKPOINT_PATH environment variable on other machines.
# NOTE: this rebinds `load_path`, which the later `predict`/`generate` cells
# interpolate into their command lines.
load_path = os.environ.get(
    "SOOD_CHECKPOINT_PATH",
    "/Users/mariusgiger/Downloads/model_cevae_256.ckpt",
)
mode = "sample"
cevae_algo = ceVAE.load_from_checkpoint(load_path, mode=mode)
# presumably the standard torch `Module.eval()` switch to inference behaviour - confirm
cevae_algo.eval()

In [None]:
import pathlib 
import numpy as np
import torch
from torchvision.transforms import Compose, Resize, Normalize, Lambda
import math
from torchvision.utils import save_image

# Per-channel preprocessing settings, keyed by channel name (the code below looks
# an entry up with `channel.lower()`).
#   "min" / "max": bounds used to clamp the raw pixel values
#   "scaling":     "log10" -> clamped values are log10-transformed;
#                  None    -> clamped values are used as-is
# NOTE(review): the numeric keys are presumably AIA wavelengths and the named keys
# HMI products - confirm against the dataset documentation.
CHANNEL_PREPROCESS = {
    "94": {"min": 0.1, "max": 800, "scaling": "log10"},
    "131": {"min": 0.7, "max": 1900, "scaling": "log10"},
    "171": {"min": 5, "max": 3500, "scaling": "log10"},
    "193": {"min": 20, "max": 5500, "scaling": "log10"},
    "211": {"min": 7, "max": 3500, "scaling": "log10"},
    "304": {"min": 0.1, "max": 3500, "scaling": "log10"},
    "335": {"min": 0.4, "max": 1000, "scaling": "log10"},
    "1600": {"min": 10, "max": 800, "scaling": "log10"},
    "1700": {"min": 220, "max": 5000, "scaling": "log10"},
    "4500": {"min": 4000, "max": 20000, "scaling": "log10"},
    "continuum": {"min": 0, "max": 65535, "scaling": None},
    "magnetogram": {"min": -250, "max": 250, "scaling": None},
    "bx": {"min": -250, "max": 250, "scaling": None},
    "by": {"min": -250, "max": 250, "scaling": None},
    "bz": {"min": -250, "max": 250, "scaling": None},
}

# NOTE(review): user-specific absolute path - point this at a local .npz sample.
predict_img_path = pathlib.Path("/Users/mariusgiger/repos/master/test-sdo-ml-dataset-ae/extract/train/2012/01/01/AIA20120101_0000_0171.npz")

# np.load on an .npz returns an archive object that keeps the file open; the
# context manager closes it as soon as the "x" array has been read.
with np.load(predict_img_path) as npz_file:
    np_arr = npz_file["x"]  # .astype(np.float64)
torch_arr = torch.from_numpy(np_arr)
# convert to 1 x H x W, to be in compatible torchvision format
torch_arr = torch_arr.unsqueeze(dim=0)

channel = "171"
preprocess_config = CHANNEL_PREPROCESS[channel.lower()]

target_size = 256


def lambda_transform(x):
    """Clamp `x` to the channel's [min, max] range and apply log10 if configured."""
    clamped = torch.clamp(
        x,
        min=preprocess_config["min"],
        max=preprocess_config["max"],
    )
    if preprocess_config["scaling"] == "log10":
        # TODO why was vflip(x) used here in SolarNet?
        return torch.log10(clamped)
    return clamped


# `mean` / `std` are the offset and range that map the clamped (and optionally
# log-scaled) values onto [0, 1] in the first Normalize step.
if preprocess_config["scaling"] == "log10":
    mean = math.log10(preprocess_config["min"])
    std = math.log10(preprocess_config["max"]) - \
        math.log10(preprocess_config["min"])
else:
    mean = preprocess_config["min"]
    std = preprocess_config["max"] - preprocess_config["min"]

transforms = Compose(
    [
        Resize((target_size, target_size)),
        Lambda(lambda_transform),
        Normalize(mean=[mean], std=[std]),
        # required to remove strange distribution of pixels (everything too bright)
        # (explicit one-element lists: `(0.5)` is a plain float, not a tuple)
        Normalize(mean=[0.5], std=[0.5]),
    ]
)
torch_arr = transforms(torch_arr)
# First pass in sample mode, run without gradient tracking; the result is printed.
# NOTE(review): `torch_arr` is 1 x H x W here (no explicit batch dimension) -
# confirm that `forward` accepts this shape.
with torch.no_grad():
    cevae_algo.sample_mode()
    pred = cevae_algo.forward(torch_arr)
    print(pred)
# Second pass in pixel mode; deliberately outside the no_grad block (see note below).
cevae_algo.pixel_mode()
#NOTE this mode requires grad to be enabled
pred_img = cevae_algo.forward(torch_arr)
# Write the pixel-mode output as an image; normalize=True rescales it to the image value range.
save_image(pred_img, "./output/test.png", normalize=True)    

In [None]:
# display inverted pixel-wise anomaly scores

from pathlib import Path
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina' 

aia_wave = 171
newest_dir = find_newest_dir('./output/pred/*/')
pixel_pred_path = Path(newest_dir) / "predictions"
# sorted() gives a reproducible panel order; rglob makes no ordering guarantee
images = sorted(Path(pixel_pred_path).rglob(f'*__{aia_wave}.jpeg'))
if not images:
    print(f"no prediction images found under {pixel_pred_path}")

# Lay the images out on a grid with 7 columns and as many rows as needed, so
# that more than 7 images no longer overwrite the panels of the first row.
n_cols = 7
n_rows = max(1, -(-len(images) // n_cols))  # ceiling division, at least one row
f, axarr = plt.subplots(
    n_rows, n_cols,
    figsize=(20, max(9, 3 * n_rows)),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
panels = axarr.ravel()

for ax, path in zip(panels, images):
    # close the file handle as soon as the pixel data has been copied
    with Image.open(path) as img:
        img_arr = np.invert(np.asarray(img))
    ax.set_title(path.stem)
    ax.imshow(img_arr, cmap='gray', vmin=0, vmax=255)
    # strip frame and ticks so only the image and its title remain
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.xaxis.set_ticks([])
    ax.yaxis.set_ticks([])

# blank out grid cells that have no image
for ax in panels[len(images):]:
    ax.set_visible(False)

In [None]:
# sample-level predictions
# Same `predict` invocation as the pixel-level cell, but with --mode="sample".
# {load_path} is interpolated from the notebook namespace; the ceVAE loading cell
# rebinds it to a local checkpoint file, so execution order decides which model is used.

!sdo-cli sood ce_vae predict \
    --target-size=256 \
    --data-dir='./data/aia_171_2012_256' \
    --test-dir='./data/aia_171_2012_full_disk_flare_256' \
    --load-path={load_path} \
    -o './output/pred' \
    --logger "file" \
    --ce-factor 0.5 \
    --score-mode combi \
    --mode="sample"

In [None]:
# investigate sample-wise scores

import pandas as pd

# Locate the most recent prediction run and read its sample-wise score file
# (no header row; the two columns are named explicitly).
newest_dir = find_newest_dir('./output/pred/*/')
sample_pred_path = Path(newest_dir).joinpath("predictions/predictions.txt")

df = pd.read_csv(
    sample_pred_path,
    names=["img", "score"],
    header=None,
)
df.head(7)

In [None]:
# Summary statistics for the numeric columns of the sample-wise predictions.
df.describe()

In [None]:
# Run the `generate` sub-command with the same data and checkpoint options as the
# predict cells. Output goes to ./output/, where the next cell searches for
# '*_generated.jpeg' files.
!sdo-cli sood ce_vae generate \
    --target-size=256 \
    --data-dir='./data/aia_171_2012_256' \
    --test-dir='./data/aia_171_2012_full_disk_flare_256' \
    --load-path={load_path} \
    -o './output/' \
    --logger "file" \
    --ce-factor 0.5 \
    --score-mode combi \
    --mode="sample"

In [None]:
gen_data_dir = "./output"
# sorted() makes the "first" image reproducible; rglob makes no ordering guarantee
images = sorted(Path(gen_data_dir).rglob('*_generated.jpeg'))
if not images:
    # fail with an actionable message instead of a bare IndexError on images[0]
    raise FileNotFoundError(
        f"no '*_generated.jpeg' files found under {gen_data_dir!r}; "
        "run the generate cell first"
    )
img_path = images[0]

# explicit figure/axes instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(15, 15))
ax.axis('off')

# close the file handle once the pixel data has been handed to matplotlib
with Image.open(img_path) as src_img:
    ax.imshow(np.asarray(src_img))