In [1]:
import numpy as np
import xarray as xr

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

from source.anomalyDetector import AnomalyDetector

2025-12-05 11:38:29.362430: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Autoencoder

#### Reading the data

In [6]:
ds_2025 = xr.open_dataset("files/2025_KVS_deployment_flagged.nc")
ds_2024 = xr.open_dataset("files/2024_KVS_deployment_flagged.nc")

In [7]:
ds_2025_1m_temp = ds_2025["temp_1m_calibrated"]
ds_2025_1m_labels = ds_2025["temp_1m_quality_flag"]
ds_2025_ir_temp = ds_2025["temp_snowsurface"]
ds_2025_temp_diff = ds_2025_1m_temp - ds_2025_ir_temp

ds_2024_1m_temp = ds_2024["temp_1m_calibrated"]
ds_2024_1m_lab = ds_2024["temp_1m_quality_flag"]
ds_2024_ir_temp = ds_2024["temp_snowsurface_calibrated"]
ds_2024_temp_diff = ds_2024_1m_temp - ds_2024_ir_temp

In [8]:
train_data, test_data, train_labels, test_labels = train_test_split(ds_2025_temp_diff,
                                                                    ds_2025_1m_labels,
                                                                    test_size=0.2,
                                                                    )

#### Scaling the data

In [9]:
min_val = tf.reduce_min(train_data)
max_val = tf.reduce_max(train_data)

train_data = (train_data - min_val) / (max_val - min_val)
test_data = (test_data - min_val) / (max_val - min_val)

train_data = tf.cast(train_data, tf.float32)
test_data = tf.cast(test_data, tf.float32)


2025-12-01 15:32:59.123084: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [11]:
train_labels = ds_2025_1m_labels.astype(bool)
test_labels = test_labels.astype(bool)

normal_train_data = train_data[train_labels]
normal_test_data = test_data[test_labels]

anomalous_train_data = train_data[~train_labels]
anomalous_test_data = test_data[~test_labels]

TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got <xarray.DataArray 'temp_1m_quality_flag' (trajectory: 20, obs_temp: 3145)>
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])
Coordinates:
  * trajectory  (trajectory) object '2025_04_KVS_SvalMIZ_01' ... '2025_04_KVS...
Dimensions without coordinates: obs_temp
Attributes:
    long_name:   Quality flag for temperature observations 1 m above the surface
    definition:  0 - flagged data

In [2]:
autoencoder = AnomalyDetector()

2025-12-05 11:38:35.623165: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [None]:
history = autoencoder.fit(normal_train_data, normal_train_data,
          epochs=20,
          batch_size=512,
          validation_data=(test_data, test_data),
          shuffle=True)

In [None]:
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.legend()


In [None]:
encoded_data = autoencoder.encoder(normal_test_data).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

plt.plot(normal_test_data[0], 'b')
plt.plot(decoded_data[0], 'r')
plt.fill_between(np.arange(140), decoded_data[0], normal_test_data[0], color='lightcoral')
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()


In [None]:
encoded_data = autoencoder.encoder(anomalous_test_data).numpy()
decoded_data = autoencoder.decoder(encoded_data).numpy()

plt.plot(anomalous_test_data[0], 'b')
plt.plot(decoded_data[0], 'r')
plt.fill_between(np.arange(140), decoded_data[0], anomalous_test_data[0], color='lightcoral')
plt.legend(labels=["Input", "Reconstruction", "Error"])
plt.show()


Detect anomalies by calculating whether the reconstruction loss is greater than a fixed threshold. 

reconstructions = autoencoder.predict(normal_train_data)
train_loss = tf.keras.losses.mae(reconstructions, normal_train_data)

plt.hist(train_loss[None,:], bins=50)
plt.xlabel("Train loss")
plt.ylabel("No of examples")
plt.show()

threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)



