# Station Outlier filter

This notebook shows the "station outlier filter" to detect... 

The original R code stems from https://github.com/LottedeVos/PWSQC/. 

Publication:
de Vos, L. W., Leijnse, H., Overeem, A., & Uijlenhoet, R. (2019). Quality control for crowdsourced personal weather stations to enable operational rainfall monitoring. _Geophysical Research Letters_, 46(15), 8820-8829.

The idea of the filter is to... 

In [1]:
# Import packages

import warnings

import numpy as np
import xarray as xr
import poligrain as plg
import matplotlib.pyplot as plt
import pandas as pd

import time

In [2]:
#ds_pws = xr.open_dataset('C:/Users/a002461/OPENSENSE/data/OpenSense_PWS_data_FZ_filtered_rainfall.nc')
ds_pws = xr.open_dataset('OpenSense_PWS_example_format_data.nc')

In [3]:
#slice to one month
ds_pws = ds_pws.sel(time = slice('2017-07-01','2017-07-31'))

## Calculate distance matrix

In [4]:
ds_pws.coords["x"], ds_pws.coords["y"] = plg.spatial.project_point_coordinates(
    x=ds_pws.longitude, y=ds_pws.latitude, target_projection="EPSG:25832"
)

In [5]:
distance_matrix = plg.spatial.calc_point_to_point_distances(ds_pws, ds_pws)

## SO filter (fixed evaluation period only)

In [6]:
# Set parameters
mint = 4032
mrain = 100
mmatch = 200
gamma = 0.35 # 0.15 original
beta = 0.2
n_stat = 5
max_distance = 10e3 # Boolean 2D data array, defining neihbours within max_distance for all stations
dbc = 1
# time_len = len(ds_pws.time)

In [8]:
# prepare list pf NaNs to append when SO-filter cannot be applied ?
# corr_nan = np.empty((0, time_len,))

In [9]:
# initalize SO-flag with empty numpy array
# CAN THIS BE DONE DIFFERENTLY?

so_flag = np.ones_like(ds_pws.rainfall)
biascorrectiontable = np.ones_like(ds_pws.rainfall)

In [85]:
def SO_filter(da_station, da_neighbors, window_length, ds_pws):
    s_station = da_station.to_series()
    s_neighbors = da_neighbors.to_series()
    df_pws = s_station.to_frame()

    # NOTE! I think the calculation of rolling mean corr below is not ignoring NaNs.
    # But if I use apply(lambda x : np.mean(x)) it gets super slow again - how to avoid numpy?
    # https://stackoverflow.com/questions/71788354/pandas-rolling-mean-and-ignore-nan
    rolling_corr = s_station.rolling(window_length, min_periods=1).corr(s_neighbors).to_frame().unstack('id')
    
    n_rainy_timesteps = (s_neighbors > 0).rolling(window_length, min_periods=1).sum()
    mean_corr = rolling_corr.mean(axis = 1)

    # mean_corr = corr.rolling(window_length, min_periods=1).mean()
    # df_station["normal_mean"] = corr3.mean(axis=1)
    
    ds = xr.Dataset.from_dataframe(pd.DataFrame({'rolling_corr': rolling_corr}, index = time))
    # ds['n_rainy_timesteps'] = xr.DataArray.from_series(n_rainy_timesteps)
    # ds_pws['mean_corr'] = xr.DataArray.from_series(rolling_corr, dim="id")
    return ds # df_station # ds

In [7]:
def SO_test(da_station, da_neighbors, window_length):
    s_station = da_station.to_series()
    s_neighbors = da_neighbors.to_series()
    
    corr = s_station.rolling(window_length, min_periods=1).corr(s_neighbors)
    ds = xr.Dataset.from_dataframe(pd.DataFrame({'corr': corr}))
    mean_corr = ds.mean(dim = "id")
    return xr.where(mean_corr.corr < gamma, 1, 0) 


In [10]:
# add id as dimension as well to so_flag
ds_pws["so_flag"] = SO_test(ds_station.rainfall, ds_neighbors.rainfall, mint)
ds_pws

In [8]:
i = 0
ds_station = ds_pws.isel(id=i)
pws_id = ds_station.id.values

# one bias correction factor per station, iteratively updated PER TIME STEP ??
BCF_prev = dbc

# picking stations within max_distnance, excluding itself, for the whole duration of the time series
neighbor_ids = distance_matrix.id.data[(distance_matrix.sel(id=pws_id) < max_distance) & (distance_matrix.sel(id=pws_id) > 0)]

#create data set for neighbors
ds_neighbors = ds_pws.sel(id=neighbor_ids)


In [58]:
ds_pws["so_flag"] = xr.where(hej2.corr < gamma, 1, 0)
ds_pws

In [12]:
# s_station = ds_station.rainfall.to_series()
# s_neighbors = ds_neighbors.rainfall.to_series()
# corr5 = s_station.rolling(mint, min_periods=1).corr(s_neighbors).to_frame().unstack('id')
# corr5

In [15]:
i = 0
ds_station = ds_pws.isel(id=i)
pws_id = ds_station.id.values
neighbor_ids = distance_matrix.id.data[(distance_matrix.sel(id=pws_id) < max_distance) & (distance_matrix.sel(id=pws_id) > 0)]
ds_neighbors = ds_pws.sel(id=neighbor_ids)
ds_corr = SO_filter(ds_station.rainfall, ds_neighbors.rainfall, window_length=mint, ds_pws.isel(id = 0))

In [108]:
%%time

# corr_list = []
# bias_list = []

for i in [0]: # range(len(ds_pws.id)):

    # create data set for station i
    ds_station = ds_pws.isel(id=i)
    pws_id = ds_station.id.values

    # one bias correction factor per station, iteratively updated PER TIME STEP ??
    BCF_prev = dbc

    # picking stations within max_distnance, excluding itself, for the whole duration of the time series
    neighbor_ids = distance_matrix.id.data[(distance_matrix.sel(id=pws_id) < max_distance) & (distance_matrix.sel(id=pws_id) > 0)]

    #create data set for neighbors
    ds_neighbors = ds_pws.sel(id=neighbor_ids)

    # if there are less than mmatch overlapping rainy timesteps in the last mint period, filter cannot be applied
    # number of rainy time steps in rolling window of lenght mint
    # matches = rainy_timesteps(ds_station.rainfall, ds_neighbors.rainfall, window_length=mint)
    # xr.where(matches < mmatch, 1, 0)
    
    # if there are no observations in the time series, filter cannot be applied
    if ds_pws.rainfall.sel(id=pws_id).isnull().all():
        # print(pws_id, " has no data")
        
        so_flag[i, :] = -1
        # corr_list.append(corr_nan)
        
    # if there are not enough stations nearby, filter cannot be applied
    elif (len(neighbor_ids) < n_stat):
        # print(pws_id, "has less than", n_stat, "neighbors")
        
        so_flag[i, :] = -1
        # corr_list.append(corr_nan)

    else: 
        # print("calculating corr for", pws_id, "with neighbors")

        # create dataset with variable "corr" (correlation with each neighboring station)
        ds_corr = SO_filter(ds_station.rainfall, ds_neighbors.rainfall, window_length=mint)
        print(ds_corr)
        # print(pws_id, np.shape(ds_corr.corr.data))
        
        # corr['id_neighbor'] = corr.id_neighbor.astype(str)
        # corr_list.append(ds_corr.corr.median(dim='id'))


        median_corr = np.nanmean(ds_corr.corr)
        print(median_corr)
        # if median_corr < gamma:
            # print(pws_id, 'is a station outlier!')
            # so_flag[i, :] = -1
        #print(median_corr)
# MAKE THIS LINE WORK:       
# ds_pws['median_corr'] = (('id', 'time'), corr_list)

<xarray.Dataset>
Dimensions:            (id: 22, time: 8928)
Coordinates:
  * id                 (id) object 'ams2' 'ams3' 'ams4' ... 'ams24' 'ams26'
  * time               (time) datetime64[ns] 2017-07-01 ... 2017-07-31T23:55:00
Data variables:
    corr               (id, time) float64 nan 1.0 0.5 ... 0.08503 0.0849 0.0849
    n_rainy_timesteps  (id, time) float64 1.0 2.0 3.0 4.0 ... 278.0 279.0 279.0
0.32768810483069294
CPU times: total: 93.8 ms
Wall time: 93.9 ms


## Bias (next step to fix) 

In [59]:
def bias(station_rainfall, reference_rainfall):
    delta_r = station_rainfall - reference_rainfall
    return np.nanmean(delta_r) / np.nanmean(reference_rainfall)

In [None]:
# ds_bias = bias(ds_station.rainfall, ds_neighbors.rainfall, window_length=mint)
# bias_list.append(ds_corr.corr.median(dim='id'))

# Update bias or keep bias 

In [113]:
def SO_filter(da_station, da_neighbors, window_length):
    s_station = da_station.to_series()
    s_neighbors = da_neighbors.to_series()
    
    corr = s_station.rolling(window_length, min_periods=1).corr(s_neighbors)
    n_rainy_timesteps = (s_neighbors > 0).rolling(window_length, min_periods=1).sum()

    # NOTE! I think this is not ignoring NaNs. But if I use apply(lambda x : np.mean(x)) it gets super slow again - how to avoid numpy?
    # https://stackoverflow.com/questions/71788354/pandas-rolling-mean-and-ignore-nan
    mean_corr = corr.rolling(window_length, min_periods=1).mean()
    
    ds = xr.Dataset.from_dataframe(pd.DataFrame({'corr': corr}))
    ds['n_rainy_timesteps'] = xr.DataArray.from_series(n_rainy_timesteps)
    ds['mean_corr'] = xr.DataArray.from_series(mean_corr)

    return ds


In [115]:
# ds_pws['median_corr'] = (('id', 'time'), corr_list)

In [None]:
# add flags to dataset
# ds_pws["so_flag"] = (("id", "time"), so_flag)
# ds_pws["bias"] = (("id", "time"), biascorrectiontable)

In [None]:
# ds_pws.to_netcdf('C:/Users/a002461/OPENSENSE/data/OpenSense_PWS_example_format_data_SO_flags.nc')

In [5]:
#so_flag = pypwsqc.flagging.so_filter(
#    ds_pws.rainfall,
#    ds_pws.nbrs_not_nan,
#    ds_pws.reference,
#    distance_matrix,
#    mint = 4032,
#    mrain = 100,
#    mmatch = 200,
#    gamma = 0.15,
#   beta = 0.2,
#    n_stat = 5,
#    max_distance = 10e3,
#)