# Station Outlier filter

This notebook shows the "station outlier filter" to detect... 

The original R code stems from https://github.com/LottedeVos/PWSQC/. 

Publication:
de Vos, L. W., Leijnse, H., Overeem, A., & Uijlenhoet, R. (2019). Quality control for crowdsourced personal weather stations to enable operational rainfall monitoring. _Geophysical Research Letters_, 46(15), 8820-8829.

The idea of the filter is to... 

In [1]:
import numpy as np
import xarray as xr
import poligrain as plg
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
ds_pws = xr.open_dataset('OpenSense_PWS_example_format_data.nc')

# slice to one month
ds_pws = ds_pws.sel(time = slice('2017-07-01','2017-07-31'))

## Calculate distance matrix

In [3]:
ds_pws.coords["x"], ds_pws.coords["y"] = plg.spatial.project_point_coordinates(
    x=ds_pws.longitude, y=ds_pws.latitude, target_projection="EPSG:25832"
)

In [4]:
distance_matrix = plg.spatial.calc_point_to_point_distances(ds_pws, ds_pws)

## SO filter (fixed evaluation period only)

In [5]:
# Set parameters
mint = 4032
mrain = 100
mmatch = 200
gamma = 0.15 
beta = 0.2
n_stat = 5
max_distance = 10e3 
dbc = 1

In [6]:
# initalize data variables
ds_pws['so_flag'] = xr.DataArray(np.ones((len(ds_pws.id), len(ds_pws.time)))*-999, dims=("id", "time"))
ds_pws['median_corr_nbrs'] = xr.DataArray(np.ones((len(ds_pws.id), len(ds_pws.time)))*-999, dims=("id", "time"))

In [48]:
def rainy_timesteps_station(da_station, window_length):
    df= da_station.to_dataframe()
    drop_columns = ['elevation', 'latitude','longitude','x','y','id']
    df= df.drop(drop_columns, axis = 1)
    rainy_timesteps = (df > 0).rolling(mint, min_periods=1).sum()
    ds = xr.Dataset.from_dataframe(pd.DataFrame({'rainy_timesteps': rainy_timesteps.rainfall}))
    return ds

In [56]:
def rainy_timesteps_neighbors(da_neighbors, window_length):
    df = da_neighbors.to_dataframe()
    drop_columns = ['elevation', 'latitude','longitude','x','y']
    df = df.drop(drop_columns, axis = 1)
    df = df["rainfall"].unstack("id")
    rainy_timesteps = (df > 0).rolling(mint, min_periods=1).sum()
    # ds = xr.Dataset.from_dataframe(pd.DataFrame({'rainy_timesteps': rainy_timesteps}))
    return rainy_timesteps
    

In [57]:
test_nbrs = rainy_timesteps_neighbors(ds_neighbors, mint)
test_nbrs

id,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,ams11,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
2017-07-01 00:05:00,2.0,0.0,2.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,...,2.0,1.0,2.0,2.0,2.0,0.0,1.0,2.0,2.0,2.0
2017-07-01 00:10:00,3.0,0.0,3.0,0.0,3.0,3.0,3.0,2.0,0.0,0.0,...,2.0,2.0,3.0,3.0,3.0,0.0,2.0,3.0,3.0,3.0
2017-07-01 00:15:00,4.0,0.0,4.0,0.0,4.0,4.0,4.0,3.0,0.0,0.0,...,3.0,3.0,4.0,4.0,4.0,0.0,3.0,4.0,4.0,4.0
2017-07-01 00:20:00,5.0,0.0,5.0,0.0,5.0,5.0,5.0,4.0,0.0,0.0,...,4.0,4.0,5.0,5.0,5.0,0.0,4.0,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [59]:
test_nbrs.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8928 entries, 2017-07-01 00:00:00 to 2017-07-31 23:55:00
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ams2    8928 non-null   float64
 1   ams3    8928 non-null   float64
 2   ams4    8928 non-null   float64
 3   ams5    8928 non-null   float64
 4   ams6    8928 non-null   float64
 5   ams7    8928 non-null   float64
 6   ams8    8928 non-null   float64
 7   ams9    8928 non-null   float64
 8   ams10   8928 non-null   float64
 9   ams11   8928 non-null   float64
 10  ams12   8928 non-null   float64
 11  ams13   8928 non-null   float64
 12  ams14   8928 non-null   float64
 13  ams15   8928 non-null   float64
 14  ams16   8928 non-null   float64
 15  ams17   8928 non-null   float64
 16  ams19   8928 non-null   float64
 17  ams20   8928 non-null   float64
 18  ams21   8928 non-null   float64
 19  ams23   8928 non-null   float64
 20  ams24   8928 non-null   float64
 21  a

In [41]:
df = ds_neighbors.to_dataframe()
drop_columns = ['elevation', 'latitude','longitude','x','y']
df = df.drop(drop_columns, axis = 1)
df = df["rainfall"].unstack("id")
rainy_timesteps = (df > 0).rolling(mint, min_periods=1).sum()
ds = xr.Dataset.from_dataframe(pd.DataFrame({'rainy_timesteps': rainy_timesteps}))
ds

ValueError: If using all scalar values, you must pass an index

In [21]:


ds = xr.Dataset.from_dataframe(pd.DataFrame({'rainy_timesteps': rainy_timesteps.rainfall}))

In [None]:
    rainy_timesteps = (df > 0).rolling(mint, min_periods=1).sum()
    # rainy_timesteps = roll_neighbors.where(df_count > 5).notnull().astype(int).sum(axis=1)

    # df_all = pd.concat([df_station, df_neighbors], axis=1)
    
    # rainy_timesteps = (df_all > 0).rolling(window_length, min_periods=1).sum()
    ds = xr.Dataset.from_dataframe(pd.DataFrame({'rainy_timesteps': rainy_timesteps}))

    # compare if rainy timesteps is above threshold mrain
    # if true --> apply SO-filter as it is now (fixed window)
    # if false ---> go back in time?
    return ds

In [106]:
yup = rainy_timesteps_neighbors(ds_neighbors.rainfall, mint)
yup

id,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,ams11,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
2017-07-01 00:05:00,2.0,0.0,2.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,...,2.0,1.0,2.0,2.0,2.0,0.0,1.0,2.0,2.0,2.0
2017-07-01 00:10:00,3.0,0.0,3.0,0.0,3.0,3.0,3.0,2.0,0.0,0.0,...,2.0,2.0,3.0,3.0,3.0,0.0,2.0,3.0,3.0,3.0
2017-07-01 00:15:00,4.0,0.0,4.0,0.0,4.0,4.0,4.0,3.0,0.0,0.0,...,3.0,3.0,4.0,4.0,4.0,0.0,3.0,4.0,4.0,4.0
2017-07-01 00:20:00,5.0,0.0,5.0,0.0,5.0,5.0,5.0,4.0,0.0,0.0,...,4.0,4.0,5.0,5.0,5.0,0.0,4.0,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [9]:
# ds = xr.Dataset.from_dataframe(pd.DataFrame({'rainy_timesteps': yup}))

In [87]:
roll_neighbors = (df_neighbors > 0).rolling(mint, min_periods=1).sum()
roll_neighbors =

id,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,ams11,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
2017-07-01 00:05:00,2.0,0.0,2.0,0.0,2.0,2.0,2.0,1.0,0.0,0.0,...,2.0,1.0,2.0,2.0,2.0,0.0,1.0,2.0,2.0,2.0
2017-07-01 00:10:00,3.0,0.0,3.0,0.0,3.0,3.0,3.0,2.0,0.0,0.0,...,2.0,2.0,3.0,3.0,3.0,0.0,2.0,3.0,3.0,3.0
2017-07-01 00:15:00,4.0,0.0,4.0,0.0,4.0,4.0,4.0,3.0,0.0,0.0,...,3.0,3.0,4.0,4.0,4.0,0.0,3.0,4.0,4.0,4.0
2017-07-01 00:20:00,5.0,0.0,5.0,0.0,5.0,5.0,5.0,4.0,0.0,0.0,...,4.0,4.0,5.0,5.0,5.0,0.0,4.0,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,0.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [94]:
roll_neighbors.where(df_count > 5).notnull().astype(int).sum(axis=1)

time
2017-07-01 00:00:00     0
2017-07-01 00:05:00     0
2017-07-01 00:10:00     0
2017-07-01 00:15:00     0
2017-07-01 00:20:00     0
                       ..
2017-07-31 23:35:00    19
2017-07-31 23:40:00    19
2017-07-31 23:45:00    19
2017-07-31 23:50:00    19
2017-07-31 23:55:00    19
Length: 8928, dtype: int64

In [53]:
# dfsum(axis=1)

df_count[ df_count > 5].sum(axis = 1)

time
2017-07-01 00:00:00       0.0
2017-07-01 00:05:00       0.0
2017-07-01 00:10:00       0.0
2017-07-01 00:15:00       0.0
2017-07-01 00:20:00       0.0
                        ...  
2017-07-31 23:35:00    4121.0
2017-07-31 23:40:00    4121.0
2017-07-31 23:45:00    4121.0
2017-07-31 23:50:00    4122.0
2017-07-31 23:55:00    4122.0
Length: 8928, dtype: float64

In [54]:
df_count[ df_count > 5]

Unnamed: 0_level_0,rainfall,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:05:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:10:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:15:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:20:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [77]:
hej = df_count.where(df_count > 5)
hej

Unnamed: 0_level_0,rainfall,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:05:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:10:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:15:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:20:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [88]:
roll_neighbors.where(df_count > 5).notnull().astype(int).sum(axis=1)

time
2017-07-01 00:00:00     0
2017-07-01 00:05:00     0
2017-07-01 00:10:00     0
2017-07-01 00:15:00     0
2017-07-01 00:20:00     0
                       ..
2017-07-31 23:35:00    19
2017-07-31 23:40:00    19
2017-07-31 23:45:00    19
2017-07-31 23:50:00    19
2017-07-31 23:55:00    19
Length: 8928, dtype: int64

In [81]:
hej.notnull().astype(int).sum(axis=1)

time
2017-07-01 00:00:00     0
2017-07-01 00:05:00     0
2017-07-01 00:10:00     0
2017-07-01 00:15:00     0
2017-07-01 00:20:00     0
                       ..
2017-07-31 23:35:00    20
2017-07-31 23:40:00    20
2017-07-31 23:45:00    20
2017-07-31 23:50:00    20
2017-07-31 23:55:00    20
Length: 8928, dtype: int64

In [73]:
test1 = df_count.where(df_count > 5, other=pd.NA)
test1

Unnamed: 0_level_0,rainfall,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:05:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:10:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:15:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:20:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [74]:
# test1.where(test1 is not np.nan, other= 1 )
test1.where(test1 == pd.NA, other= 1)

Unnamed: 0_level_0,rainfall,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2017-07-01 00:05:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2017-07-01 00:10:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2017-07-01 00:15:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2017-07-01 00:20:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2017-07-31 23:40:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2017-07-31 23:45:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2017-07-31 23:50:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [67]:
test1

Unnamed: 0_level_0,rainfall,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:05:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:10:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:15:00,,,,,,,,,,,...,,,,,,,,,,
2017-07-01 00:20:00,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,190.0,216.0,187.0,191.0,,206.0,169.0,296.0,142.0,,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [49]:
df_count = (df_all > 0).rolling(mint, min_periods=1).sum()
df_count

Unnamed: 0_level_0,rainfall,ams2,ams3,ams4,ams5,ams6,ams7,ams8,ams9,ams10,...,ams14,ams15,ams16,ams17,ams19,ams20,ams21,ams23,ams24,ams26
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-07-01 00:00:00,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
2017-07-01 00:05:00,2.0,2.0,0.0,2.0,0.0,2.0,2.0,2.0,1.0,0.0,...,2.0,1.0,2.0,2.0,2.0,0.0,1.0,2.0,2.0,2.0
2017-07-01 00:10:00,3.0,3.0,0.0,3.0,0.0,3.0,3.0,3.0,2.0,0.0,...,2.0,2.0,3.0,3.0,3.0,0.0,2.0,3.0,3.0,3.0
2017-07-01 00:15:00,4.0,4.0,0.0,4.0,0.0,4.0,4.0,4.0,3.0,0.0,...,3.0,3.0,4.0,4.0,4.0,0.0,3.0,4.0,4.0,4.0
2017-07-01 00:20:00,5.0,5.0,0.0,5.0,0.0,5.0,5.0,5.0,4.0,0.0,...,4.0,4.0,5.0,5.0,5.0,0.0,4.0,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31 23:35:00,190.0,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:40:00,190.0,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:45:00,190.0,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,278.0
2017-07-31 23:50:00,190.0,216.0,187.0,191.0,0.0,206.0,169.0,296.0,142.0,2.0,...,196.0,209.0,166.0,174.0,206.0,199.0,194.0,218.0,256.0,279.0


In [None]:
# This is the count of wet neighbors for each time stamp
neighbors_wet_count = (da_station_and_neighbors.rainfall_amount.sel(id=neighbor_ids) > 0).sum(dim='id')
# This is a boolean array with True for wet station to be flagged
station_wet = da_station_and_neighbors.rainfall_amount.sel(id=current_station_id) > 0

neighbors_wet_count_and_station_wet = neighbors_wet_count.where(station_wet)
matches = neighbors_wet_count_and_station_wet.rolling().sum()

In [None]:
# if rainfall < mmatch at station AND at least nstat nbrs with rainfall above mmatch

## SO-filter - remember to discard the first mint timesteps!

In [7]:
def so_filter(da_station, da_neighbors, window_length):
    
    s_station = da_station.to_series()
    s_neighbors = da_neighbors.to_series()

    #create dataframe of neighboring stations
    df = da_neighbors.to_dataframe()
    drop_columns = ['elevation', 'latitude','longitude','x','y']
    df = df.drop(drop_columns, axis = 1)
    df = df["rainfall"].unstack("id")

    corr = s_station.rolling(window_length, min_periods=1).corr(s_neighbors)

    # number of matching rainy time steps in evaluation period
    number_of_rainy_neighbors  = (df > 0).sum(axis=1)
    rainy_timestep_at_station = s_station > 0
    matches = number_of_rainy_neighbors.where(rainy_timestep_at_station).rolling(mint, min_periods=1).sum()
    ds = xr.Dataset.from_dataframe(pd.DataFrame({'corr': corr}))
    ds['matches'] = xr.DataArray.from_series(matches)
    
    return ds

In [8]:
%%time

for i in [0]: # range(len(ds_pws.id)):
    
    # create data set for station i
    ds_station = ds_pws.isel(id=i) 
    pws_id = ds_station.id.values

    # picking stations within max_distnance, excluding itself, for the whole duration of the time series
    neighbor_ids = distance_matrix.id.data[(distance_matrix.sel(id=pws_id) < max_distance) & (distance_matrix.sel(id=pws_id) > 0)]

    #create data set for neighbors
    ds_neighbors = ds_pws.sel(id=neighbor_ids)

    # if there are no observations in the time series, filter cannot be applied to the whole time series
    if ds_pws.rainfall.sel(id=pws_id).isnull().all():
        print(pws_id, " has no data")
        ds_pws.so_flag[i, :] = -1
        ds_pws.median_corr_nbrs[i,:] = -1
        continue

    # if there are not enough stations nearby, filter cannot be applied to the whole time series
    elif (len(neighbor_ids) < n_stat):
        print(pws_id, "has less than", n_stat, "neighbors")
        ds_pws.so_flag[i, :] = -1
        ds_pws.median_corr_nbrs[i,:] = -1
        continue 
        
    else: 

    # run so-filter
        ds_so_filter = so_filter(ds_station.rainfall, ds_neighbors.rainfall, window_length=mint)

        median_correlation = ds_so_filter.corr.median(dim='id')
        so_array = (median_correlation < gamma).astype(int)
        
    # check if enough matching rainy intervals in the rolling (fixed) evaluation period
        ds_pws.so_flag[i] = xr.where(ds_so_filter.matches < mmatch, -1, so_array)
        ds_pws.median_corr_nbrs[i] = median_correlation


CPU times: total: 688 ms
Wall time: 669 ms


## Bias (next step to fix)
#### rolling bias, MEDIAN not mean!

In [None]:
# one bias correction factor per station, iteratively updated PER TIME STEP ??
BCF_prev = dbc

In [6]:
ds_pws

In [7]:
# initialize 
ds_pws["bias"] = xr.DataArray(np.ones((len(ds_pws.id), len(ds_pws.time)))*-999, dims=("id", "time"))

In [8]:
#bias for the whole time series with nbrs, one station
i = 0

ds_station = ds_pws.isel(id=i) 
pws_id = ds_station.id.values

neighbor_ids = distance_matrix.id.data[(distance_matrix.sel(id=pws_id) < max_distance) & (distance_matrix.sel(id=pws_id) > 0)]
ds_neighbors = ds_pws.sel(id=neighbor_ids)

s_station = ds_station.rainfall.to_series()
s_neighbors = ds_neighbors.rainfall.to_series()

roll_mean = s_neighbors.rolling(mint, min_periods=1).mean()
# rolling mean of neighbors, ie reference rainfall

# delta_r = s_station - s_neighbors
# delta_r
# delta_r = ds_station.rainfall - ds_neighbors.rainfall
# bias = np.nanmean(delta_r) / np.nanmean(ds_neighbors.rainfall)

# how to make rolling bias? 

In [52]:
reference_rainfall = ds_neighbors.rainfall.mean(dim = "id") # can be radar or other source
delta_r = ds_station.rainfall - reference_rainfall

In [59]:
bias = delta_r / reference_rainfall
bias

In [16]:
def bias(station_rainfall, reference_rainfall):
    delta_r = station_rainfall - reference_rainfall
    return np.nanmean(delta_r) / np.nanmean(reference_rainfall)

In [None]:
def bias_test(da_station, da_neighbors, window_length):
    s_station = da_station.to_series()
    s_neighbors = da_neighbors.to_series()
    bias = np.nanmean(delta_r) / np.nanmean(reference_rainfall)
    # roll_bias = s_station.rolling(window_length, min_periods=1).corr(s_neighbors)

In [17]:
# add flags to dataset
# ds_pws["bias"] = (("id", "time"), biascorrectiontable)

In [18]:
# ds_pws.to_netcdf('C:/Users/a002461/OPENSENSE/data/OpenSense_PWS_example_format_data_SO_flags.nc')

In [19]:
#so_flag = pypwsqc.flagging.so_filter(
#    ds_pws.rainfall,
#    ds_pws.nbrs_not_nan,
#    ds_pws.reference,
#    distance_matrix,
#    mint = 4032,
#    mrain = 100,
#    mmatch = 200,
#    gamma = 0.15,
#   beta = 0.2,
#    n_stat = 5,
#    max_distance = 10e3,
#)