In [1]:
import pandas as pd, os
import bayes_net_utils as bn
# Set pandas' display width option to None (no fixed character width for printed DataFrames)
pd.options.display.width=None

# Bayesian network predictions

The R code required to run the Bayesian network and generate predictions has been refactored into the R function `bayes_net_predict` in `bayes_net_utils.R`. There is also a Python function of the same name in `bayes_net_utils.py`, which provides a simple "wrapper" around the R function and performs some minor additional calculations. This should make it easy to make predictions from the Bayesian network via Python.

**Note:** There is some computational overhead involved in interfacing between Python and R, but this isn't a major problem.

# User options

In [2]:
# User options

# Run mode: 'Historic' for period 1981-2018/19, or 'NextSeason' for future (operational, or one historic test season)
run_mode = 'Historic'
if run_mode == 'NextSeason': # If making predictions for the next season, for which year? For file reading
    target_yr = 2020

met_evidence = 'era5'  # Source of met data used to create data for driving predictions? 'metno', 'era5' or 's5'

# Fail fast on a mistyped run mode: later cells only test for 'NextSeason' and would
# otherwise silently fall through to the 'Historic' filename branches
valid_run_modes = ('Historic', 'NextSeason')
if run_mode not in valid_run_modes:
    raise ValueError("run_mode must be one of %s, got %r" % (valid_run_modes, run_mode))

# Use dictionary to automatically set the met data used in network training based on the source of data used to drive predictions.
# If met data for predictions is not s5, should be the same as met_evidence. If 's5', should be 'era5' as that was used in bias correcting s5
met_training_dict = {'metno':'metno',
                    'era5':'era5',
                    's5':'era5'}
if met_evidence not in met_training_dict:
    raise ValueError("met_evidence must be one of %s, got %r" % (sorted(met_training_dict), met_evidence))
met_training = met_training_dict[met_evidence]

# Start and end years of data used to fit network (used in the .rds filepath) and, for Historic run_mode, in generating the data for prediction
# (and in the filepaths to these csvs)
st_end_yr_dict = {'metno': [1981,2018],
               'era5': [1981,2019],
               's5': [1993,2019]}

# Years spanned by the training data; both filepaths below are keyed on the training source, not the evidence source
_train_st_yr, _train_end_yr = st_end_yr_dict[met_training]

# Fitted bnlearn object
rfile_fpath = "../Data/RData/Vansjo_fitted_GaussianBN_%s_%s-%s.rds" %(met_training, _train_st_yr, _train_end_yr)

# Pre-calculated standard deviations
sd_fpath = "../Data/FittedNetworkDiagnostics/GBN_%s_%s-%s_stdevs.csv" %(met_training, _train_st_yr, _train_end_yr)

# The 'evidence' (data that will be used to drive the predictions) folder
ev_folder = r'../Data/DataForPrediction/%s/%s' %(run_mode, met_evidence)

# Outfolder to save predictions in
out_folder = r'../Data/Predictions/%s' %run_mode

# Function to predict multiple years at once

If you are just predicting for one season, you can use `bn.bayes_net_predict` by itself. The function below works for a single season too, but is particularly useful for producing predictions for all years in a historic test period.

In [3]:
def bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df):
    """
    Loop over rows in evidence dataframe and make predictions for each row (year), and concatenate results into a
    single df

    Args:
        rfile_fpath: Path to the fitted network file, passed straight through to bn.bayes_net_predict
        sd_fpath:    Path to the pre-calculated standard deviations csv, passed straight through
        ev_df:       Evidence dataframe, one row per year to predict. Must contain the columns
                     'year', 'chla_prevSummer', 'colour_prevSummer', 'TP_prevSummer', 'wind_speed' and 'rain'

    Returns:
        Dataframe of the per-row results stacked together with a fresh 0..n-1 index and columns ordered as
        'year', 'node', 'threshold', 'prob_below_threshold', 'prob_above_threshold', 'expected_value', 'sd',
        'WFD_class'. If ev_df has no rows, an empty dataframe with these columns is returned.

    Raises:
        KeyError: if ev_df has rows but is missing any of the required evidence columns
    """
    # Evidence columns, in the positional order expected by bn.bayes_net_predict
    ev_cols = ['year', 'chla_prevSummer', 'colour_prevSummer',
               'TP_prevSummer', 'wind_speed', 'rain']

    # Column order of the returned dataframe
    out_cols = ['year', 'node', 'threshold', 'prob_below_threshold',
                'prob_above_threshold', 'expected_value', 'sd', 'WFD_class']

    # Nothing to predict: pd.concat raises on an empty list, so return an empty result explicitly
    if len(ev_df) == 0:
        return pd.DataFrame(columns=out_cols)

    # Check the evidence up front so a bad file fails before any (slow) call to R is made
    missing_cols = [col for col in ev_cols if col not in ev_df.columns]
    if missing_cols:
        raise KeyError("Evidence dataframe is missing required column(s): %s" % missing_cols)

    df_list = []
    for _, row in ev_df.iterrows():
        # Run Bayesian network in R. Every evidence value is passed as a plain Python float
        evidence = [float(row[col]) for col in ev_cols]
        df_list.append(bn.bayes_net_predict(rfile_fpath, sd_fpath, *evidence))

    # Merge results from all years, renumbering the rows from 0
    df = pd.concat(df_list, sort=True, ignore_index=True)

    # Re-order cols
    return df[out_cols]

# Predictions for 'deterministic' met data (e.g. met.no or ERA5)

Use this section when there is just a single 'evidence' data file.

In [4]:
if met_evidence !='s5':

    # Period label shared by the evidence and output filenames: the target year when
    # predicting the next season, otherwise the start-end years for this met source
    if run_mode == 'NextSeason':
        period = str(target_yr)
    else:
        period = '%s-%s' %(st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])

    # Sort out filepaths for the evidence data to read in and the output file
    ev_fname = 'DataForPrediction_GBN_%s_%s.csv' %(met_evidence, period)
    out_fname = 'GBN_prediction_%s_%s.csv' %(met_evidence, period)

    ev_path = os.path.join(ev_folder, ev_fname)
    out_path = os.path.join(out_folder, out_fname)

    # Read in evidence
    ev_df = pd.read_csv(ev_path)

    # Predict for every row (year) of evidence
    df = bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df)

    # Save to csv. Create the output folder first: to_csv does not create missing directories
    os.makedirs(out_folder, exist_ok=True)
    df.to_csv(out_path, index=False)

    display(df)

Unnamed: 0,year,node,threshold,prob_below_threshold,prob_above_threshold,expected_value,sd,WFD_class
0,1981,chla,20.0,0.57,0.43,19.300,3.760,0
1,1981,colour,48.0,0.98,0.02,28.800,9.040,0
2,1981,cyano,1.0,0.23,0.77,2.090,0.719,1
3,1981,TP,29.5,0.02,0.98,37.000,3.790,1
4,1982,chla,20.0,0.98,0.02,11.100,3.760,0
...,...,...,...,...,...,...,...,...
151,2018,TP,29.5,0.95,0.05,23.200,3.790,0
152,2019,chla,20.0,0.98,0.02,11.100,3.760,0
153,2019,colour,48.0,0.75,0.25,42.000,9.040,0
154,2019,cyano,1.0,0.83,0.17,0.369,0.719,0


# Predictions using evidence derived from seasonal forecast data

Use this section when there may be multiple seasons and members, each with its own evidence file. It is currently set up for System5 (S5).

In [5]:
if met_evidence == 's5':

    member_li = ["%.2d" % i for i in range(1,26)] # List of S5 member numbers in format '01','02'... Should be present in s5 met data folder
    season_li = ['summer','late_summer'] # Seasons of interest (must match filenames in s5 met data folder)

    # Predictions for S5 go in their own sub-folder. Create it up front: to_csv does not
    # create missing directories
    s5_out_folder = os.path.join(out_folder, 's5')
    os.makedirs(s5_out_folder, exist_ok=True)

    # Period label shared by all filenames (does not change inside the loops): the target year
    # when predicting the next season, otherwise the start-end years for this met source
    if run_mode == 'NextSeason':
        period = str(target_yr)
    else:
        period = '%s-%s' %(st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])

    for season in season_li:
        for member in member_li:

            # Sort out filepaths for the evidence data to read in and the output file
            ev_fname = 'DataForPrediction_GBN_%s_%s_%s_%s.csv' %(met_evidence, period, season, member)
            out_fname = 'GBN_prediction_%s_%s_%s_%s.csv' %(met_evidence, period, season, member)

            ev_path = os.path.join(ev_folder, ev_fname)
            out_path = os.path.join(s5_out_folder, out_fname)

            # Read in evidence
            ev_df = pd.read_csv(ev_path)

            # Predict and save to csv
            df = bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df)
            df.to_csv(out_path, index=False)

    # Display output for the last season and member for checking
    display(df)

# Simplest possible model: target season = previous season

In [11]:
obs_fpath = '../Data/DataMatrices/Seasonal_BN_obs/seasonal_obs_GBN_1980-2019.csv'

# Read in the seasonal observations, using the first csv column (year) as the index
obs_df = pd.read_csv(obs_fpath, index_col=0)

# NaNs in the observations are left as they are (no interpolation or filling is applied)

# Naive prediction: the value for each year is the observation from the row before it.
# The years kept are the ones named in the output filename, so file name and contents agree.
if run_mode == 'NextSeason':
    # Extend/trim the yearly index so it ends at target_yr, then carry the previous year forward.
    # Years without observations (including a missing previous year) give NaN predictions.
    obs_ext = obs_df.reindex(range(int(obs_df.index.min()), target_yr + 1))
    obs_ext.index.name = obs_df.index.name
    sim_df = obs_ext.shift(+1).loc[[target_yr]]
    out_fname = 'Prediction_naive_%s.csv' %(target_yr)
else:
    st_yr, end_yr = st_end_yr_dict[met_evidence]
    sim_df = obs_df.shift(+1).loc[st_yr:end_yr]
    out_fname = 'Prediction_naive_%s-%s.csv' %(st_yr, end_yr)

# Save to csv. Create the output folder first: to_csv does not create missing directories
os.makedirs(out_folder, exist_ok=True)
out_path = os.path.join(out_folder, out_fname)

sim_df.to_csv(out_path)

display(sim_df)

Unnamed: 0_level_0,TP,chla,colour,cyano
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1981,43.573016,16.08013,,
1982,28.833333,8.33125,,
1983,26.988095,5.975,26.666667,
1984,29.78125,6.05,17.625,
1985,26.5275,11.09,,
1986,30.957143,11.895238,34.404762,
1987,33.788889,12.677778,27.472222,
1988,23.433333,8.511111,29.027778,
1989,29.533333,14.055556,32.277778,
1990,27.057143,14.271429,23.857143,
