In [2]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

In [3]:
# Data location

# Root folder of the project data; the Excel files read and written by the
# cells below live under its `Custom` subfolder.
DATA_PATH = 'Reservoir_Project/Data'

In [4]:
# Load the basin inflow table; index_col=0 turns the first spreadsheet column
# into the row index instead of a data column.
# NOTE(review): the file name suggests the date column was stripped beforehand,
# so rows are presumably consecutive days in chronological order — confirm.
basin_inflow = pd.read_excel(f'{DATA_PATH}/Custom/basin_inflow_no_dates.xlsx', index_col=0)

In [5]:
# Preview the first rows of the loaded frame.
basin_inflow.head()

Unnamed: 0,INFLOW,NFD_MEAN_FLOW,OXB_RIVER_STAGE,OXB_RIVER_DISCHARGE,CBR_RIVER_STAGE,CBR_RIVER_DISCHARGE,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,977,152,8.585417,169.666667,4.445833,281.045455,9.52,0.0,44,58,...,30,7.9,44,55,37,7.72,0.04,40,61,30
1,1061,134,8.487917,153.958333,2.29875,722.583333,9.52,0.0,47,57,...,30,7.88,38,46,32,7.76,0.04,39,63,29
2,1712,126,8.343333,132.75,2.993333,1115.916667,9.76,0.24,45,50,...,30,7.86,30,35,25,7.76,0.0,40,53,29
3,1712,126,9.907083,752.833333,2.649583,866.958333,11.28,1.52,46,50,...,33,8.27,31,33,28,11.0,3.24,39,47,32
4,7072,126,10.8825,927.25,3.432083,1273.791667,11.96,0.68,41,46,...,33,8.27,31,33,28,12.04,1.04,33,38,32


In [6]:
# List every column name (the preview above elides the middle columns).
basin_inflow.columns

Index(['INFLOW', 'NFD_MEAN_FLOW', 'OXB_RIVER_STAGE', 'OXB_RIVER_DISCHARGE',
       'CBR_RIVER_STAGE', 'CBR_RIVER_DISCHARGE', 'ADR_PRECIP_ACC',
       'ADR_PRECIP_INCR', 'ADR_TEMP_AVG', 'ADR_TEMP_MAX', 'ADR_TEMP_MIN',
       'HYS_PRECIP_ACC', 'HYS_PRECIP_INCR', 'HYS_SNOW_DEPTH',
       'HYS_SNOW_WATER_CONTENT', 'HYS_TEMP_AVG', 'HYS_TEMP_MAX',
       'HYS_TEMP_MIN', 'DUN_PRECIP_ACC', 'DUN_TEMP_AVG', 'DUN_TEMP_MAX',
       'DUN_TEMP_MIN', 'SGP_PRECIP_ACC', 'SGP_PRECIP_INCR', 'SGP_TEMP_AVG',
       'SGP_TEMP_MAX', 'SGP_TEMP_MIN', 'FRN_PRECIP_ACC', 'FRN_PRECIP_INCR',
       'FRN_SNOW_DEPTH', 'FRN_SNOW_WATER_CONTENT', 'FRN_TEMP_AVG',
       'FRN_TEMP_MAX', 'FRN_TEMP_MIN', 'PFH_PRECIP_ACC', 'PFH_PRECIP_INCR',
       'PFH_TEMP_AVG', 'PFH_TEMP_MAX', 'PFH_TEMP_MIN'],
      dtype='object')

#### Exponential moving average smoothing

In [7]:
"""
Exponential moving average smoothing

- Weights every past observation with exponentially decaying weights (span=30, i.e. alpha = 2 / (span + 1)), so recent days count the most and trends are captured while day-to-day noise is damped
- Span corresponds to what is commonly called an “N-day EW moving average”
"""

def smooth(df, span=30.0, engine='numba'):
    """Return an exponentially weighted moving average of every column of ``df``.

    Each column is replaced by ``Series.ewm(span=span, ignore_na=True).mean()``.
    Every past observation contributes, with exponentially decaying weight
    (``alpha = 2 / (span + 1)``); ``span`` is a decay rate, not a hard window
    cut-off.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are smoothed. It is NOT modified; a smoothed copy
        is returned instead, so callers no longer need to pass ``df.copy()``.
    span : float, default 30.0
        Decay expressed as a span (the "N-day EW moving average" convention).
    engine : {'numba', 'cython', None}, default 'numba'
        Execution engine forwarded to ``ExponentialMovingWindow.mean``. The
        default keeps the previous behavior and requires the optional
        ``numba`` package; pass ``None`` to use the pandas default engine.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller's frame is never overwritten as a side effect.
    smoothed = df.copy()
    for col in smoothed.columns:
        smoothed[col] = df[col].ewm(span=span, ignore_na=True).mean(engine=engine)

    return smoothed

In [8]:
# Smooth every column. A copy is passed so the raw `basin_inflow` frame stays
# untouched no matter how smooth() treats its argument.
smoothed_basin_inflow = smooth(basin_inflow.copy())
# Preview the smoothed values.
smoothed_basin_inflow.head()

Unnamed: 0,INFLOW,NFD_MEAN_FLOW,OXB_RIVER_STAGE,OXB_RIVER_DISCHARGE,CBR_RIVER_STAGE,CBR_RIVER_DISCHARGE,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,977.0,152.0,8.585417,169.666667,4.445833,281.045455,9.52,0.0,44.0,58.0,...,30.0,7.9,44.0,55.0,37.0,7.72,0.04,40.0,61.0,30.0
1,1020.4,142.7,8.535042,161.550694,3.336507,509.173359,9.52,0.0,45.55,57.483333,...,30.0,7.889667,40.9,50.35,34.416667,7.740667,0.04,39.483333,62.033333,29.483333
2,1266.467234,136.758238,8.466833,151.303576,3.214408,725.049078,9.605391,0.085391,45.354313,54.820807,...,30.0,7.879111,37.021844,44.88856,31.066272,7.747545,0.025768,39.66716,58.819326,29.311366
3,1389.227747,133.793951,8.863674,317.046944,3.058778,764.150249,10.066807,0.480678,45.532223,53.492499,...,30.826609,7.986815,35.362606,41.612828,30.221402,8.643715,0.911406,39.483333,55.562671,30.052183
4,2682.19602,132.020639,9.323006,455.882913,3.143714,880.106011,10.497554,0.526028,44.501033,51.787774,...,31.321108,8.051247,34.370008,39.653201,29.715979,9.416452,0.940664,38.008218,51.566738,30.495358


#### Remove seasonality

In [9]:
"""
Make data stationary

Seasonal data is differenced by subtracting from each observation the value observed at the same time in the previous cycle.
"""

def difference(dataset, interval=1):
    """Lagged difference of every column: ``x[i] - x[i - interval]``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are differenced independently.
    interval : int, default 1
        Lag, in rows, between the two observations being subtracted.

    Returns
    -------
    pandas.DataFrame
        Frame with ``len(dataset) - interval`` rows (zero rows when
        ``interval >= len(dataset)``). Columns are labelled positionally
        (0, 1, ...) and rows get a fresh 0-based index; the original labels
        are not carried over, so callers restore them if needed.

    Raises
    ------
    ValueError
        If ``interval`` is negative (it would otherwise wrap around through
        negative indexing and silently return meaningless values), or, from
        ``pd.concat``, if ``dataset`` has no columns.
    """
    if interval < 0:
        raise ValueError(f"interval must be non-negative, got {interval}")

    cols = []
    for col in dataset.columns:
        values = dataset[col].values
        # Clamp to zero so an interval longer than the series yields an empty
        # result instead of a negative (wrap-around) slice bound.
        overlap = max(len(values) - interval, 0)
        # Vectorized form of values[i] - values[i - interval] for i >= interval.
        cols.append(pd.Series(values[interval:] - values[:overlap]))

    return pd.concat(cols, axis=1)

In [10]:
# Seasonal lag in rows: each value is compared with the one 365 rows earlier.
# NOTE(review): presumably one row per day, i.e. a one-year lag — confirm.
days = 365

# difference() hands back positional column labels, so the original column
# names are re-attached; the rows end up on a fresh 0-based index.
stationary_basin_inflow = (
    difference(smoothed_basin_inflow, days)
    .set_axis(smoothed_basin_inflow.columns, axis=1)
    .reset_index(drop=True)
)
# Sanity check: the differenced frame should contain no missing values.
print("Null/NaN count: ", stationary_basin_inflow.isnull().sum().sum())

Null/NaN count:  0


In [11]:
stationary_basin_inflow.head()

Unnamed: 0,INFLOW,NFD_MEAN_FLOW,OXB_RIVER_STAGE,OXB_RIVER_DISCHARGE,CBR_RIVER_STAGE,CBR_RIVER_DISCHARGE,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,313.381625,-29.948597,1.267748,328.122057,-2.698053,161.1058,-4.372491,0.151906,0.121939,-6.413691,...,-17.288084,-1.700984,-11.105608,-14.746997,-9.483165,-0.778717,0.202783,-1.782846,-9.608046,0.235409
1,327.569908,11.928732,1.335177,339.472091,-1.59025,-68.803367,-4.198782,0.178235,-1.693993,-6.386464,...,-16.559821,-1.572523,-8.127827,-10.629448,-6.804144,-0.470434,0.295506,-1.667286,-11.569893,0.672372
2,125.633647,35.636382,1.375764,338.836019,-1.489119,-294.267474,-4.12167,0.081345,-1.940629,-4.052768,...,-15.878542,-1.451461,-4.364004,-5.666108,-3.364234,-0.164425,0.293254,-2.484406,-9.030946,0.189132
3,2.479529,43.220371,0.957115,164.341709,-1.319207,-326.421759,-4.431068,-0.324699,-2.338777,-2.903043,...,-16.067826,-1.455787,-2.811724,-2.856341,-2.435625,-0.765312,-0.610386,-2.63495,-6.018703,-0.842039
4,-1295.11502,45.831469,0.489749,22.281741,-1.392986,-436.991724,-4.719605,-0.380113,-1.320067,-1.107315,...,-14.41773,-1.423511,-1.661118,-0.622939,-1.787349,-1.259236,-0.656484,-1.150053,-1.864316,-1.16974


In [12]:
# Persist the differenced frame next to the input file for later steps.
# No `index` argument is passed, so the pandas default applies and the row
# index is written as the first spreadsheet column.
stationary_basin_inflow.to_excel(f'{DATA_PATH}/Custom/stationary_basin_inflow.xlsx')