In [1]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

In [30]:
# Root folder for the project data. The cells below read from and write to
# f'{DATA_PATH}/Custom/...'. The path is relative, so it is resolved against
# the kernel's current working directory.

DATA_PATH = 'Reservoir_Project/Data'

In [31]:
# Load the basin inflow table; the first spreadsheet column becomes the index.
basin_inflow = pd.read_excel(f'{DATA_PATH}/Custom/basin_inflow_no_dates.xlsx', index_col=0)

In [32]:
basin_inflow.head()

Unnamed: 0,INFLOW,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,ADR_TEMP_MIN,HYS_PRECIP_ACC,HYS_PRECIP_INCR,HYS_SNOW_DEPTH,HYS_SNOW_WATER_CONTENT,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,27.666,9.52,0.0,44,58,34,20.24,0.0,38,7.67,...,30,7.9,44,55,37,7.72,0.04,40,61,30
1,30.044,9.52,0.0,47,57,39,20.36,0.12,38,7.79,...,30,7.88,38,46,32,7.76,0.04,39,63,29
2,48.478,9.76,0.24,45,50,38,20.36,0.0,39,7.79,...,30,7.86,30,35,25,7.76,0.0,40,53,29
3,90.246,11.28,1.52,46,50,42,25.76,5.4,45,8.15,...,33,8.27,31,33,28,11.0,3.24,39,47,32
4,200.2,11.96,0.68,41,46,37,27.44,1.68,58,11.75,...,33,8.27,31,33,28,12.04,1.04,33,38,32


In [33]:
basin_inflow.columns

Index(['INFLOW', 'ADR_PRECIP_ACC', 'ADR_PRECIP_INCR', 'ADR_TEMP_AVG',
       'ADR_TEMP_MAX', 'ADR_TEMP_MIN', 'HYS_PRECIP_ACC', 'HYS_PRECIP_INCR',
       'HYS_SNOW_DEPTH', 'HYS_SNOW_WATER_CONTENT', 'HYS_TEMP_AVG',
       'HYS_TEMP_MAX', 'HYS_TEMP_MIN', 'DUN_PRECIP_ACC', 'DUN_WIND_GUST',
       'DUN_WIND_SPEED', 'DUN_TEMP_AVG', 'DUN_TEMP_MAX', 'DUN_TEMP_MIN',
       'SGP_PRECIP_ACC', 'SGP_PRECIP_INCR', 'SGP_TEMP_AVG', 'SGP_TEMP_MAX',
       'SGP_TEMP_MIN', 'FRN_PRECIP_ACC', 'FRN_PRECIP_INCR', 'FRN_SNOW_DEPTH',
       'FRN_SNOW_WATER_CONTENT', 'FRN_TEMP_AVG', 'FRN_TEMP_MAX',
       'FRN_TEMP_MIN', 'PFH_PRECIP_ACC', 'PFH_PRECIP_INCR', 'PFH_TEMP_AVG',
       'PFH_TEMP_MAX', 'PFH_TEMP_MIN'],
      dtype='object')

#### Exponential moving average smoothing

In [34]:
"""
Exponential moving average smoothing

- Uses past 30 periods or days to inform prediction (give them more weight) in order to capture trends
- Span corresponds to what is commonly called an “N-day EW moving average”
"""

def smooth(df):
  for col in df.columns:
    df[col] = df[col].ewm(span=30.0, ignore_na=True).mean(engine='numba')
  
  return df 

In [35]:
# smooth() overwrites the columns of the frame it receives, so it is given a
# copy; basin_inflow itself keeps the values that were loaded.
smoothed_basin_inflow = smooth(basin_inflow.copy())
smoothed_basin_inflow.head()

Unnamed: 0,INFLOW,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,ADR_TEMP_MIN,HYS_PRECIP_ACC,HYS_PRECIP_INCR,HYS_SNOW_DEPTH,HYS_SNOW_WATER_CONTENT,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,27.666,9.52,0.0,44.0,58.0,34.0,20.24,0.0,38.0,7.67,...,30.0,7.9,44.0,55.0,37.0,7.72,0.04,40.0,61.0,30.0
1,28.894633,9.52,0.0,45.55,57.483333,36.583333,20.302,0.062,38.0,7.732,...,30.0,7.889667,40.9,50.35,34.416667,7.740667,0.04,39.483333,62.033333,29.483333
2,35.862281,9.605391,0.085391,45.354313,54.820807,37.087375,20.322636,0.039941,38.355794,7.752636,...,30.0,7.879111,37.021844,44.88856,31.066272,7.747545,0.025768,39.66716,58.819326,29.311366
3,50.846977,10.066807,0.480678,45.532223,53.492499,38.440982,21.820828,1.516832,40.186515,7.862124,...,30.826609,7.986815,35.362606,41.612828,30.221402,8.643715,0.911406,39.483333,55.562671,30.052183
4,84.828406,10.497554,0.526028,44.501033,51.787774,38.113124,23.099326,1.553957,44.239514,8.74671,...,31.321108,8.051247,34.370008,39.653201,29.715979,9.416452,0.940664,38.008218,51.566738,30.495358


#### Remove seasonality

In [36]:
"""
Make data stationary

Seasonal data is differenced by substracting an observation from the same time in the previous cycle.
"""

def difference(dataset, interval=1):
    cols = []
    for col in dataset.columns: 
        col_data = dataset[col]
        series = col_data.values
        diff = list()
        for i in range(interval, len(series)):
            value = series[i] - series[i - interval]
            diff.append(value)
        cols.append(pd.Series(diff))
    
    return pd.concat(cols, axis=1)

In [37]:
# Lag, in rows, between an observation and its counterpart one cycle earlier.
# NOTE(review): assumes one row per day and ignores leap days - confirm.
days = 365

# difference() drops the first `days` rows and returns a frame numbered from 0
# with positional column labels, so only the column names need restoring.
stationary_basin_inflow = difference(smoothed_basin_inflow, days)
stationary_basin_inflow.columns = smoothed_basin_inflow.columns
print("Null/NaN count: ", stationary_basin_inflow.isnull().sum().sum())

Null/NaN count:  0


In [38]:
stationary_basin_inflow.head()

Unnamed: 0,INFLOW,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,ADR_TEMP_MIN,HYS_PRECIP_ACC,HYS_PRECIP_INCR,HYS_SNOW_DEPTH,HYS_SNOW_WATER_CONTENT,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,8.944327,-4.372491,0.151906,0.121939,-6.413691,4.031741,-7.699793,0.499151,-24.334203,-5.572962,...,-17.288084,-1.700984,-11.105608,-14.746997,-9.483165,-0.778717,0.202783,-1.782846,-9.608046,0.235409
1,9.341866,-4.198782,0.178235,-1.693993,-6.386464,1.12378,-7.166968,0.546883,-21.861028,-5.259932,...,-16.559821,-1.572523,-8.127827,-10.629448,-6.804144,-0.470434,0.295506,-1.667286,-11.569893,0.672372
2,3.619606,-4.12167,0.081345,-1.940629,-4.052768,0.122505,-6.579541,0.581273,-19.903208,-4.836831,...,-15.878542,-1.451461,-4.364004,-5.666108,-3.364234,-0.164425,0.293254,-2.484406,-9.030946,0.189132
3,-11.380245,-4.431068,-0.324699,-2.338777,-2.903043,-1.373675,-7.501159,-0.927955,-19.827644,-4.53121,...,-16.067826,-1.455787,-2.811724,-2.856341,-2.435625,-0.765312,-0.610386,-2.63495,-6.018703,-0.842039
4,-45.496431,-4.719605,-0.380113,-1.320067,-1.107315,-1.114675,-8.229958,-0.992749,-22.226377,-5.027467,...,-14.41773,-1.423511,-1.661118,-0.622939,-1.787349,-1.259236,-0.656484,-1.150053,-1.864316,-1.16974


#### Normalization

In [39]:
"""
Normalizing data

Maximum absolute scaling rescales each feature between -1 and 1 by dividing every observation 
by its maximum absolute value.
"""
  
def normalize(df):
    for col in df.columns:
        df[col] = df[col] / df[col].abs().max()

    return df

In [40]:
# normalize() rescales the frame it is given in place, so hand it a copy;
# otherwise stationary_basin_inflow and normalized_basin_inflow would be the
# same, already normalized, object.
# NOTE(review): the scale is taken over the whole series, before the
# train/validation/test split below, so held-out rows influence how the
# training rows are scaled - confirm this is acceptable.
normalized_basin_inflow = normalize(stationary_basin_inflow.copy())

In [41]:
normalized_basin_inflow.head()

Unnamed: 0,INFLOW,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,ADR_TEMP_MIN,HYS_PRECIP_ACC,HYS_PRECIP_INCR,HYS_SNOW_DEPTH,HYS_SNOW_WATER_CONTENT,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,0.009407,-0.120687,0.184068,0.009505,-0.391558,0.331857,-0.064454,0.309236,-0.160654,-0.115369,...,-0.094061,-0.026854,-0.467328,-0.565277,-0.419978,-0.011883,0.153432,-0.125129,-0.568938,0.021703
1,0.009825,-0.115892,0.215971,-0.132046,-0.389896,0.0925,-0.059993,0.338807,-0.144326,-0.108889,...,-0.090099,-0.024826,-0.342022,-0.407445,-0.301333,-0.007179,0.22359,-0.117018,-0.685109,0.061987
2,0.003807,-0.113764,0.098568,-0.151271,-0.247423,0.010084,-0.055076,0.360112,-0.1314,-0.10013,...,-0.086392,-0.022915,-0.183639,-0.217192,-0.148991,-0.002509,0.221885,-0.174368,-0.534766,0.017436
3,-0.011969,-0.122304,-0.393446,-0.182306,-0.177232,-0.113069,-0.062791,-0.574891,-0.130901,-0.093803,...,-0.087422,-0.022983,-0.118318,-0.109488,-0.107866,-0.011679,-0.461838,-0.184934,-0.356396,-0.077629
4,-0.047851,-0.130268,-0.460592,-0.102898,-0.067602,-0.09175,-0.068891,-0.615032,-0.146738,-0.104076,...,-0.078444,-0.022474,-0.0699,-0.023878,-0.079156,-0.019216,-0.496717,-0.080716,-0.110395,-0.10784


#### Split data

In [42]:
# 70/20/10 split for training, validation, and test sets.
# The slices are contiguous and taken in row order: the first 70% of the rows
# form the training set, the next 20% the validation set, the rest the test set.

df_size = len(normalized_basin_inflow)
train_end = int(df_size * 0.7)  # first row of the validation set
val_end = int(df_size * 0.9)    # first row of the test set

train_df = normalized_basin_inflow[:train_end]

# The validation and test frames are renumbered from 0.
val_df = normalized_basin_inflow[train_end:val_end].reset_index(drop=True)
test_df = normalized_basin_inflow[val_end:].reset_index(drop=True)

train_df.to_excel(f'{DATA_PATH}/Custom/basin_inflow_train.xlsx')
val_df.to_excel(f'{DATA_PATH}/Custom/basin_inflow_validation.xlsx')
test_df.to_excel(f'{DATA_PATH}/Custom/basin_inflow_test.xlsx')