# Data Exploration

In [None]:
import sys
sys.path.append('src')
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import inspect
import matplotlib.pyplot as plt
import os.path as osp
import pickle
from sklearn.model_selection import train_test_split
# Local Modules
from utils import make_st_map_interactive
from data_funcs import train_test_split_spacetime
from metrics import ros
import reproducibility

## Cross-Validation Setup 

To get reliable estimates of forecast error of a spatiotemporal model, care must be taken to avoid data leakage. See: https://github.com/jh-206/FRAMSC-2024---FMDA-Data-and-CV-Methods/blob/main/Spatiotemporal%20Cross%20Validation.ipynb

In [None]:
# Load the starting DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle("data/raws_df0.pkl")

In [None]:
# make_st_map_interactive(df)

In [None]:
# Quick overview: number of distinct station IDs and the time span covered by the index
print(f"Total Locations: {len(df.STID.unique())}")
print(f"Earliest Time: {df.index.min()}")
print(f"Latest Time: {df.index.max()}")
# Last expression in the cell so the notebook renders the first rows
df.head()

## Broken Sensor Data Filters

Some RAWS fuel moisture sensors are obviously faulty, such as the one plotted below:

In [None]:
# Plot one month of fuel moisture readings for a single station
st = "SAWC2"
month, year = 2, 2024

# Restrict to the chosen month first, then pick out the station's `fm` series
in_period = (df.index.month == month) & (df.index.year == year)
df_temp = df[in_period]
station_fm = df_temp.loc[df_temp.STID == st, 'fm']

plt.plot(station_fm)
plt.xticks(rotation=90, fontsize=8)
plt.show()

To flag periods of data such as the one above, we will remove all observations for a station if, within any single month, the number of hours in which the fuel moisture content is identical to the previous hour is greater than 24. This means that at least one full day's worth of data in that month is suspect.

In [None]:
def detect_bad_data(df0, max_zero_lags=24):
    """Flag a set of observations whose fuel moisture readings appear stuck.

    Counts how many consecutive-row differences of the ``fm`` column are
    exactly zero and reports whether that count exceeds ``max_zero_lags``.
    The function works only on the frame passed in; it does not rely on any
    notebook-level variables, so the caller is responsible for subsetting
    ``df0`` to the station and time period of interest beforehand.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Observations to check, in time order. Must contain an ``fm`` column.
    max_zero_lags : int, default 24
        Number of zero first-differences tolerated before the data is flagged.

    Returns
    -------
    bool
        True if more than ``max_zero_lags`` first differences of ``fm`` equal
        zero, otherwise False (including for empty or single-row input).
    """
    fm_values = np.asarray(df0['fm'])
    # First difference: lags[i] = fm[i + 1] - fm[i]; a zero means the reading
    # did not change from one row to the next.
    lags = np.diff(fm_values, n=1)
    zero_lags = np.count_nonzero(lags == 0)
    return bool(zero_lags > max_zero_lags)

In [None]:
# Check every (month, station) combination and collect the ones flagged as bad
month_year = df.index.to_period('M').unique()
station_ids = df.STID.unique()
flags = []
for my in month_year:
    print("~"*50)
    month, year = my.month, my.year
    # Rows falling in this calendar month; the same mask serves every station
    in_period = (df.index.month == month) & (df.index.year == year)
    for st in station_ids:
        print(f"Detecting bad data for month: {my}, and Station: {st}")
        df_temp = df[in_period & (df.STID == st)]
        bad = detect_bad_data(df_temp)
        if not bad:
            continue
        print(bad)
        flags.append({'STID': st, 'my': my})

In [None]:
# Display the collected list of flagged station / month entries
flags

All of the flagged periods were from the same station, so we will just drop this one entirely.

In [None]:
# Keep every row except those belonging to the faulty station
df = df.loc[df['STID'] != 'SAWC2']

# Persist the filtered DataFrame as a pickle
out_path = osp.join("data", "raws_df.pkl")
with open(out_path, 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Summary Stats

In [None]:
# Report the smallest and largest values of the `date` column
first_date, last_date = df.date.min(), df.date.max()
print(f"Min Date: {first_date}")
print(f"Max Date: {last_date}")

In [None]:
# Display (rows, columns) of the filtered DataFrame
df.shape

In [None]:
# Min / max / mean of the selected columns, one row per variable
cols_list = ["Ed", "rain", "wind", "solar", "hour", "doy", "lat", "lon", "elev"]
summary_df = df[cols_list].agg(['min', 'max', 'mean']).T.round(2)


def _fmt_stat(value):
    """Render whole numbers without decimals and everything else with two."""
    return f"{value:.0f}" if value.is_integer() else f"{value:.2f}"


# Convert every cell to its display string
summary_df = summary_df.map(_fmt_stat)
summary_df

In [None]:
# Print the DataFrame dimensions as (rows, columns)
print(f"Data Shape: {df.shape}")

In [None]:
# Emit the summary table as LaTeX source
print(summary_df.to_latex())

## Spatiotemporal CV

For a meaningful analysis of forecast error for a spatiotemporal model, the test set must consist of locations that were not included in the training set, observed at times after the training period. To conduct this split, we use a custom function `train_test_split_spacetime` that mimics the return format of the typical `sklearn` function `train_test_split`, while accounting for relationships in space and time.

In [None]:
# Print the source code of the custom split function so it is visible in the notebook
print(inspect.getsource(train_test_split_spacetime))

In [None]:
# Seed through the project's reproducibility helper before splitting
reproducibility.set_seed(42)

# Split settings collected in one place, then passed as keyword arguments
split_kwargs = {"test_days": 2, "spatial_test_frac": 0.2, "verbose": True}
X_train, X_test, y_train, y_test = train_test_split_spacetime(df, **split_kwargs)

## Split Into Periods

The dataset will be divided into various train/test splits. The dataset will be split up by month, and in each month a train/test split will be made. The test period will be 2 days in each case. 

In [None]:
# Get the unique monthly periods (year-month combinations) present in the index
month_year = df.index.to_period('M').unique()
print(month_year)

In [None]:
# Run the same space-time split separately on each calendar month
for my in month_year:
    print("~"*50)
    month, year = my.month, my.year
    print(f"Splitting data for month: {my}")
    in_month = (df.index.month == month) & (df.index.year == year)
    df_temp = df[in_month]
    print(f"Total observations: {df_temp.shape}")
    X_train, X_test, y_train, y_test = train_test_split_spacetime(
        df_temp, test_days=2, spatial_test_frac=0.2, verbose=True
    )

## Exploratory Plots

### FMC Plots

In [None]:
# Fuel moisture and rain for one station over a fixed window of row positions
dat = df[df.STID == "CHAC2"]
times = np.arange(1600, 1750)
fm_window = dat.fm.iloc[times]
rain_window = dat.rain.iloc[times]

plt.figure(figsize=(6, 4))
plt.plot(fm_window, label='FM Observed', c='#8BC084', linestyle='-')
plt.plot(rain_window, label='Rain', c='b', alpha=.8)
plt.title("RAWS Station CHAC2 - No Rain")
plt.xlabel("Time")
plt.ylabel("FM (%)")
plt.ylim(0, 14)
plt.xticks(rotation=90, fontsize=8)
plt.grid()
# Legend is anchored outside the axes, to the right of the plot
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
plt.tight_layout()
plt.savefig('outputs/no_rain_plot.png')
plt.show()

In [None]:
# Reuses `dat` from the previous plotting cell; a longer window of row positions
times = np.arange(1600, 1820)
fm_window = dat.fm.iloc[times]
rain_window = dat.rain.iloc[times]

plt.figure(figsize=(8, 4))
plt.plot(fm_window, label='FM Observed', c='#8BC084', linestyle='-')
plt.plot(rain_window, label='Rain', c='b', alpha=.8)
plt.title("RAWS Station CHAC2 - With Rain")
plt.xlabel("Time")
plt.ylabel("FM (%)")
plt.ylim(0, 27)
plt.xticks(rotation=90, fontsize=8)
plt.grid()
# Legend is anchored outside the axes, to the right of the plot
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
plt.tight_layout()
plt.savefig('outputs/rain_plot.png')
plt.show()

In [None]:
# Reuses `dat` from the earlier plotting cell; adds the two equilibrium series
plt.figure(figsize=(6, 4))
times = np.arange(1600, 1750)
window = dat.iloc[times]

plt.plot(window.fm, label='FM Observed', c='#8BC084', linestyle='-')
plt.plot(window.rain, label='Rain', c='b', alpha=.8)
plt.plot(window.Ew, label='Wetting Equilibrium', c='#7CCCEF', linestyle='--', alpha=.8)
plt.plot(window.Ed, label='Drying Equilibrium', c='#EF847C', linestyle='--', alpha=.8)
plt.title("RAWS Station CHAC2 - No Rain")
plt.xlabel("Time")
plt.ylabel("FM (%)")
plt.ylim(0, 14)
plt.xticks(rotation=90, fontsize=8)
plt.grid()
# Legend is anchored outside the axes, to the right of the plot
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize='small')
plt.tight_layout()
plt.savefig('outputs/eq_plot.png')
plt.show()

In [None]:
# Scatter fuel moisture against relative humidity (left) and temperature (right)
hours = 1000
rh_head, temp_head, fm_head = (dat[col][:hours] for col in ('rh', 'temp', 'fm'))

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(rh_head, fm_head, alpha=.8)
ax1.set_xlabel("Relative Humidity (%)")
ax1.set_ylabel("Fuel Moisture (%)")
ax2.scatter(temp_head, fm_head, alpha=.8)
ax2.set_xlabel("Temp (deg K)")
plt.savefig('outputs/rh_temp_plot.png')

In [None]:
# Drop rows with missing values before computing the correlations
dat = dat.dropna()

# Off-diagonal entry of the 2x2 correlation matrix is the pairwise coefficient
rh_fm_corr = np.corrcoef(dat['rh'][:hours], dat['fm'][:hours])[0, 1]
temp_fm_corr = np.corrcoef(dat['temp'][:hours], dat['fm'][:hours])[0, 1]
print(f"RH correlation with FMC: {rh_fm_corr}")
print(f"Temp correlation with FMC: {temp_fm_corr}")

In [None]:
# Scatter the `Ew` series against relative humidity (left) and temperature (right)
rh_head, temp_head, ew_head = (dat[col][:hours] for col in ('rh', 'temp', 'Ew'))

fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.scatter(rh_head, ew_head, alpha=.8)
ax1.set_xlabel("Relative Humidity (%)")
ax1.set_ylabel("Equilibrium FM (%)")
ax2.scatter(temp_head, ew_head, alpha=.8)
ax2.set_xlabel("Temp (deg K)")
plt.savefig('outputs/eq_rh_temp_plot.png')

In [None]:
# Pairwise correlation of the `Ew` series with relative humidity and temperature
rh_eq_corr = np.corrcoef(dat['rh'][:hours], dat['Ew'][:hours])[0, 1]
temp_eq_corr = np.corrcoef(dat['temp'][:hours], dat['Ew'][:hours])[0, 1]
print(f"RH correlation with EQ: {rh_eq_corr}")
print(f"Temp correlation with EQ: {temp_eq_corr}")

### ROS Plots