# Building Training Data for FMC Models

The purpose of this notebook is to combine the weather data from OK Mesonet, and from Van der Kamp when necessary, with the field observations from Carlson into datasets for training and evaluating models of FMC for various fuel classes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import sys
sys.path.append("src")
from utils import read_yml, plot_styles

In [None]:
# Four workbooks named by time-lag class (1h / 10h / 100h / 1000h).
# NOTE(review): presumably the field FMC observations described in the
# introduction — confirm; they are not read by the weather-cleaning cells below.
df1    = pd.read_excel("data/processed_data/ok_1h.xlsx")
df10   = pd.read_excel("data/processed_data/ok_10h.xlsx")
df100  = pd.read_excel("data/processed_data/ok_100h.xlsx")
df1000 = pd.read_excel("data/processed_data/ok_1000h.xlsx")

# Weather sources: dweather is the DVDK record used as a fallback further down,
# mweather is the OK Mesonet record that the following cells clean in place.
dweather = pd.read_excel("data/processed_data/dvdk_weather.xlsx")
mweather = pd.read_excel("data/processed_data/mesonet.xlsx")

In [None]:
# Weather columns treated as required; used below to pick which mweather
# columns are checked for missing values.
required_weather_vars = ["Ed", "Ew", "solar", "wind", "rain"]

## Join Weather

The OK Mesonet data is missing air temp for parts of the study period, so it is joined with air temp data from Van der Kamp.

* Identify missing data
* Linearly interpolate sections of missing data if less than or equal to 3 consecutive hours
* Fill in remaining missing data from Van der Kamp where applicable

### Handle Missing Observations

Identify areas with spot missing data that can be replaced with interpolation.

In [None]:
# Keep the timestamp plus the required weather columns, then flag every row
# in which at least one of those columns is missing.
key_columns = ["date", *required_weather_vars]
dfw = mweather[key_columns]
mask = dfw.isna().any(axis="columns")

n_observations = len(mweather)
n_incomplete = mask.sum()
print(f"Number of Half-Hourly Observations from OK Mesonet: {n_observations}")
print(f"Number of Half-Hour Periods with Missing Key Variables: {n_incomplete}")

#### Missing Solar

Total missing times: 6

Longest temporal streak of missing data: 1 hr (2 half-hourly observations)

Methodology: manual linear interp

In [None]:
# Count the rows without a solar radiation reading and display them.
solar_missing = mweather["solar"].isna()
print(f"Number of Missing Solar Radiation Observations: {solar_missing.sum()}")
mweather.loc[solar_missing]

In [None]:
# Interp Values
# Fill the spot gaps in solar radiation with a straight line between the
# nearest valid readings on either side. Row numbers are hard-coded.
# NOTE(review): reads are positional (.iloc) and writes are by label (.loc);
# this assumes mweather still has its default 0..n-1 index — confirm.

# Gap 1: one missing reading -> mean of its two neighbours.
gap = 5595
intp1 = (mweather["solar"].iloc[gap - 1] + mweather["solar"].iloc[gap + 1]) / 2
mweather.loc[gap, "solar"] = intp1

# Gap 2: two missing readings between valid rows 9534 and 9537.
# m is the per-step slope; b is chosen so that step k maps to m*k + b.
left, right = 9534, 9537
m = (mweather["solar"].iloc[right] - mweather["solar"].iloc[left]) / 3
b = mweather["solar"].iloc[left] - m
intp2a = m*2 + b
intp2b = m*3 + b
mweather.loc[9535:9536, "solar"] = [intp2a, intp2b]

# Gap 3: two missing readings between valid rows 24025 and 24028.
left, right = 24025, 24028
m = (mweather["solar"].iloc[right] - mweather["solar"].iloc[left]) / 3
b = mweather["solar"].iloc[left] - m
intp3a = m*2 + b
intp3b = m*3 + b
mweather.loc[24026:24027, "solar"] = [intp3a, intp3b]

# Gap 4: one missing reading -> mean of its two neighbours.
gap = 26967
intp4 = (mweather["solar"].iloc[gap - 1] + mweather["solar"].iloc[gap + 1]) / 2
mweather.loc[gap, "solar"] = intp4

In [None]:
# Visual check of the filled solar gaps: one panel per gap. Each panel shows
# the series around the gap ('o-'), the segment spanning the gap in red, and
# the interpolated value(s) as red markers.
fmt = mdates.DateFormatter('%Y-%m-%d %H:%M')
fig, ax = plt.subplots(figsize=(10, 16), nrows=4, ncols=1)

ax[0].plot(mweather.iloc[5580:5610].date, mweather.iloc[5580:5610].solar, 'o-')
ax[0].plot(mweather.iloc[5594:5597].date, mweather.iloc[5594:5597].solar, 'r-')
ax[0].plot(mweather.iloc[5595].date, intp1, 'ro')

ax[1].plot(mweather.iloc[9525:9545].date, mweather.iloc[9525:9545].solar, 'o-')
ax[1].plot(mweather.iloc[9534:9538].date, mweather.iloc[9534:9538].solar, 'r-')
ax[1].plot(mweather.iloc[9535:9537].date, [intp2a, intp2b], 'ro-')

ax[2].plot(mweather.iloc[24015:24035].date, mweather.iloc[24015:24035].solar, 'o-')
ax[2].plot(mweather.iloc[24025:24029].date, mweather.iloc[24025:24029].solar, 'r-')
ax[2].plot(mweather.iloc[24026:24028].date, [intp3a, intp3b], 'ro-')

ax[3].plot(mweather.iloc[26955:26975].date, mweather.iloc[26955:26975].solar, 'o-')
ax[3].plot(mweather.iloc[26966:26969].date, mweather.iloc[26966:26969].solar, 'r-')
ax[3].plot(mweather.iloc[26967].date, intp4, 'ro')

for a in ax:
    # One fresh locator per axis. A single locator instance used to be
    # attached to every axis first and then immediately replaced, which was
    # redundant (and a locator should not be shared between axes).
    a.xaxis.set_major_locator(mdates.AutoDateLocator())
    a.xaxis.set_major_formatter(fmt)
    a.tick_params(axis='x', rotation=45)
    a.grid()
    a.set_ylabel(r"Solar Radiation ($\text{Wm}^{-2}$)")

fig.suptitle("Interpolated Solar Radiation", fontsize=16)
fig.tight_layout()

#### Missing Wind

Total missing times: 71

Same spot missing data as for solar, with an additional streak of missing data from 1997-12-21 07:00:00 to 1997-12-22 15:30:00. This stretch will be kept as missing

In [None]:
# Count the rows without a wind speed reading and display them.
wind_missing = mweather["wind"].isna()
print(f"Number of Missing Wind Speed Observations: {wind_missing.sum()}")
mweather.loc[wind_missing]

In [None]:
# Rows with missing wind, taken from dfw (the column subset built from
# mweather before any gaps were filled).
# NOTE(review): df2 is not displayed here and is reassigned in the Equilibria
# section before being read — this cell looks like leftover scratch work.
df2 = dfw[dfw.wind.isna()]

In [None]:
# Interp Values
# Fill the spot gaps in wind speed with a straight line between the nearest
# valid readings on either side. Row numbers are hard-coded.
# NOTE(review): reads are positional (.iloc) and writes are by label (.loc);
# this assumes mweather still has its default 0..n-1 index — confirm.

# Gap 1: one missing reading -> mean of its two neighbours.
gap = 5595
intp1 = (mweather["wind"].iloc[gap - 1] + mweather["wind"].iloc[gap + 1]) / 2
mweather.loc[gap, "wind"] = intp1

# Gap 2: two missing readings between valid rows 9534 and 9537.
# m is the per-step slope; b is chosen so that step k maps to m*k + b.
left, right = 9534, 9537
m = (mweather["wind"].iloc[right] - mweather["wind"].iloc[left]) / 3
b = mweather["wind"].iloc[left] - m
intp2a = m*2 + b
intp2b = m*3 + b
mweather.loc[9535:9536, "wind"] = [intp2a, intp2b]

# Gap 3: two missing readings between valid rows 24025 and 24028.
left, right = 24025, 24028
m = (mweather["wind"].iloc[right] - mweather["wind"].iloc[left]) / 3
b = mweather["wind"].iloc[left] - m
intp3a = m*2 + b
intp3b = m*3 + b
mweather.loc[24026:24027, "wind"] = [intp3a, intp3b]

# Gap 4: one missing reading -> mean of its two neighbours.
gap = 26967
intp4 = (mweather["wind"].iloc[gap - 1] + mweather["wind"].iloc[gap + 1]) / 2
mweather.loc[gap, "wind"] = intp4

In [None]:
# Visual check of the filled wind gaps: one panel per gap. Each panel shows
# the series around the gap ('o-'), the segment spanning the gap in red, and
# the interpolated value(s) as red markers.
fmt = mdates.DateFormatter('%Y-%m-%d %H:%M')
fig, ax = plt.subplots(figsize=(10, 16), nrows=4, ncols=1)

ax[0].plot(mweather.iloc[5580:5610].date, mweather.iloc[5580:5610].wind, 'o-')
ax[0].plot(mweather.iloc[5594:5597].date, mweather.iloc[5594:5597].wind, 'r-')
ax[0].plot(mweather.iloc[5595].date, intp1, 'ro')

ax[1].plot(mweather.iloc[9525:9545].date, mweather.iloc[9525:9545].wind, 'o-')
ax[1].plot(mweather.iloc[9534:9538].date, mweather.iloc[9534:9538].wind, 'r-')
ax[1].plot(mweather.iloc[9535:9537].date, [intp2a, intp2b], 'ro-')

ax[2].plot(mweather.iloc[24015:24035].date, mweather.iloc[24015:24035].wind, 'o-')
ax[2].plot(mweather.iloc[24025:24029].date, mweather.iloc[24025:24029].wind, 'r-')
ax[2].plot(mweather.iloc[24026:24028].date, [intp3a, intp3b], 'ro-')

ax[3].plot(mweather.iloc[26955:26975].date, mweather.iloc[26955:26975].wind, 'o-')
ax[3].plot(mweather.iloc[26966:26969].date, mweather.iloc[26966:26969].wind, 'r-')
ax[3].plot(mweather.iloc[26967].date, intp4, 'ro')

for a in ax:
    # One fresh locator per axis. A single locator instance used to be
    # attached to every axis first and then immediately replaced, which was
    # redundant (and a locator should not be shared between axes).
    a.xaxis.set_major_locator(mdates.AutoDateLocator())
    a.xaxis.set_major_formatter(fmt)
    a.tick_params(axis='x', rotation=45)
    a.grid()
    a.set_ylabel(r"Wind Speed ($\text{ms}^{-1}$)")

fig.suptitle("Interpolated Wind speed", fontsize=16)
fig.tight_layout()

#### Hourly Rain

Total Missing: 17 

Same spot missing times as solar and wind, except shifted; these will be linearly interpolated. There is an additional missing stretch from 1997-02-27 18:00:00 to 1997-02-28 00:00:00. This stretch will be set to zero based on DVDK weather.

In [None]:
# Count the rows without a rain reading and display them.
rain_missing = mweather["rain"].isna()
print(f"Number of Missing Rain Observations: {rain_missing.sum()}")
mweather.loc[rain_missing]

In [None]:
# Interp Values
# Fill the spot gaps in rain with a straight line between the nearest valid
# readings on either side. Row numbers are hard-coded.
# NOTE(review): reads are positional (.iloc) and writes are by label (.loc);
# this assumes mweather still has its default 0..n-1 index — confirm.

# Gap 1: one missing reading -> mean of its two neighbours.
gap = 9536
intp1 = (mweather["rain"].iloc[gap - 1] + mweather["rain"].iloc[gap + 1]) / 2
mweather.loc[gap, "rain"] = intp1

# Gap 2: two missing readings between valid rows 24026 and 24029.
# m is the per-step slope; b is chosen so that step k maps to m*k + b.
left, right = 24026, 24029
m = (mweather["rain"].iloc[right] - mweather["rain"].iloc[left]) / 3
b = mweather["rain"].iloc[left] - m
intp2a = m*2 + b
intp2b = m*3 + b
mweather.loc[24027:24028, "rain"] = [intp2a, intp2b]

# Gap 3: one missing reading -> mean of its two neighbours.
gap = 26968
intp3 = (mweather["rain"].iloc[gap - 1] + mweather["rain"].iloc[gap + 1]) / 2
mweather.loc[gap, "rain"] = intp3

In [None]:
# Visual check of the filled rain gaps: one panel per gap. Each panel shows
# the series around the gap ('o-'), the segment spanning the gap in red, and
# the interpolated value(s) as red markers.
fmt = mdates.DateFormatter('%Y-%m-%d %H:%M')
fig, ax = plt.subplots(figsize=(10, 16), nrows=3, ncols=1)

ax[0].plot(mweather.iloc[9526:9546].date, mweather.iloc[9526:9546].rain, 'o-')
ax[0].plot(mweather.iloc[9535:9538].date, mweather.iloc[9535:9538].rain, 'r-')
ax[0].plot(mweather.iloc[9536].date, intp1, 'ro-')

ax[1].plot(mweather.iloc[24015:24035].date, mweather.iloc[24015:24035].rain, 'o-')
ax[1].plot(mweather.iloc[24026:24030].date, mweather.iloc[24026:24030].rain, 'r-')
ax[1].plot(mweather.iloc[24027:24029].date, [intp2a, intp2b], 'ro-')

ax[2].plot(mweather.iloc[26955:26975].date, mweather.iloc[26955:26975].rain, 'o-')
ax[2].plot(mweather.iloc[26967:26970].date, mweather.iloc[26967:26970].rain, 'r-')
ax[2].plot(mweather.iloc[26968].date, intp3, 'ro')

for a in ax:
    # One fresh locator per axis. A single locator instance used to be
    # attached to every axis first and then immediately replaced, which was
    # redundant (and a locator should not be shared between axes).
    a.xaxis.set_major_locator(mdates.AutoDateLocator())
    a.xaxis.set_major_formatter(fmt)
    a.tick_params(axis='x', rotation=45)
    a.grid()
    a.set_ylabel(r"Rain ($\text{mh}^{-1}$)")

fig.suptitle("Interpolated Hourly Rain", fontsize=16)
fig.tight_layout()

In [None]:
# Check other weather source for rain at that time
# Window = first to last timestamp at which Mesonet rain is still missing
# (both ends inclusive); show the DVDK rain values inside it.
missing_rain_dates = mweather.loc[mweather.rain.isna(), "date"]
in_missing_window = dweather.date.between(missing_rain_dates.min(), missing_rain_dates.max())
dweather.loc[in_missing_window, "rain"]

In [None]:
# Check time lags Fill with zeros
# Print the distinct spacings between the remaining missing-rain timestamps,
# then set every remaining missing rain value to zero.
rain_still_missing = mweather["rain"].isna()
print(mweather.loc[rain_still_missing, "date"].diff().unique())
mweather.loc[rain_still_missing, "rain"] = 0.0

#### Equilibria

Constructed from air temp and RH. Air temp is missing at Slapout during Jan 1996 - Feb 1997, so filling in some using portable weather info delivered from DVDK. That data was hourly resolution, so for this stretch linearly interpolating to half-hourly.

DVDK weather goes through Feb 27 1997. After that, there are a couple of spot gaps in the OK Mesonet data, which are linearly interpolated, and one stretch from 1997-06-16 22:30:00 through 1997-06-17 23:00:00 that will be kept as missing.

In [None]:
# Count the rows with no Ed value. The label says "Missing" so it matches
# what is actually counted (and the solar / wind / rain reports above);
# it previously read as if it were the total number of observations.
ed_missing_count = mweather.Ed.isna().sum()
print(f"Number of Missing Equilibria Observations: {ed_missing_count}")
# mweather.Ew.isna().sum() # same number

In [None]:
# Leading run of rows without Ed: everything before the first non-missing value.
ind0 = np.where(mweather.Ed.notna())[0][0]  # First index of not-missing data
df2 = mweather.iloc[:ind0]
t0, t1 = df2.date.min(), df2.date.max()

# The check passes when diff() yields exactly two distinct values:
# the NA at the beginning of the time diff plus a single step size.
distinct_lags = df2.date.diff().unique()
print(f"Consecutive Half-Hour times for missing Eqs: {len(distinct_lags) == 2}")
print(f"Start time of missing Eq: {t0}")
print(f"End time of missing Eq: {t1}")

In [None]:
# Getting data from DVDK
# Keep the DVDK rows whose timestamp lies in [t0, t1] (both ends inclusive).
in_gap = dweather.date.between(t0, t1)
df3 = dweather[in_gap]

# Half-hour resolution, linear interp
# Re-grid onto 30-minute steps and fill the new rows by time-weighted
# interpolation, then restore "date" as a regular column.
df3_by_date = df3.set_index("date")
df3_hh = df3_by_date.resample("30min").interpolate(method="time").reset_index()

# Distinct time steps before and after re-gridding ([1:] drops the leading
# NA that diff() produces for the first row).
print(df3.date.diff().unique()[1:])
print(df3_hh.date.diff().unique()[1:])

In [None]:
# Visualize the interp
# Overlay the first `tsteps` rows of the original DVDK Ed series with the
# matching span of the half-hourly interpolation (two rows per original row,
# hence 2*tsteps). `tsteps` was previously defined but unused, with the
# literal 168 repeated in every slice.
tsteps = 168
plt.plot(df3.date.iloc[:tsteps], df3.Ed.iloc[:tsteps], '-', label="Observed")
plt.xticks(rotation=45)
plt.plot(df3_hh.date.iloc[:2*tsteps], df3_hh.Ed.iloc[:2*tsteps], '--', label="Interpolated")
plt.legend()

In [None]:
# Align both sources on their timestamp and let the Mesonet values take
# priority; DVDK values are only used where the Mesonet value is missing.
mesonet_by_date = mweather.set_index("date")
dvdk_by_date = df3_hh.set_index("date")
mweather_filled = mesonet_by_date.combine_first(dvdk_by_date).reset_index()

# From here on t0 / t1 are the first and last half-hourly DVDK timestamps.
t0 = df3_hh["date"].min()
t1 = df3_hh["date"].max()

In [None]:
# Fraction of missing Ed inside [t0, t1] (both ends inclusive),
# before and after filling from DVDK.
window_before = mweather[mweather.date.between(t0, t1)]
window_after = mweather_filled[mweather_filled.date.between(t0, t1)]
print(window_before.Ed.isna().mean())
print(window_after.Ed.isna().mean())

In [None]:
# Remove period in start of 1996 with missing data, fill spot missing
# Fraction of missing Ed in the rows that are about to be dropped.
print(mweather_filled[mweather_filled.date < t0].Ed.isna().mean())

# Take an explicit copy: later cells assign into df_out with .loc, and
# writing into a filtered slice of mweather_filled triggers pandas'
# SettingWithCopyWarning.
df_out = mweather_filled[mweather_filled.date >= t0].copy()
print(df_out.Ed.isna().sum())

df_out[df_out.Ed.isna()].head()

In [None]:
# Interp Values
# Fill the spot gaps in Ed and Ew with a straight line between the nearest
# valid rows. All row references are index labels of df_out (.loc), not
# positions. After the loops, m, b, intp1a, intp1b and intp3 hold the Ew values.

# Two consecutive missing rows (24026, 24027) between valid rows 24025 and 24028.
# m is the per-step slope; b is chosen so that step k maps to m*k + b.
for eq_col in ["Ed", "Ew"]:
    m = (df_out.loc[24028, eq_col] - df_out.loc[24025, eq_col]) / 3
    b = df_out.loc[24025, eq_col] - m
    intp1a = m*2 + b
    intp1b = m*3 + b
    df_out.loc[24026:24027, eq_col] = [intp1a, intp1b]

# Single missing row 26967 -> mean of its two neighbours.
for eq_col in ["Ed", "Ew"]:
    intp3 = (df_out.loc[26967 - 1, eq_col] + df_out.loc[26967 + 1, eq_col]) / 2
    df_out.loc[26967, eq_col] = intp3

In [None]:
# Visual check of the filled Ed gaps: one panel per gap. Each panel shows the
# series around the gap ('o-'), the segment spanning the gap in red, and the
# filled rows as red markers. Slices use index labels of df_out (.loc), so
# both end points are included.
fmt = mdates.DateFormatter('%Y-%m-%d %H:%M')
fig, ax = plt.subplots(figsize=(10, 12), nrows=2, ncols=1)

ax[0].plot(df_out.loc[24015:24035].date, df_out.loc[24015:24035].Ed, 'o-')
ax[0].plot(df_out.loc[24025:24028].date, df_out.loc[24025:24028].Ed, 'r-')
ax[0].plot(df_out.loc[24026:24027].date, df_out.loc[24026:24027, 'Ed'], 'ro-')

ax[1].plot(df_out.loc[26957:26977].date, df_out.loc[26957:26977].Ed, 'o-')
ax[1].plot(df_out.loc[26966:26968].date, df_out.loc[26966:26968].Ed, 'r-')
ax[1].plot(df_out.loc[26967].date, df_out.loc[26967, 'Ed'], 'ro-')

for a in ax:
    # One fresh locator per axis. A single locator instance used to be
    # attached to every axis first and then immediately replaced, which was
    # redundant (and a locator should not be shared between axes).
    a.xaxis.set_major_locator(mdates.AutoDateLocator())
    a.xaxis.set_major_formatter(fmt)
    a.tick_params(axis='x', rotation=45)
    a.grid()
    a.set_ylabel(r"Ed (%)")

fig.suptitle("Interpolated Hourly Drying Equilibrium", fontsize=16)
fig.tight_layout()

### Summarise Stretches of Missing Data

In [None]:
# Rows of df_out that are still missing wind / Ed / Ew after cleaning.
wind_gaps = df_out[df_out.wind.isna()]
ed_gaps = df_out[df_out.Ed.isna()]
ew_gaps = df_out[df_out.Ew.isna()]

print(f"Missing Rain: {df_out.rain.isna().sum()}")
print(f"Missing Solar: {df_out.solar.isna().sum()}")
print(f"Missing Wind: {wind_gaps.shape[0]}")
# Check consecutive half-hour, unique date diffs should be NA for first time and then half hour
print(f"    Time lags: {wind_gaps.date.diff().unique()[1:].astype(str)}")
print(f"Missing Ed: {ed_gaps.shape[0]}")
print(f"    Time lags: {ed_gaps.date.diff().unique()[1:].astype(str)}")
print(f"Missing Ew: {ew_gaps.shape[0]}")
print(f"    Time lags: {ew_gaps.date.diff().unique()[1:].astype(str)}")
print("~"*50)
# First and last timestamp of each remaining gap.
print("Time Stretches with Missing Data - Can't use for Modeling")
print(f"    {ed_gaps.date.min()} to {ed_gaps.date.max()}")
print(f"    {wind_gaps.date.min()} to {wind_gaps.date.max()}")

### Write Out

In [None]:
# Persist the cleaned weather table.
# NOTE(review): no `index` argument is passed, so pandas' default index
# handling applies — confirm whether downstream readers expect that column.
df_out.to_excel("data/processed_data/weather_cleaned.xlsx")