In [None]:
import pandas as pd
import numpy as np
import xgboost as xg
import tensorflow as tf
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import subprocess
import os
import os.path as osp

# Fetch the FMDA data dictionary once and cache it under ../data; later runs
# reuse the cached pickle instead of downloading it again.
if not osp.exists("../data/co_202307-09.pkl"):
    print("Retrieving FMDA data")
    # Create the directory the file is actually written to and read from
    # (../data, relative to the notebook), not a stray ./data directory.
    os.makedirs("../data", exist_ok=True)
    # Pass the command as an argument list so no shell has to parse it.
    wget_status = subprocess.call(
        ["wget", "-P", "../data", "https://demo.openwfm.org/web/data/fmda/dicts/co_202307-09.pkl"]
    )
    assert osp.exists("../data/co_202307-09.pkl"), (
        f"Download failed (wget exit status {wget_status}): ../data/co_202307-09.pkl not found"
    )
    print("Downloaded https://demo.openwfm.org/web/data/fmda/dicts/co_202307-09.pkl as ../data/co_202307-09.pkl")

# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a source you trust.
dat = pd.read_pickle("../data/co_202307-09.pkl")

In [None]:
# Columns of the combined table, in output order.
column_names = ['STID', 'lat', 'lon', 'elevation', 'fm', 'Ed', 'Ew', 'wind', 'rain', 'time']

# Fields read from each entry's "HRRR" sub-dictionary; the remaining columns
# come from the "RAWS" (fm) and "loc" (STID, elev, lat, lon) sub-dictionaries.
hrrr_keys = [name for name in column_names if name not in ("STID", "elevation", "fm", "lat", "lon")]

# Build one frame per entry of the data dictionary and concatenate once at the
# end. Concatenating inside the loop is quadratic, and starting from an empty
# object-dtype frame relies on pandas' deprecated handling of empty entries in
# concat (which can leave numeric columns as object dtype).
station_frames = []
skipped_stations = []
for k in dat:
    # Get FM from RAWS subdict
    fm = dat[k]["RAWS"]["fm"]
    # Get atmospheric data from HRRR subdict
    df_temp = pd.DataFrame({key: dat[k]["HRRR"][key] for key in hrrr_keys})

    # Entries whose RAWS and HRRR series differ in length cannot be aligned
    # row by row, so they are left out (and reported below).
    if len(fm) != len(df_temp):
        skipped_stations.append(k)
        continue

    # Static location values must be repeated to match the frame length.
    n_rows = len(fm)
    df_temp["lat"] = np.repeat(dat[k]["loc"]["lat"], n_rows)
    df_temp["lon"] = np.repeat(dat[k]["loc"]["lon"], n_rows)
    df_temp["fm"] = fm
    df_temp["elevation"] = np.repeat(dat[k]["loc"]["elev"], n_rows)
    df_temp["STID"] = np.repeat(dat[k]["loc"]["STID"], n_rows)
    station_frames.append(df_temp.reindex(columns=column_names))

if skipped_stations:
    print(f"Skipped {len(skipped_stations)} entries with mismatched RAWS/HRRR lengths: {skipped_stations}")

# Fall back to an empty, correctly-labelled frame when nothing was usable.
if station_frames:
    df = pd.concat(station_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=column_names)

# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Derive time features on a copy of df, then publish the result back as df.
df_copy = df.copy()

# Parse the 'time' column into timezone-aware (UTC) timestamps.
df_copy['date_column'] = pd.to_datetime(df_copy['time'], utc=True)

# Day of month and hour of day taken from the parsed timestamps.
df_copy = df_copy.assign(
    day=df_copy['date_column'].dt.day,
    hour=df_copy['date_column'].dt.hour,
)

# t = whole hours elapsed since the earliest timestamp in the table
# (so the earliest row has t == 0; fractional hours are truncated by the int cast).
min_time = df_copy['date_column'].min()
df_copy['t'] = ((df_copy['date_column'] - min_time).dt.total_seconds() / 3600).astype(int)

df = df_copy.copy()

In [None]:
# Show the assembled DataFrame as this cell's output for a quick visual check.
df

In [None]:
from sklearn.model_selection import train_test_split
# Split the rows into 80% training / 20% test data, using the Ed and Ew
# columns as features and fm as the target. A fixed random_state makes the
# split (and every metric computed from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[["Ed", "Ew"]], df['fm'], test_size=.2, random_state=42
)

In [None]:
# Create a small gradient-boosted regressor: 2 trees of depth 2 with a
# learning rate of 1. 'reg:squarederror' is the current name of the
# squared-error objective; the old alias 'reg:linear' is deprecated.
bst = xg.XGBRegressor(n_estimators=2, max_depth=2, learning_rate=1, objective='reg:squarederror')
# Fit the model on the training split.
bst.fit(X_train, y_train)

In [None]:
# Model output on the training split (fitted values) and on the held-out
# test split (predictions).
fitted, preds = bst.predict(X_train), bst.predict(X_test)

In [None]:
# Score the model on both splits: RMSE (root of the mean squared error, in
# the same units as the target) and the R-squared coefficient.
rmse_train, r2_train = np.sqrt(mean_squared_error(y_train, fitted)), r2_score(y_train, fitted)
rmse_test, r2_test = np.sqrt(mean_squared_error(y_test, preds)), r2_score(y_test, preds)

# Report the four metrics, training split first.
for metric_label, metric_value in (
    ("RMSE for training data:", rmse_train),
    ("R-squared for training data:", r2_train),
    ("RMSE for test data:", rmse_test),
    ("R-squared for test data:", r2_test),
):
    print(metric_label, metric_value)