In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Cross-Validation Setup 

The purpose of this notebook is to split the collected data into training and test sets.

To get reliable estimates of the forecast error of a spatiotemporal model, care must be taken to avoid data leakage between the training and test sets, in both space (stations) and time. See: https://github.com/jh-206/FRAMSC-2024---FMDA-Data-and-CV-Methods/blob/main/Spatiotemporal%20Cross%20Validation.ipynb

In [None]:
# Load the collected observations from the project's pickle file.
# The path is relative, so the notebook must be run from the repository root.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer="data/rocky_2023_06-08.pkl")

In [None]:
from src.utils import make_st_map

In [None]:
# Call the make_st_map helper imported from src.utils on the loaded data.
# NOTE(review): the result is bound to `x`, but no visible cell reads `x` afterwards;
# confirm whether the return value is needed.
x = make_st_map(df)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon

def make_st_map(df, buf=2.5):
    """Plot the station locations in ``df`` on a street-map background.

    A new matplotlib figure is created. A cylindrical-projection Basemap is
    drawn over the extent of the coordinates padded by ``buf`` on every side,
    each (lon, lat) row of ``df`` is scattered on top, and the unpadded
    coordinate extent is outlined with a red rectangle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``lon`` and ``lat`` columns (longitude / latitude as
        expected by Basemap's ``'cyl'`` projection).
    buf : float, default 2.5
        Padding added to each side of the coordinate extent when choosing the
        map corners, in the same units as ``lon`` / ``lat``.

    Returns
    -------
    None
        The map is drawn on a freshly created figure; nothing is returned.

    Notes
    -----
    The background image is requested through ``Basemap.arcgisimage``, which
    retrieves it from an ArcGIS REST service.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Extent of the data; reused for both the map corners and the bounding box.
    lon_min, lon_max = np.amin(df['lon']), np.amax(df['lon'])
    lat_min, lat_max = np.amin(df['lat']), np.amax(df['lat'])

    # Bind the map to the axes created above instead of relying on pyplot's
    # implicit "current axes"; the local is not called `map` so the builtin
    # stays usable inside this function.
    bmap = Basemap(projection='cyl',
                   llcrnrlat=lat_min - buf, urcrnrlat=lat_max + buf,
                   llcrnrlon=lon_min - buf, urcrnrlon=lon_max + buf,
                   resolution="i", ax=ax)
    bmap.drawcoastlines()
    bmap.drawcountries()
    bmap.drawstates()

    bmap.arcgisimage(service="World_Street_Map", xpixels=1000, verbose=False)

    # Project the coordinates into map space and draw one marker per row.
    x, y = bmap(df['lon'].values, df['lat'].values)
    bmap.scatter(x, y, marker='o', edgecolor='black', alpha=.3, linewidth=.7)

    # Add a rectangle representing the (unpadded) bounding box of the data.
    vertices = [bmap(lon_min, lat_max), bmap(lon_max, lat_max),
                bmap(lon_max, lat_min), bmap(lon_min, lat_min)]
    bounding_box = Polygon(vertices, edgecolor='r', alpha=.5, linewidth=1.5, facecolor='none', zorder=5)

    ax.add_patch(bounding_box)

    return

In [None]:
# Draw the station map with the make_st_map defined in the cell above
# (that definition replaces the one imported from src.utils earlier).
make_st_map(df=df)

## Split Stations

The stations are divided into two sets: one is used to train models, and the other is held out for prediction.

In [None]:
# Split at the station level: every unique station ID is assigned to exactly
# one of the two groups.
sts = df["stid"].unique()

# Hold out 20% of the station IDs; the fixed seed keeps the split reproducible.
train_stid, test_stid = train_test_split(sts, random_state=42, test_size=0.2)

# Report the size of each group.
print(f"Number of Unique RAWS Stations: {len(sts)}")
print(f"Number of Training Sites: {len(train_stid)}")
print(f"Number of Test Sites: {len(test_stid)}")

## Split Times