<a href="https://colab.research.google.com/github/jstephens/tideprediction/blob/main/Tide_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries


In [43]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
from bokeh.io import show
from bokeh.models import ColumnDataSource, Range1d
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import DatetimeTickFormatter, DatetimeAxis
from bokeh.models import HoverTool
from bokeh.models import FixedTicker
from bokeh.embed import components
from bokeh.embed import autoload_static
output_notebook()
import imageio
import matplotlib.pyplot as plt
from datetime import datetime
import xgboost as xgb
import datetime
import joblib
from sklearn.multioutput import MultiOutputRegressor
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from tensorflow.keras import backend as K
from keras_tuner import RandomSearch
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from keras.utils import plot_model

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Load and Initial Transformation

In [2]:
X_train = np.load('drive/MyDrive/source/X_train_surge_new.npz')
X_test = np.load('drive/MyDrive/source/X_test_surge_new.npz')
Y_train = pd.read_csv('drive/MyDrive/source/Y_train_surge.csv',index_col = 'id_sequence')

df_X_test = pd.DataFrame.from_dict({item: X_test[item] for item in X_test.files}, orient='index').T.set_index('id_sequence')
df_X_train = pd.DataFrame.from_dict({item: X_train[item] for item in X_train.files}, orient='index').T.set_index('id_sequence')

In [3]:
def timestamp_to_datetime(timestamp):
    return pd.to_datetime('1970-01-01') + pd.to_timedelta(timestamp, unit='s')

df_X_train['t_slp'] = df_X_train['t_slp'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_train['t_surge1_input'] = df_X_train['t_surge1_input'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_train['t_surge2_input'] = df_X_train['t_surge2_input'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_train['t_surge1_output'] = df_X_train['t_surge1_output'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_train['t_surge2_output'] = df_X_train['t_surge2_output'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))

In [4]:
df_X_train = df_X_train.tail(500)
Y_train = Y_train.tail(500)

# Optional: Input EDA

In [None]:
df_X_train

Unnamed: 0_level_0,t_slp,slp,t_surge1_input,surge1_input,t_surge2_input,surge2_input,t_surge1_output,t_surge2_output
id_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4600,"[2002-03-27 12:00:00, 2002-03-27 15:00:16, 200...","[[[102326.13, 102259.13, 102183.13, 102098.13,...","[2002-03-27 00:00:00, 2002-03-27 12:59:44, 200...","[0.19478741, -0.0029626957, 0.2014908, 0.09088...","[2002-03-27 01:59:28, 2002-03-27 15:00:16, 200...","[-1.0178498, -0.5484413, -0.3640308, 0.6669915...","[2002-04-01 04:00:00, 2002-04-01 16:59:44, 200...","[2002-04-01 06:00:32, 2002-04-01 17:59:28, 200..."
4601,"[2002-03-28 12:00:00, 2002-03-28 15:00:16, 200...","[[[101650.03, 101598.03, 101572.03, 101555.03,...","[2002-03-28 00:59:44, 2002-03-28 14:00:32, 200...","[0.2014908, 0.09088481, -0.023072876, 0.134456...","[2002-03-28 03:00:16, 2002-03-28 15:00:16, 200...","[-0.3640308, 0.6669915, 0.029937062, 0.5831685...","[2002-04-02 04:59:44, 2002-04-02 16:59:44, 200...","[2002-04-02 07:00:16, 2002-04-02 19:00:16, 200..."
4602,"[2002-03-29 12:00:00, 2002-03-29 15:00:16, 200...","[[[101207.48, 101035.48, 100907.48, 100831.48,...","[2002-03-29 01:59:28, 2002-03-29 15:00:16, 200...","[-0.023072876, 0.13445687, 0.054016147, 0.1210...","[2002-03-29 04:00:00, 2002-03-29 16:00:00, 200...","[0.029937062, 0.58316857, 0.24787673, 0.138906...","[2002-04-03 04:59:44, 2002-04-03 16:59:44, 200...","[2002-04-03 07:00:16, 2002-04-03 20:00:00, 200..."
4603,"[2002-03-30 15:00:16, 2002-03-30 17:59:28, 200...","[[[102146.62, 102047.62, 101940.62, 101824.62,...","[2002-03-30 03:00:16, 2002-03-30 15:00:16, 200...","[0.054016147, 0.12105008, 0.04060936, 0.204842...","[2002-03-30 04:00:00, 2002-03-30 16:59:44, 200...","[0.24787673, 0.1389069, 0.3233174, 0.54963934,...","[2002-04-04 06:00:32, 2002-04-04 17:59:28, 200...","[2002-04-04 08:59:44, 2002-04-04 20:59:44, 200..."
4604,"[2002-03-31 15:00:16, 2002-03-31 17:59:28, 200...","[[[102549.04, 102471.04, 102387.04, 102296.04,...","[2002-03-31 04:00:00, 2002-03-31 16:00:00, 200...","[0.04060936, 0.2048425, 0.28863493, 0.2014908,...","[2002-03-31 04:59:44, 2002-03-31 16:59:44, 200...","[0.3233174, 0.54963934, 0.7172853, 0.4909633, ...","[2002-04-05 07:00:16, 2002-04-05 19:00:16, 200...","[2002-04-05 09:59:28, 2002-04-05 23:00:16, 200..."
...,...,...,...,...,...,...,...,...
5595,"[2010-10-19 08:59:44, 2010-10-19 11:58:56, 201...","[[[101470.0, 101486.0, 101501.0, 101517.0, 101...","[2010-10-18 23:00:16, 2010-10-19 10:59:12, 201...","[1.9376696, 1.6092033, 0.86177504, 1.2974956, ...","[2010-10-19 00:59:44, 2010-10-19 13:00:48, 201...","[0.021554768, 0.62508005, 0.39037576, 0.457434...","[2010-10-24 01:59:28, 2010-10-24 15:00:16, 201...","[2010-10-24 03:58:56, 2010-10-24 16:00:00, 201..."
5596,"[2010-10-20 08:59:44, 2010-10-20 12:01:04, 201...","[[[101663.87, 101650.87, 101641.87, 101635.87,...","[2010-10-20 00:00:00, 2010-10-20 12:01:04, 201...","[0.86177504, 1.2974956, 0.50984687, 0.87518185...","[2010-10-20 01:59:28, 2010-10-20 14:00:32, 201...","[0.39037576, 0.45743412, 0.28140593, 0.4322872...","[2010-10-25 01:59:28, 2010-10-25 15:00:16, 201...","[2010-10-25 04:01:04, 2010-10-25 16:00:00, 201..."
5597,"[2010-10-21 08:59:44, 2010-10-21 11:58:56, 201...","[[[101876.15, 101838.15, 101801.15, 101765.15,...","[2010-10-21 00:00:00, 2010-10-21 13:00:48, 201...","[0.50984687, 0.87518185, 1.8505255, 1.193593, ...","[2010-10-21 01:59:28, 2010-10-21 14:00:32, 201...","[0.28140593, 0.43228725, 0.15567149, 0.2478767...","[2010-10-26 02:59:12, 2010-10-26 16:00:00, 201...","[2010-10-26 05:00:48, 2010-10-26 16:59:44, 201..."
5598,"[2010-10-22 08:59:44, 2010-10-22 12:01:04, 201...","[[[102152.04, 102120.04, 102085.04, 102047.04,...","[2010-10-22 00:00:00, 2010-10-22 13:00:48, 201...","[1.8505255, 1.193593, 1.3980465, 1.7801399, 1....","[2010-10-22 02:59:12, 2010-10-22 15:00:16, 201...","[0.15567149, 0.24787673, 0.41552263, 0.5412571...","[2010-10-27 04:01:04, 2010-10-27 16:00:00, 201...","[2010-10-27 05:00:48, 2010-10-27 17:59:28, 201..."


In [None]:
Y_train

Unnamed: 0_level_0,surge1_t0,surge1_t1,surge1_t2,surge1_t3,surge1_t4,surge1_t5,surge1_t6,surge1_t7,surge1_t8,surge1_t9,surge2_t0,surge2_t1,surge2_t2,surge2_t3,surge2_t4,surge2_t5,surge2_t6,surge2_t7,surge2_t8,surge2_t9
id_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4600,0.288635,0.201491,0.214898,0.325504,-0.180603,-0.867700,-0.720226,-0.230878,-0.354891,-1.460951,0.717285,0.490963,0.985519,1.639338,1.396251,1.052577,1.102871,0.843020,1.681249,1.957865
4601,0.214898,0.325504,-0.180603,-0.867700,-0.720226,-0.230878,-0.354891,-1.460951,-0.750391,-0.351539,0.985519,1.639338,1.396251,1.052577,1.102871,0.843020,1.681249,1.957865,2.469185,2.083599
4602,-0.180603,-0.867700,-0.720226,-0.230878,-0.354891,-1.460951,-0.750391,-0.351539,-0.395111,-0.455442,1.396251,1.052577,1.102871,0.843020,1.681249,1.957865,2.469185,2.083599,1.874042,1.521986
4603,-0.720226,-0.230878,-0.354891,-1.460951,-0.750391,-0.351539,-0.395111,-0.455442,-0.137031,0.399241,1.102871,0.843020,1.681249,1.957865,2.469185,2.083599,1.874042,1.521986,1.262134,0.859784
4604,-0.354891,-1.460951,-0.750391,-0.351539,-0.395111,-0.455442,-0.137031,0.399241,0.278580,-0.462145,1.681249,1.957865,2.469185,2.083599,1.874042,1.521986,1.262134,0.859784,0.574786,0.625080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595,1.451674,1.220407,0.556771,1.709754,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,0.759197,0.884931,0.298171,-0.397560,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700
5596,0.556771,1.709754,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.298171,-0.397560,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223
5597,0.483033,0.533309,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.888589,1.545521,-0.540059,-0.498148,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223,1.446545,2.670360
5598,1.448322,1.820360,1.662831,1.528763,1.284089,0.070775,0.888589,1.545521,1.166779,0.653970,-0.498148,-0.405942,0.164054,0.331700,1.052577,1.220223,1.446545,2.670360,2.620066,2.678742


# Optional: EDA Visualizations

These were used on a brief [writeup](https://jstephens.io/projects/tideprediction/) after exporting to JavaScript or .gif format.

## Timestamp Graph



In [None]:
def timestamp_graph(df_bokeh):
  rows = [100, 101, 102]
  data_sources = []

  for i, row in enumerate(rows):
      df_row = pd.DataFrame(df_bokeh.iloc[row]).T

      columns = ['t_slp', 't_surge1_input', 't_surge2_input', 't_surge1_output', 't_surge2_output']
      dfs = {}

      for column in columns:
          df = pd.DataFrame(df_row[column])
          df = df.explode(column).reset_index(drop=True)
          dfs[column] = df

      x = [15, 8, 1][i]
      x2 = x + 2
      x3 = x + 3

      source_row = ColumnDataSource(data=dict(
          x=pd.to_datetime(dfs['t_slp']['t_slp']),
          y=[x] * len(dfs['t_slp']),
          size=[10] * len(dfs['t_slp']),
          color=[colorvar] * len(dfs['t_slp'])
      ))
      source2_row = ColumnDataSource(data=dict(
          x=pd.to_datetime(dfs['t_surge1_input']['t_surge1_input']),
          y=[x2] * len(dfs['t_surge1_input']),
          size=[10] * len(dfs['t_surge1_input']),
          color=[colorvar] * len(dfs['t_surge1_input'])
      ))
      source3_row = ColumnDataSource(data=dict(
          x=pd.to_datetime(dfs['t_surge2_input']['t_surge2_input']),
          y=[x3] * len(dfs['t_surge2_input']),
          size=[10] * len(dfs['t_surge2_input']),
          color=[colorvar] * len(dfs['t_surge2_input'])
      ))
      source2_pred_row = ColumnDataSource(data=dict(
          x=pd.to_datetime(dfs['t_surge1_output']['t_surge1_output']),
          y=[x2] * len(dfs['t_surge1_output']),
          size=[10] * len(dfs['t_surge1_output']),
          color=['lightgray'] * len(dfs['t_surge1_output'])
      ))
      source3_pred_row = ColumnDataSource(data=dict(
          x=pd.to_datetime(dfs['t_surge2_output']['t_surge2_output']),
          y=[x3] * len(dfs['t_surge2_output']),
          size=[10] * len(dfs['t_surge2_output']),
          color=['lightgray'] * len(dfs['t_surge2_output'])
      ))

      data_sources.extend([source_row, source2_row, source3_row, source2_pred_row, source3_pred_row])


  p = figure(plot_width=800, plot_height=400, x_axis_label='Timestamp', y_axis_label='', toolbar_location=None, x_axis_type='datetime')

  y_ticks = [1,2.5,3,4,8,9.5,10,11,15,16.5,17,18]
  y_labels = ['Encoded Image Recorded','Row 102                                                               ','Surge Time, City 1','Surge Time, City 2',
              'Encoded Image Recorded','Row 101                                                               ','Surge Time, City 1','Surge Time, City 2',
              'Encoded Image Recorded','Row 100                                                               ','Surge Time, City 1','Surge Time, City 2']
  p.yaxis.ticker = FixedTicker(ticks=y_ticks)
  p.yaxis.major_label_overrides = {tick: label for tick, label in zip(y_ticks, y_labels)}

  p.ygrid.grid_line_color = 'white'

  sources = [
      source_row100, source2_row100, source3_row100, source2_pred_row100, source3_pred_row100,
      source_row101, source2_row101, source3_row101, source2_pred_row101, source3_pred_row101,
      source_row102, source2_row102, source3_row102, source2_pred_row102, source3_pred_row102
  ]

  for source in sources:
      p.circle('x', 'y', source=source, size='size', alpha=0.5,color='color')

  p.x_range=Range1d(start=pd.Timestamp('1950-11-06 12:30:00'), end=pd.Timestamp('1950-11-18 12:30:00'))

  p.y_range=Range1d(start=0, end=19)

  p.xaxis.major_tick_line_color = None
  p.xaxis.minor_tick_line_color = None
  p.yaxis.major_tick_line_color = None
  p.yaxis.minor_tick_line_color = None

  hover = HoverTool(
      tooltips=[
          ("Time", "@x{%F %T}")
      ],
      formatters={"@x": "datetime"},
      mode='mouse'
  )
  p.add_tools(hover)
  output_notebook()
  show(p)

In [None]:
df_bokeh = df_X_train.copy()
timestamp_graph(df_bokeh)

NameError: ignored

The plot above represents three typical rows for the X training dataset. Each row represents 5 days but will overlap with other rows temporally. 

In [None]:
script, div = components(p)
print(div)
print(script)

## Encoded Image GIF

In [13]:
def gif(df_X_train,column):
  # Retrieve the 40 images from the row of interest
  images = df_X_train[column][5599]

  # Create an empty list to store the image file names
  image_files = []

  # Loop over each image and save it as a file
  for i in range(len(images)):
      # Create the file name with a 4-digit index
      filename = f'image_{i:04d}.png'
      
      # Save the image as a file
      plt.imsave(filename, images[i], cmap='gray')
      
      # Add the file name to the list
      image_files.append(filename)

  # Use imageio to create the gif
  filename = 'x_train_'+column+'.gif'

  with imageio.get_writer(filename, mode='I') as writer:
      # Loop over each image file and add it to the gif
      for filename in image_files:
          image = imageio.imread(filename)
          writer.append_data(image)

In [14]:
gif(df_X_train,'slp')

  image = imageio.imread(filename)


# Timestamp Transformations



In [5]:
def get_nth_day(timestamps):
    first_timestamp = timestamps[0]
    year_start = datetime.datetime(first_timestamp.year, 1, 1)
    return int((first_timestamp - year_start).days + 1)

def get_last_date(timestamps):
    last_timestamp = timestamps[-1]
    year_start = datetime.datetime(last_timestamp.year, 1, 1)
    return int((last_timestamp - year_start).days + 1)

def get_first_year(timestamps):
    return timestamps[0].year

def extract_hour(timestamps):
    return np.array([pd.Timestamp(ts, unit='s').hour for ts in timestamps])

In [6]:
df_X_train['startingdate'] = df_X_train['t_slp'].apply(lambda x: get_nth_day(x))
df_X_train['lastdate'] = df_X_train['t_slp'].apply(lambda x: get_last_date(x))
df_X_train['year'] = df_X_train['t_slp'].apply(lambda x: get_first_year(x))
df_X_train['t_slp'] = df_X_train['t_slp'].apply(lambda x: extract_hour(x))
df_X_train['t_surge1_input'] = df_X_train['t_surge1_input'].apply(lambda x: extract_hour(x))
df_X_train['t_surge2_input'] = df_X_train['t_surge2_input'].apply(lambda x: extract_hour(x))
df_X_train['t_surge1_output'] = df_X_train['t_surge1_output'].apply(lambda x: extract_hour(x))
df_X_train['t_surge2_output'] = df_X_train['t_surge2_output'].apply(lambda x: extract_hour(x))

# Gradient Calculation

In [7]:
def calculate_gradients(row):
    image_gradient = np.zeros((40, 41, 41))
    for i in range(40):
        image_gradient[i] = np.gradient(row[i], axis=1)
    return image_gradient

# Apply the calculate_gradients function to each row of the 'slp' column
df_X_train['gradient'] = df_X_train['slp'].apply(calculate_gradients)

In [None]:
gif(df_X_train,'gradient')

# Dataset Prep

## Scaling Data for RNN Use Only

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Define the scaling function
scaler = MinMaxScaler()

for i, row in df_X_train['slp'].items():
    # Scale each numpy array in the row
    scaled_array = np.array([scaler.fit_transform(arr) for arr in row])
    
    # Update the dataframe column with the scaled values
    df_X_train.at[i, 'slp'] = scaled_array

In [9]:
scaler = MinMaxScaler() 

for i, row in df_X_train['gradient'].items():
    # Scale each numpy array in the row
    scaled_array = np.array([scaler.fit_transform(arr) for arr in row])
    
    # Update the dataframe column with the scaled values
    df_X_train.at[i, 'gradient'] = scaled_array

In [10]:
df_X_train['slp'] = df_X_train['slp'].apply(lambda arr: np.concatenate(arr).ravel())
df_X_train['gradient'] = df_X_train['gradient'].apply(lambda arr: np.concatenate(arr).ravel())

In [11]:
def flatten_array(row):
    new_row = []
    for col in row:
        if isinstance(col, np.ndarray):
            new_row.extend(col.ravel())
        else:
            new_row.append(col)
    return pd.Series(new_row)

## Single Model Dataset Prep

In [12]:
dualdataset = False

X_train_flat = df_X_train[['gradient','slp','surge1_input','surge2_input']].apply(flatten_array, axis=1)

## Dual Model Dataset Prep

In [10]:
dualdataset = True

X_train1 = df_X_train.drop(['t_surge2_output','surge2_input','t_surge2_input'], axis=1).copy()
X_train1_flat = X_train1.apply(flatten_array, axis=1)
X1 = X_train1_flat.to_numpy()

X_train2 = df_X_train.drop(['t_surge1_output','surge1_input','t_surge1_input'], axis=1).copy()
X_train2_flat = X_train2.apply(flatten_array, axis=1)
X2 = X_train2_flat.to_numpy()

In [10]:
Y_train1 = Y_train.filter(regex='^surge1')
y1 = Y_train1.to_numpy()

Y_train2 = Y_train.filter(regex='^surge2')
y2 = Y_train2.to_numpy()

## X-Test Dataset Prep

All the data processing work done on the X-train dataset, repeated for X-test.

In [13]:
df_X_test['t_slp'] = df_X_test['t_slp'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_test['t_surge1_input'] = df_X_test['t_surge1_input'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_test['t_surge2_input'] = df_X_test['t_surge2_input'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_test['t_surge1_output'] = df_X_test['t_surge1_output'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))
df_X_test['t_surge2_output'] = df_X_test['t_surge2_output'].apply(lambda x: np.array([timestamp_to_datetime(t) for t in x]))

df_X_test['startingdate'] = df_X_test['t_slp'].apply(lambda x: get_nth_day(x))
df_X_test['lastdate'] = df_X_test['t_slp'].apply(lambda x: get_last_date(x))
df_X_test['year'] = df_X_test['t_slp'].apply(lambda x: get_first_year(x))
df_X_test['t_slp'] = df_X_test['t_slp'].apply(lambda x: extract_hour(x))
df_X_test['t_surge1_input'] = df_X_test['t_surge1_input'].apply(lambda x: extract_hour(x))
df_X_test['t_surge2_input'] = df_X_test['t_surge2_input'].apply(lambda x: extract_hour(x))
df_X_test['t_surge1_output'] = df_X_test['t_surge1_output'].apply(lambda x: extract_hour(x))
df_X_test['t_surge2_output'] = df_X_test['t_surge2_output'].apply(lambda x: extract_hour(x))

In [14]:
df_X_test['gradient'] = df_X_test['slp'].apply(calculate_gradients)

In [15]:
scaler = MinMaxScaler() 
for i, row in df_X_test['slp'].items():
    # Scale each numpy array in the row
    scaled_array = np.array([scaler.fit_transform(arr) for arr in row])
    
    # Update the dataframe column with the scaled values
    df_X_test.at[i, 'slp'] = scaled_array

scaler = MinMaxScaler() 
for i, row in df_X_test['gradient'].items():
    # Scale each numpy array in the row
    scaled_array = np.array([scaler.fit_transform(arr) for arr in row])
    
    # Update the dataframe column with the scaled values
    df_X_test.at[i, 'gradient'] = scaled_array

In [16]:
df_X_test['slp'] = df_X_test['slp'].apply(lambda arr: np.concatenate(arr).ravel())
df_X_test['gradient'] = df_X_test['gradient'].apply(lambda arr: np.concatenate(arr).ravel())

In [17]:
if dualdataset == True:
  X_test1 = df_X_test.drop(['t_surge2_output','surge2_input','t_surge2_input'], axis=1).copy()
  X_test1_flat = X_test1.apply(flatten_array, axis=1)
  X1_test = X_test1_flat.to_numpy()

  X_test2 = df_X_test.drop(['t_surge1_output','surge1_input','t_surge1_input'], axis=1).copy()
  X_test2_flat = X_test2.apply(flatten_array, axis=1)
  X2_test = X_test2_flat.to_numpy()


else:
  X_test_flat = df_X_test[['gradient','slp','surge1_input','surge2_input']].apply(flatten_array, axis=1)
  length = 20
  X_test_flat1 = X_test_flat.head(1)
  X_test_flat2 = X_test_flat.tail(1)

# RNN Model

In [None]:
Y_train1 = pd.DataFrame()
Y_train2 = pd.DataFrame()

for index, row in Y_train.iterrows():
    surge1_values = row.loc[row.index.str.startswith('surge1')].values
    surge2_values = row.loc[row.index.str.startswith('surge2')].values
    
    Y_train1 = Y_train1.append(pd.Series(surge1_values), ignore_index=True)
    Y_train2 = Y_train2.append(pd.Series(surge2_values), ignore_index=True)

In [28]:
pip install keras-tuner

In [63]:
from tensorflow.keras import optimizers

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_1', min_value=32, max_value=128, step=32),
                    activation='relu', input_shape=(134500,)))
    model.add(Dense(units=hp.Int('units_2', min_value=32, max_value=128, step=32),
                    activation='relu'))
    model.add(Dropout(rate=hp.Float('dropout_rate', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dense(units=hp.Int('units_3', min_value=16, max_value=64, step=16),
                    activation='relu'))
    model.add(Dense(10, activation='linear', name='Output'))

    # Optimizer options
    optimizer_type = hp.Choice('optimizer', values=['adam', 'rmsprop', 'sgd'])
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    # Set the optimizer based on the selected type
    if optimizer_type == 'adam':
        optimizer = optimizers.Adam(learning_rate=hp_learning_rate)
    elif optimizer_type == 'rmsprop':
        optimizer = optimizers.RMSprop(learning_rate=hp_learning_rate)
    elif optimizer_type == 'sgd':
        optimizer = optimizers.SGD(learning_rate=hp_learning_rate)

    stop_early = EarlyStopping(monitor='val_loss', patience=5)

    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

In [61]:
def model_testing(original_df, y_dataset):
    X_train, X_test, y_train, y_test = train_test_split(original_df, y_dataset, test_size=0.2, random_state=42)
    tuner = RandomSearch(
        build_model,
        objective='val_loss',
        max_trials=2,
        executions_per_trial=1,
        directory='rnn_randomsearch',
        project_name='tideprediction'
    )
    tuner.search(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=16)
    results_summary = tuner.results_summary()
    print(results_summary)
    best_model = tuner.get_best_models(num_models=1)[0]
    return best_model

In [62]:
def train_and_predict(model, X_train, Y_train, X_test):
    model.fit(X_train, Y_train, epochs=20, batch_size=16)
    predictions = model.predict(X_test)
    predictions_df = pd.DataFrame(predictions)
    predictions_df.index = X_train.index
    predictions_df.columns = Y_train.columns
    return predictions_df

from keras_visualizer import visualizer

# Splitting the data and training/testing for location 1
location1_X_train, location1_X_test, location1_Y_train, location1_Y_test = train_test_split(X_train_flat, Y_train1, test_size=0.2, random_state=42)
location1_model = model_testing(location1_X_train, location1_Y_train)
visualizer(location1_model, file_format='png', view=True)

# Splitting the data and training/testing for location 2
location2_X_train, location2_X_test, location2_Y_train, location2_Y_test = train_test_split(X_train_flat, Y_train2, test_size=0.2, random_state=42)
location2_model = model_testing(location2_X_train, location2_Y_train)
plot_model(location2_model, to_file='location2_model.png',show_layer_activations=True,show_layer_names=True,show_shapes=True)


Results summary
Results in rnn_randomsearch/tideprediction
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 01 summary
Hyperparameters:
units_1: 128
units_2: 64
dropout_rate: 0.2
units_3: 48
learning_rate: 0.001
Score: 0.593422532081604

Trial 05 summary
Hyperparameters:
units_1: 128
units_2: 64
dropout_rate: 0.30000000000000004
units_3: 32
learning_rate: 0.001
Score: 0.6784546375274658

Trial 02 summary
Hyperparameters:
units_1: 128
units_2: 64
dropout_rate: 0.4
units_3: 16
learning_rate: 0.001
Score: 0.6883165836334229

Trial 07 summary
Hyperparameters:
units_1: 32
units_2: 128
dropout_rate: 0.30000000000000004
units_3: 48
learning_rate: 0.001
Score: 0.7232275605201721

Trial 09 summary
Hyperparameters:
units_1: 96
units_2: 96
dropout_rate: 0.4
units_3: 64
learning_rate: 0.001
Score: 0.7289102673530579

Trial 00 summary
Hyperparameters:
units_1: 32
units_2: 64
dropout_rate: 0.4
units_3: 32
learning_rate: 0.001
Score: 0.7333300709724426

Trial 03 summary
Hyper

NameError: ignored

In [None]:
def file_processing(combined_df):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    preds_file_path = f'output_{timestamp}.csv'
    combined_df.to_csv(preds_file_path)

# Combine the predictions into a single DataFrame
combined_df = pd.concat([location1_predictions, location2_predictions], axis=1)

# Rearrange the columns
combined_df = combined_df.reindex(columns=sorted(combined_df.columns))

# Save the combined predictions to a file
file_processing(combined_df)

# Model - RegressorChain

In [None]:
from sklearn.multioutput import RegressorChain

regressor1 = xgb.XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0)
chained_regressor1 = RegressorChain(regressor1,random_state=42).fit(X1, y1)

regressor2 = xgb.XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0)
chained_regressor2 = RegressorChain(regressor2,random_state=42).fit(X2, y2)

In [18]:
import joblib

# Define the filename for saving the model
filename = 'chained_regressor2_model.joblib'

# Save the model to the file
joblib.dump(chained_regressor2, filename)

['chained_regressor2_model.joblib']

In [None]:
predictions1 = chained_regressor1.predict(X1_test)

predictions2 = chained_regressor2.predict(X2_test)

In [None]:
predictions1_df = pd.DataFrame(predictions1)
predictions1_df.index = X_train1_flat.index
predictions1_df.columns = Y_train1.columns

predictions2_df = pd.DataFrame(predictions2)
predictions2_df.index = X_train2_flat.index
predictions2_df.columns = Y_train2.columns

In [None]:
columns1 = predictions1_df.columns
columns2 = predictions2_df.columns

# Combine the DataFrames with the desired column order
combined_df = pd.DataFrame()

for col1, col2 in zip(columns1, columns2):
    combined_df[col1] = predictions1_df[col1]
    combined_df[col2] = predictions2_df[col2]

In [26]:
def fileprocessing(combined_df):
  timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  preds_file_path = f'output_{timestamp}.csv'
  combined_df.to_csv(preds_file_path)

In [None]:
fileprocessing(combined_df)

# Model - MultiOutputRegressor


In [None]:
regressor = xgb.XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0)
multioutput_regressor = MultiOutputRegressor(regressor).fit(X, y)

In [None]:
# Define the parameter grid for the grid search
param_grid = {
    'estimator__learning_rate': [0.1, 0.2],
    'estimator__max_depth': [3, 5],
    'estimator__n_estimators': [100, 200]
}

# Create the XGBoost regressor with GPU settings
regressor = xgb.XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0)

# Create the MultiOutputRegressor with XGBoost regressor
multioutput_regressor = MultiOutputRegressor(regressor)

# Create the GridSearchCV object
grid_search = GridSearchCV(multioutput_regressor, param_grid, cv=3)

# Perform the grid search
grid_search.fit(X, y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [None]:
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

model_file_path = f'model_{timestamp}.pkl'

joblib.dump(multioutput_regressor, model_file_path)

In [None]:
predictions = multioutput_regressor.predict(X_test)

In [None]:
xgb_regressor = multioutput_regressor.regressor_

# Get model information
model_info = xgb_regressor.get_booster().get_dump()

# Print the model information
for tree_idx, tree_info in enumerate(model_info):
    print(f"Tree {tree_idx}:\n{tree_info}")

## Y Testing Processing and File Save

In [None]:
def fileprocessing(preds):
  timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
  preds_file_path = f'output_{timestamp}.csv'
  output_df = pd.DataFrame(preds)
  output_df.index = df_test_flat.index
  output_df.columns = Y_train.columns
  output_df.to_csv(preds_file_path)

In [None]:
fileprocessing(predictions)

# Performance Notes

Colab Pro is invaluable on a dataset of this size.


When using this implementation:

```
regressor = xgb.XGBRegressor(objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0)
multioutput_regressor = MultiOutputRegressor(regressor).fit(X, y)
```

**230 seconds** | When using 'gpu_hist' and 1000 row subset of data

**675 seconds** | When using 'gpu_hist' and entire data set


**Stopped at 7,402 seconds** | When using standard runtime and 1000 row subset  of data

**Session crashes** | When using standard runtime and entire data set


Parallel trees not used in this example but could potentially quicken the model training



