In [2]:
import os
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import holoviews as hv

import hvplot
import hvplot.pandas

In [3]:
hv.extension('bokeh')

In [18]:
fpath = Path("ts_datasets/storms_erikson.csv")

df = pd.read_csv(fpath, parse_dates=['time'])

df.head()

Unnamed: 0.1,Unnamed: 0,start,end,Hso(m),Hs(m),Dp(deg),Tp(s),SS(m),Storm_duration(days),Hindcast_or_projection,time
0,0,1981-08-29,1981-08-31 10:00:00,2.3,2.1,235,7.4,0.26,2.4,hindcast,1981-08-29 00:00:00
1,0,1981-08-29,1981-08-31 10:00:00,2.3,2.1,235,7.4,0.26,2.4,hindcast,1981-08-29 01:00:00
2,0,1981-08-29,1981-08-31 10:00:00,2.3,2.1,235,7.4,0.26,2.4,hindcast,1981-08-29 02:00:00
3,0,1981-08-29,1981-08-31 10:00:00,2.3,2.1,235,7.4,0.26,2.4,hindcast,1981-08-29 03:00:00
4,0,1981-08-29,1981-08-31 10:00:00,2.3,2.1,235,7.4,0.26,2.4,hindcast,1981-08-29 04:00:00


In [21]:
t_start = pd.to_datetime("2000-01-01")
t_end = pd.to_datetime("2002-01-01")

df_masked = df[df.time >= t_start][df.time < t_end]

  df_masked = df[df.time >= t_start][df.time < t_end]


In [22]:
H_plot = df.hvplot.scatter(x='time', y='Hs(m)')
S_plot = df.hvplot.scatter(x='time', y='SS(m)')

(H_plot * S_plot).opts(width=2000, height=500)

Let's numerically check correlations

In [27]:

corr_surge_waveheight = np.corrcoef(df['Hs(m)'].values, df['SS(m)'].values)
corr_surge_waveperiod = np.corrcoef(df['Tp(s)'].values, df['SS(m)'].values)
corr_surge_waveangle = np.corrcoef(df['Dp(deg)'].values, df['SS(m)'].values)

print('Wave height: \n', corr_surge_waveheight)
print('Wave period: \n', corr_surge_waveperiod)
print('Wave angle: \n', corr_surge_waveangle)


Wave height: 
 [[ 1.         -0.00160652]
 [-0.00160652  1.        ]]
Wave period: 
 [[ 1.        -0.2039953]
 [-0.2039953  1.       ]]
Wave angle: 
 [[1.         0.43173249]
 [0.43173249 1.        ]]


Perhaps we can construct a linear combination from wave period and wave angle?

In [55]:
# assume surge = a * wave period + b * wave angle + c

# Apply BLUE (Best Linear Unbiased Estimator)
# A @ X = E(Y)

A = np.column_stack((df['Tp(s)'].values, df['Dp(deg)'].values, np.ones(len(df))))

print(A.shape)
print(A)

Y = df['SS(m)'].values

SigmaY = np.cov(Y)

print(SigmaY)

X_hat = np.linalg.inv(A.T * SigmaY**-1 @ A) @ A.T * SigmaY**-1 @ Y

print(X_hat)

Y_hat = A @ X_hat

Y_hat[np.nonzero(Y_hat < 0)] = 0

(43234, 3)
[[  7.4 235.    1. ]
 [  7.4 235.    1. ]
 [  7.4 235.    1. ]
 ...
 [  8.3 255.    1. ]
 [  8.3 255.    1. ]
 [  8.3 255.    1. ]]
0.13153687343538695
[-0.03542346  0.0043698  -0.28750601]


In [56]:
Y_curve = df.hvplot.scatter(x='time', y='SS(m)', label='Y')
Y_hat_curve = hv.Scatter(zip(df.time.values, Y_hat), label='Y_hat')

Y_curve * Y_hat_curve