In [None]:
!pip uninstall -y hopsworks
!pip install -U 'git+https://github.com/logicalclocks/hopsworks-api@main#egg=hopsworks&subdirectory=python' --quiet

In [9]:
import os
import urllib.request  
import re
from itertools import chain
import pandas as pd
import numpy as np
import hopsworks

In [None]:
# Optional: connection details for a self-hosted Hopsworks cluster.
# The API key is read from a local `api-key.txt` file. When that file is not
# present the cell is a no-op, so a fresh "Restart & Run All" does not fail
# for readers who are not running their own cluster.
key = ""
key_path = "api-key.txt"
if os.path.exists(key_path):
    with open(key_path, "r") as f:
        # Drop the trailing newline most editors append to the key file.
        key = f.read().rstrip()
    os.environ['HOPSWORKS_PROJECT'] = "cjsurf"
    os.environ['HOPSWORKS_HOST'] = "35.187.178.84"
    os.environ['HOPSWORKS_API_KEY'] = key
else:
    print(key_path + " not found - leaving HOPSWORKS_* environment variables untouched")

In [10]:
# Backfill mode is switched on only when HOPSWORKS_BACKFILL is exactly "true".
BACKFILL = os.environ.get('HOPSWORKS_BACKFILL') == "true"

# `hours` bounds both the generated secondary column suffixes (1 .. hours-1)
# and the number of forecast rows folded into the wide row further down.
hours = 119
# Version passed to the feature group at the end of the notebook.
version = 5
# Live bulletin parsed by process_url() when not backfilling.
url = "https://polar.ncep.noaa.gov/waves/WEB/gfswave.latest_run/plots/gfswave.62081.bull"
# Historical CSV loaded with pandas when backfilling.
backfill_url = "https://repo.hops.works/master/hopsworks-tutorials/data/cjsurf/swells-clean.csv"

In [11]:
# Column names for every forecast step after the first: for each step
# 1 .. hours-1 there is one height/period/direction/hits_at group, in that order.
secondary_columns = [
    prefix + str(step)
    for step in range(1, hours)
    for prefix in ("height", "period", "direction", "hits_at")
]

secondary_columns

['height1',
 'period1',
 'direction1',
 'hits_at1',
 'height2',
 'period2',
 'direction2',
 'hits_at2',
 'height3',
 'period3',
 'direction3',
 'hits_at3',
 'height4',
 'period4',
 'direction4',
 'hits_at4',
 'height5',
 'period5',
 'direction5',
 'hits_at5',
 'height6',
 'period6',
 'direction6',
 'hits_at6',
 'height7',
 'period7',
 'direction7',
 'hits_at7',
 'height8',
 'period8',
 'direction8',
 'hits_at8',
 'height9',
 'period9',
 'direction9',
 'hits_at9',
 'height10',
 'period10',
 'direction10',
 'hits_at10',
 'height11',
 'period11',
 'direction11',
 'hits_at11',
 'height12',
 'period12',
 'direction12',
 'hits_at12',
 'height13',
 'period13',
 'direction13',
 'hits_at13',
 'height14',
 'period14',
 'direction14',
 'hits_at14',
 'height15',
 'period15',
 'direction15',
 'hits_at15',
 'height16',
 'period16',
 'direction16',
 'hits_at16',
 'height17',
 'period17',
 'direction17',
 'hits_at17',
 'height18',
 'period18',
 'direction18',
 'hits_at18',
 'height19',
 'period19',
 '

In [12]:
def process_url(buoy_url):
    """Download a wave bulletin and extract one tuple per forecast line.

    Lines containing "Cycle" are scanned for the run date and hour; every
    other line is scanned for a ``| <int> <int> |`` time cell and for
    ``| <num> <num> <int> |`` swell cells.

    Parameters
    ----------
    buoy_url : str
        URL of the text bulletin (anything ``urllib.request.urlopen`` accepts).

    Returns
    -------
    (list[tuple[str, ...]], tuple[str, str])
        The extracted rows and the ``(date, hour)`` pair from the Cycle header.
        A row is the cycle pair, the two integers of the time cell and the
        values of at most three swell cells, flattened into one tuple of
        strings. Lines without any swell cell produce no row.

    Raises
    ------
    ValueError
        If no Cycle header was parsed before a forecast line, or at all.
    """
    out = []
    thedate = None
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(buoy_url) as response:
        for raw_line in response:
            text = raw_line.decode('utf-8')  # utf-8 or iso8859-1 or whatever the page encoding scheme is
            if "Cycle" in text:
                header = re.findall(r'Cycle.*:\s+([0-9]+)\s+([0-9]+)\s+UTC.*', text)
                if header:
                    thedate = header[0]
                continue

            res = re.match(r'.*[|]\s+([0-9]+)\s+([0-9]+)\s+[|].*', text)
            # NOTE(review): this pattern consumes both delimiting pipes, so two
            # swell cells separated by a single shared pipe cannot both match -
            # confirm against the bulletin layout that this is intended.
            waves = re.findall(r'[|]\s+([0-9\.]+)\s+([0-9\.]+)\s+([0-9]+)\s+[|]', text)

            row = []
            if res is not None:
                if thedate is None:
                    raise ValueError("forecast line found before a 'Cycle' header in " + str(buoy_url))
                row.append(thedate)
                row.append(res.groups())
            if waves:
                # Keep at most three swell cells and flatten them into one sequence.
                row.append(tuple(chain.from_iterable(waves[:3])))
                out.append(tuple(chain.from_iterable(row)))

    if thedate is None:
        raise ValueError("no 'Cycle' header found in " + str(buoy_url))
    return out, thedate

In [13]:
# Column layout of the tuples returned by process_url(): the Cycle pair,
# the per-line time cell, then up to three (height, period, direction) swells.
primary_columns = [
    'pred_dtime', 'hour', 'pred_day', 'pred_hour',
    'height1', 'period1', 'direction1',
    'height2', 'period2', 'direction2',
    'height3', 'period3', 'direction3',
]

if BACKFILL:
    df = pd.read_csv(backfill_url, parse_dates=['hits_at', 'pred_dtime'])
    num_rows = df.shape[0]
    print("num_rows: " + str(num_rows))
    # Blank placeholder for every secondary column of every historical row.
    # Building it on df's own index covers all rows (a hand-rolled
    # range(1, num_rows) loop is one row short and leaves NaN in the last row).
    df_secondary = pd.DataFrame("", index=df.index, columns=secondary_columns)
    df = pd.concat([df, df_secondary], axis=1, join="outer")

else:  # live run: parse the latest bulletin
    res, thedate = process_url(url)
    df = pd.DataFrame(res, columns=primary_columns)
    df['pred_dtime'] = pd.to_datetime(df['pred_dtime'], format='%Y%m%d')
    # Row position doubles as the forecast offset in hours.
    df.insert(loc=0, column="hour_offset", value=df.reset_index().index)
    # pd.to_timedelta(unit='h') replaces astype('timedelta64[h]'), which is not
    # a supported resolution in current pandas.
    df['hour_offset'] = pd.to_timedelta(df['hour_offset'], unit='h')
    # NOTE(review): the fixed 6-hour shift is applied to the run *date*, before
    # the cycle hour is added to pred_dtime below - confirm this is intended.
    df['hits_at'] = df['pred_dtime'] + df['hour_offset'] + pd.Timedelta(hours=6)
    # 'hour' holds the cycle hour as a string; convert it to a number of hours
    # explicitly before adding it to the run date.
    df['pred_dtime'] = df['pred_dtime'] + pd.to_timedelta(pd.to_numeric(df['hour']), unit='h')


In [14]:
def is_valid_swell_direction(direction):
    """Return True when ``direction``, converted to int, lies in 20..180 inclusive.

    ``direction`` may be anything ``int()`` accepts (e.g. a digit string).
    """
    degrees = int(direction)
    return 20 <= degrees <= 180

def best_height(row):
    """Pick one swell (height, period, direction) out of the up-to-three in ``row``.

    Selection rules:
    * The secondary candidate is swell 2, unless swell 3 has a valid direction
      and swell 2 does not, in which case it is swell 3.
    * Swell 1 is returned unless its direction is invalid and the secondary
      candidate's direction is valid, in which case the secondary is returned.

    Missing swells (None or NaN in the direction column) are never considered
    valid, so rows with fewer than three swells are handled without errors.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'height{n}', 'period{n}' and 'direction{n}' for n in 1..3.

    Returns
    -------
    pandas.Series
        ``[height, period, direction]`` of the chosen swell.
    """
    def _usable(direction):
        # pd.notna covers both None and NaN padding; only then is int() safe.
        return pd.notna(direction) and is_valid_swell_direction(direction)

    # Check which is best secondary swell - swell 2 or swell 3?
    best_secondary = 2
    if _usable(row['direction3']) and not _usable(row['direction2']):
        best_secondary = 3

    # Check which is best of swell 1 and secondary swell?
    best = 1
    if _usable(row["direction" + str(best_secondary)]) and not is_valid_swell_direction(row['direction1']):
        best = best_secondary

    suffix = str(best)
    return pd.Series([row['height' + suffix], row['period' + suffix], row['direction' + suffix]])

if not BACKFILL:
    # Collapse the three candidate swells of each row into a single one.
    df[['height', 'period', 'direction']] = df.apply(best_height, axis=1)
    df['beach_id'] = 1

    # The per-swell columns and the parsing helpers are no longer needed.
    per_swell_columns = [
        name + str(n)
        for n in (1, 2, 3)
        for name in ('height', 'period', 'direction')
    ]
    helper_columns = ['hour_offset', 'hour', 'pred_day', 'pred_hour']
    df.drop(columns=per_swell_columns + helper_columns, inplace=True)

    # The parsed values are strings; unparseable entries become NaN.
    for column in ('height', 'period', 'direction'):
        df[column] = pd.to_numeric(df[column], errors='coerce')


In [15]:
matches = ["height", "period", "direction", "hits_at"]

if not BACKFILL:
    # Fold the first `hours` forecast rows into one wide record:
    # beach_id and pred_dtime from row 0, then the four `matches` values of
    # every row in order.
    data = []
    for index, row in df.iterrows():
        if index == 0:
            data.extend([row['beach_id'], row['pred_dtime']])
        if index < hours:
            data.extend(row[m] for m in matches)

    first_columns = ['beach_id', 'pred_dtime', 'height', 'period', 'direction', 'hits_at']
    all_columns = first_columns + secondary_columns
    entry = [data]
    df2 = pd.DataFrame(entry, columns=all_columns)
else:
    df2 = df

# Make every suffixed column numeric. For the suffixed hits_at columns this
# turns timestamps into integers (see the displayed frame); the unsuffixed
# hits_at column is left as a datetime.
for column in (name + str(step) for step in range(1, hours) for name in matches):
    df2[column] = pd.to_numeric(df2[column])
df2

Unnamed: 0,beach_id,pred_dtime,height,period,direction,hits_at,height1,period1,direction1,hits_at1,...,direction116,hits_at116,height117,period117,direction117,hits_at117,height118,period118,direction118,hits_at118
0,1,2022-06-23,1.23,8.2,132,2022-06-23 06:00:00,1.24,8.2,129,1655967600000000000,...,66,1656381600000000000,2.54,8.6,68,1656385200000000000,2.53,8.7,70,1656388800000000000


In [16]:
# Log in to Hopsworks and keep a handle to the project.
# NOTE(review): presumably this picks up the HOPSWORKS_* environment variables
# set near the top of the notebook - confirm against the hopsworks client docs.
project = hopsworks.login()
# Feature store handle used below to fetch or create the feature group.
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://35.187.178.84:443/p/119
Connected. Call `.close()` to terminate connection gracefully.


In [17]:
# Fetch (or create on first run) the feature group that stores the wide
# forecast rows, keyed by beach_id with hits_at as event time, then write
# the frame built above into it.
stats_settings = {"enabled": True, "histograms": True, "correlations": True}
swells_fg = fs.get_or_create_feature_group(
    name="swells_exploded",
    version=version,
    description="Buoy surf height predictions",
    primary_key=["beach_id"],
    event_time="hits_at",
    online_enabled=True,
    statistics_config=stats_settings,
)
swells_fg.insert(df2)
    

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://35.187.178.84/p/119/jobs/named/swells_exploded_5_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x7f31d44cab50>, None)