In [1]:
!pip uninstall -y hopsworks
    !pip install -U 'git+https://github.com/logicalclocks/hopsworks-api@main#egg=hopsworks&subdirectory=python' --quiet

Found existing installation: hopsworks 2.6.0.dev1
Uninstalling hopsworks-2.6.0.dev1:
  Successfully uninstalled hopsworks-2.6.0.dev1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.1.5 requires pyqt5<5.13, which is not installed.
spyder 5.1.5 requires pyqtwebengine<5.13, which is not installed.
spyder 5.1.5 requires pylint<2.10.0,>=2.5.0, but you have pylint 2.13.4 which is incompatible.[0m


In [2]:
import os
import urllib.request  
import re
from itertools import chain
import pandas as pd
import numpy as np
import hopsworks
from datetime import timedelta

In [None]:
# Uncomment this cell and fill in details if you are running your own Hopsworks cluster
#key=""
#with open("api-key.txt", "r") as f:
#    key = f.read().rstrip()
#os.environ['HOPSWORKS_PROJECT']="cjsurf"
#os.environ['HOPSWORKS_HOST']="35.187.178.84"
#os.environ['HOPSWORKS_API_KEY']=key    

In [3]:
# Backfill mode is enabled only when HOPSWORKS_BACKFILL is exactly the string "True".
BACKFILL = os.environ.get('HOPSWORKS_BACKFILL') == "True"

# Exclusive upper bound of the step loops that build and fill the suffixed columns.
hours = 119
# Version passed to the feature group.
version = 10

# Bulletin parsed when not backfilling.
url = "https://polar.ncep.noaa.gov/waves/WEB/gfswave.latest_run/plots/gfswave.62081.bull"
# CSV loaded when backfilling.
backfill_url = "https://repo.hops.works/master/hopsworks-tutorials/data/cjsurf/swells-clean.csv"

In [4]:
# Column names for every step after the first: the suffix is twice the step
# number (2, 4, 6, ...), with four columns per step in a fixed order.
secondary_columns = [
    feature + str(step * 2)
    for step in range(1, hours)
    for feature in ("height", "period", "direction", "hits_at")
]

secondary_columns

['height2',
 'period2',
 'direction2',
 'hits_at2',
 'height4',
 'period4',
 'direction4',
 'hits_at4',
 'height6',
 'period6',
 'direction6',
 'hits_at6',
 'height8',
 'period8',
 'direction8',
 'hits_at8',
 'height10',
 'period10',
 'direction10',
 'hits_at10',
 'height12',
 'period12',
 'direction12',
 'hits_at12',
 'height14',
 'period14',
 'direction14',
 'hits_at14',
 'height16',
 'period16',
 'direction16',
 'hits_at16',
 'height18',
 'period18',
 'direction18',
 'hits_at18',
 'height20',
 'period20',
 'direction20',
 'hits_at20',
 'height22',
 'period22',
 'direction22',
 'hits_at22',
 'height24',
 'period24',
 'direction24',
 'hits_at24',
 'height26',
 'period26',
 'direction26',
 'hits_at26',
 'height28',
 'period28',
 'direction28',
 'hits_at28',
 'height30',
 'period30',
 'direction30',
 'hits_at30',
 'height32',
 'period32',
 'direction32',
 'hits_at32',
 'height34',
 'period34',
 'direction34',
 'hits_at34',
 'height36',
 'period36',
 'direction36',
 'hits_at36',
 'height

In [5]:
def process_url(buoy_url):
    """Fetch the bulletin at ``buoy_url`` and extract its forecast rows.

    The line containing "Cycle" supplies two numeric fields (kept as a tuple of
    strings). Every other line is scanned for a two-integer field and for
    three-number wave fields; at most the first three wave fields are kept.

    Args:
        buoy_url: URL handed to ``urllib.request.urlopen``.

    Returns:
        A tuple ``(out, thedate)``. ``out`` is a list of flat tuples of strings:
        the two Cycle fields, the two integers of the row, then the wave
        values. ``thedate`` is the tuple captured from the Cycle line.

    Raises:
        ValueError: if no Cycle header could be parsed, or a data row appears
            before it (previously this surfaced as an UnboundLocalError).
    """
    out = []
    thedate = None
    # The context manager closes the response even if parsing fails midway.
    with urllib.request.urlopen(buoy_url) as response:
        for line in response:
            l = line.decode('utf-8') #utf-8 or iso8859-1 or whatever the page encoding scheme is
            if "Cycle" in l:
                regex = re.findall(r'Cycle.*:\s+([0-9]+)\s+([0-9]+)\s+UTC.*', l)
                if len(regex):
                    thedate = regex[0]
                continue

            res = re.match(r'.*[|]\s+([0-9]+)\s+([0-9]+)\s+[|].*', l)
            waves = re.findall(r'[|]\s+([0-9\.]+)\s+([0-9\.]+)\s+([0-9]+)\s+[|]', l)
            row = []
            if res is not None:
                if thedate is None:
                    raise ValueError("forecast row found before a parsable 'Cycle' header in " + str(buoy_url))
                row.append(thedate)
                row.append(res.groups())
            # A row is emitted only when at least one wave field was found.
            if len(waves):
                # Keep at most three wave fields, flattened into one list.
                row.append([value for wave in waves[:3] for value in wave])
                out.append(tuple(chain.from_iterable(row)))

    if thedate is None:
        raise ValueError("no parsable 'Cycle' header found in " + str(buoy_url))
    return out, thedate

In [6]:
# Columns of the frame built from process_url's rows: four date/time fields
# followed by (height, period, direction) for swells 1 to 3.
primary_columns = ['pred_dtime', 'hour', 'pred_day', 'pred_hour'] + [
    measure + str(swell)
    for swell in (1, 2, 3)
    for measure in ('height', 'period', 'direction')
]

def is_valid_swell_direction(direction):
    """Return True when ``direction``, converted with int(), lies in 20..180 inclusive."""
    value = int(direction)
    return 20 <= value <= 180

def best_height(row):
    """Choose which of the three swells in ``row`` to report.

    Swell 1 is used unless its direction is invalid and the chosen secondary
    swell has a valid direction. The secondary swell is swell 2, or swell 3
    when swell 3 is present and valid while swell 2 is not.

    Args:
        row: mapping/Series with ``height<N>``, ``period<N>`` and
            ``direction<N>`` entries for N in 1..3; missing swells may be
            None or NaN.

    Returns:
        pd.Series of ``[height, period, direction]`` for the chosen swell.
    """
    # Check which is best secondary swell - swell 2 or swell 3?
    # pd.notna treats both None and NaN as missing, so NaN never reaches int().
    best_secondary = 2
    if pd.notna(row['direction3']) and is_valid_swell_direction(row['direction3']):
        if not is_valid_swell_direction(row['direction2']):
            best_secondary = 3
    best_direction = "direction" + str(best_secondary)

    # Check which is best of swell 1 and secondary swell ?
    best = 1
    if pd.notna(row[best_direction]) and is_valid_swell_direction(row[best_direction]):
        if not is_valid_swell_direction(row['direction1']):
            best = best_secondary

    suffix = str(best)
    return pd.Series([row['height' + suffix], row['period' + suffix], row['direction' + suffix]])

# feature engineering - estimate the time at which the swell arrives at Lahinch from buoy
def estimate_hits_at(row):
    """Estimate the arrival time for the swell described by ``row``.

    The baseline is ``pred_dtime + hour_offset + 8 hours``. From that, subtract
    1 hour for a direction strictly between 66 and 80, 2 hours for a direction
    in (50, 66], or 3 hours for a direction in (20, 50]; subtract one more hour
    when the period is greater than 12.

    Returns:
        A one-element pd.Series holding the estimated time.
    """
    baseline = row['pred_dtime'] + row['hour_offset'] + timedelta(hours=8)
    direction = float(row['direction'])

    # The three direction bands do not overlap, so at most one applies.
    if 66 < direction < 80:
        hours_earlier = 1
    elif 50 < direction <= 66:
        hours_earlier = 2
    elif 20 < direction <= 50:
        hours_earlier = 3
    else:
        hours_earlier = 0

    if float(row['period']) > 12:
        hours_earlier += 1

    return pd.Series([baseline - timedelta(hours=hours_earlier)])
    

if BACKFILL:
    # Backfill: read the CSV and parse both timestamp columns.
    df = pd.read_csv(backfill_url, parse_dates=['hits_at', 'pred_dtime'])
    num_rows = df.shape[0]
    print("num_rows: " + str(num_rows))
    # One placeholder row of empty strings for every even i in 1..num_rows-1;
    # the outer concat below leaves the remaining rows unfilled.
    rows = [[""] * len(secondary_columns) for i in range(1, num_rows) if i % 2 == 0]
    df_secondary = pd.DataFrame(rows, columns=secondary_columns)
    df = pd.concat([df, df_secondary], axis=1, join="outer")

else: # BACKFILL == False
    res, thedate = process_url(url)
    df = pd.DataFrame(res, columns=primary_columns)
    df['pred_dtime'] = pd.to_datetime(df['pred_dtime'], format='%Y%m%d')
    # Row position times two, interpreted as hours.
    df.insert(loc=0, column="hour_offset", value=(df.reset_index().index*2))
    # pd.to_timedelta with an explicit unit replaces astype('timedelta64[h]'),
    # which depends on casting to an hour-resolution dtype.
    df['hour_offset'] = pd.to_timedelta(df['hour_offset'], unit='h')
    # 'hour' holds strings, so convert to numbers before applying the unit.
    df['pred_dtime'] = df['pred_dtime'] + pd.to_timedelta(pd.to_numeric(df['hour']), unit='h')


In [7]:
if not BACKFILL:
    # Reduce the three swells of each row to one (height, period, direction).
    df[['height','period','direction']] = df.apply(best_height, axis=1)
    # Estimated arrival time per row.
    df[['hits_at']] = df.apply(estimate_hits_at, axis=1)
    df['beach_id'] = 1

    # The per-swell inputs and helper columns are no longer needed.
    consumed_columns = ['hour_offset', 'hour', 'pred_day', 'pred_hour',
                        'height1', 'period1', 'direction1',
                        'height2', 'period2', 'direction2',
                        'height3', 'period3', 'direction3']
    df = df.drop(columns=consumed_columns)

    # Unparsable values become NaN rather than raising.
    for measure in ('height', 'period', 'direction'):
        df[measure] = pd.to_numeric(df[measure], errors='coerce')


In [8]:
matches = ["height", "period", "direction", "hits_at"]

if not BACKFILL:
    # Flatten the forecast rows into one wide record: beach_id and pred_dtime
    # from the row labelled 0, then the four `matches` values of every row
    # whose label is below `hours`.
    data = []
    for index, row in df.iterrows():
        if index == 0:
            data.extend([row['beach_id'], row['pred_dtime']])
        if index < hours:
            data.extend(row[m] for m in matches)

    entry = [data]
    first_columns = ['beach_id', 'pred_dtime'] + matches
    all_columns = first_columns + secondary_columns
    df2 = pd.DataFrame(entry, columns=all_columns)
else:
    df2 = df

# Every suffixed column, hits_at<N> included, goes through pd.to_numeric.
for step in range(1, hours):
    suffix = str(step * 2)
    for feature in matches:
        df2[feature + suffix] = pd.to_numeric(df2[feature + suffix])
df2

Unnamed: 0,beach_id,pred_dtime,height,period,direction,hits_at,height2,period2,direction2,hits_at2,...,direction232,hits_at232,height234,period234,direction234,hits_at234,height236,period236,direction236,hits_at236
0,1,2022-07-04 06:00:00,0.97,8.4,145,2022-07-04 14:00:00,0.95,8.3,144,1656950400000000000,...,97,1657778400000000000,0.88,9.1,100,1657785600000000000,0.86,9.1,100,1657792800000000000


In [9]:
# Log in to Hopsworks and get a handle to the project's feature store.
# NOTE(review): presumably uses the HOPSWORKS_* environment variables when they are set — confirm.
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/135
Connected. Call `.close()` to terminate connection gracefully.


In [10]:
# Fetch the feature group (creating it if needed), keyed by beach_id with
# hits_at as the event time, then insert the wide frame.
stats_config = {"enabled": True, "histograms": True, "correlations": True}
swells_fg = fs.get_or_create_feature_group(
    name="swells_exploded",
    version=version,
    primary_key=["beach_id"],
    event_time="hits_at",
    description="Buoy surf height predictions",
    online_enabled=True,
    statistics_config=stats_config,
)
swells_fg.insert(df2)
    

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/135/jobs/named/swells_exploded_10_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x7fe257e29ca0>, None)