In [1]:
import pandas as pd
import numpy as np

# Buoy Data Cleanup

In [31]:
# Column layout of the headerless buoy files. The 2000-2004 files have no
# minute column; the 2005+ files carry an extra "mm" column right after "hh".
COLS_NO_MINUTE = ["YY","MM","DD","hh","WDIR","WSPD","GST","WVHT","DPD","APD","MWD","PRES","ATMP","WTMP","DEWP","VIS","TIDE"]
COLS_WITH_MINUTE = COLS_NO_MINUTE[:4] + ["mm"] + COLS_NO_MINUTE[4:]

# Measurements kept for the daily summary; everything else is discarded.
MEASUREMENT_COLS = ['WDIR', 'WSPD', 'GST', 'PRES', 'ATMP', 'WTMP', 'DEWP']


def _load_buoy_years(years, columns):
    """Read one buoy CSV per year and stack them into a single DataFrame.

    Parameters
    ----------
    years : iterable of int
        Years to load; each maps to ``../data/sgnw3h{year}.csv``.
    columns : list of str
        Column names to assign, since the files are read with ``header=None``.

    Returns
    -------
    pandas.DataFrame
        The yearly frames concatenated in the order given (row index is not
        reset, so it repeats across years).

    Raises
    ------
    FileNotFoundError
        If a yearly file is missing.
    ValueError
        If a file's column count does not match ``len(columns)``.
    """
    frames = []
    for year in years:
        # Lines starting with '#' are skipped as comments.
        frame = pd.read_csv(f'../data/sgnw3h{year}.csv', comment='#', header=None)
        frame.columns = columns
        frames.append(frame)
    return pd.concat(frames)


first_df = _load_buoy_years(range(2000, 2005), COLS_NO_MINUTE)
# Give the early files the same set of columns as the later ones.
first_df['mm'] = 0

second_df = _load_buoy_years(range(2005, 2025), COLS_WITH_MINUTE)

df = pd.concat([first_df, second_df]).rename(columns={
    'YY': 'year',
    'MM': 'month',
    'DD': 'day',
    'hh': 'hour'
})

# to_datetime assembles a date from columns named year/month/day; the hour
# and minute columns are deliberately left out so every reading of a day
# shares one timestamp.
df['day'] = pd.to_datetime(df[['year', 'month', 'day']])

# Daily mean of each measurement. resample('D') also emits a row (all NaN)
# for calendar days that have no readings.
# NOTE(review): if the raw files still contain numeric missing-value
# placeholders, they are averaged in here -- confirm they were removed upstream.
# NOTE(review): WDIR looks like a direction in degrees; an arithmetic mean
# does not handle wrap-around at 360 -- confirm this is acceptable.
df = df.set_index('day')[MEASUREMENT_COLS].resample('D').mean(numeric_only=True)

df.head()

Unnamed: 0_level_0,WDIR,WSPD,GST,PRES,ATMP,WTMP,DEWP
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,181.333333,5.445833,6.495833,1013.741667,2.475,1.195833,-1.179167
2000-01-02,159.833333,5.579167,6.375,1005.933333,4.058333,1.425,3.883333
2000-01-03,117.208333,9.258333,10.554167,1014.7375,0.425,0.983333,-0.158333
2000-01-04,240.083333,3.670833,4.520833,1008.258333,-2.341667,0.3625,-3.658333
2000-01-05,257.0,4.929167,5.9,1023.720833,-6.066667,0.145833,-9.558333


In [32]:
# Persist the current frame (index included, as to_csv does by default).
df.to_csv(path_or_buf='../data/sheb_buoy.csv')

# Weather Station Cleanup

In [33]:
# Precipitation readings at or below this value are treated as "no rain".
TRACE_PRECIP_THRESHOLD = 0.001

# Columns whose gaps are filled by linear interpolation.
# 'precip_in' is not listed because its NaNs are already replaced with 0
# below, and 'max_wind_gust_kts' is not listed because it is dropped at the
# end of this cell.
INTERPOLATED_COLS = [
    'max_temp_f', 'min_temp_f', 'max_dewpoint_f', 'min_dewpoint_f',
    'avg_wind_speed_kts', 'avg_wind_drct', 'min_rh', 'avg_rh', 'max_rh',
    'max_wind_speed_kts',
]

# One file per five-year span: sheb_2000-2004.csv ... sheb_2020-2024.csv.
dfs = [
    pd.read_csv(f"../data/sheb_{year}-{year+4}.csv")
    for year in [2000, 2005, 2010, 2015, 2020]
]

df = pd.concat(dfs)
df['day'] = pd.to_datetime(df['day'])
df = df.set_index('day')

df['doy'] = df.index.dayofyear

# Missing precipitation is treated as zero, and trace amounts are zeroed too.
df['precip_in'] = df['precip_in'].fillna(0.)
df.loc[df['precip_in'] <= TRACE_PRECIP_THRESHOLD, 'precip_in'] = 0

df['rain_today'] = (df['precip_in'] != 0).astype(int)
# shift(1) leaves NaN in the first row, and NaN != 0 evaluates to True, which
# would flag the first day as "rained yesterday". fill_value=0 makes that
# unknown day count as no rain, matching how missing precipitation is handled.
# NOTE(review): shift(1) means "previous row", so this assumes rows are in
# date order with no missing days -- confirm against the input files.
df['rain_yesterday'] = (df['precip_in'].shift(1, fill_value=0) != 0).astype(int)

# method='linear' treats rows as equally spaced and ignores the index values.
# NOTE(review): 'avg_wind_drct' looks like a direction; linear interpolation
# does not handle wrap-around -- confirm this is acceptable.
df[INTERPOLATED_COLS] = df[INTERPOLATED_COLS].interpolate(method='linear')

df = df.drop(columns=['snow_in', 'srad_mj', 'max_wind_gust_kts'])

In [34]:
# Persist the current frame (index included, as to_csv does by default).
df.to_csv(path_or_buf='../data/sheb_weather_station.csv')