<a href="https://colab.research.google.com/github/MathewBiddle/ioos_metrics/blob/yearly_ra_gts/notebooks/IOOS_obs_2_NDBC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install erddapy

Collecting erddapy
  Downloading erddapy-2.2.4-py3-none-any.whl.metadata (6.7 kB)
Downloading erddapy-2.2.4-py3-none-any.whl (24 kB)
Installing collected packages: erddapy
Successfully installed erddapy-2.2.4


In [2]:
import pandas as pd
from erddapy import ERDDAP

# Function to grab the data.

In [3]:
def get_ndbc_full_stats():
    e = ERDDAP(
        server="https://erddap.ioos.us/erddap",
        protocol="tabledap",
    )
    e.response = "csv"
    dsets = {"IOOS": "gts_regional_statistics",
             "NDBC": "gts_ndbc_statistics",
             "non-NDBC": "gts_non_ndbc_statistics"}
    df_out = pd.DataFrame()
    for key, value in dsets.items():
        e.dataset_id = value
        df = e.to_pandas(
            index_col="time (UTC)",
            parse_dates=True
        )
        df["source"] = key
        df_out = pd.concat([df_out,df])
    return df_out

# Go get the data and return the dataframe

In [4]:
df = get_ndbc_full_stats()

df['total'] = df['met'] + df['wave']

df

Unnamed: 0_level_0,Year,Month,locationID,region,sponsor,met,wave,source,total
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01 00:00:00+00:00,2018,1,46108,AOOS,ALASKA OCEAN OBSERVING SYSTEM,0,2592,IOOS,2592
2018-01-01 00:00:00+00:00,2018,1,AJXA2,AOOS,MARINE EXCHANGE OF ALASKA,8796,0,IOOS,8796
2018-01-01 00:00:00+00:00,2018,1,CDXA2,AOOS,MARINE EXCHANGE OF ALASKA,4782,0,IOOS,4782
2018-01-01 00:00:00+00:00,2018,1,ERXA2,AOOS,MARINE EXCHANGE OF ALASKA,5634,0,IOOS,5634
2018-01-01 00:00:00+00:00,2018,1,GIXA2,AOOS,MARINE EXCHANGE OF ALASKA,8798,0,IOOS,8798
...,...,...,...,...,...,...,...,...,...
2025-04-01 00:00:00+00:00,2025,4,OCSM2,,U.S. ARMY CORPS OF ENGINEERS,0,0,non-NDBC,0
2025-04-01 00:00:00+00:00,2025,4,44097,,U.S. ARMY CORPS OF ENGINEERS,0,2878,non-NDBC,2878
2025-04-01 00:00:00+00:00,2025,4,FRFN7,,U.S. ARMY CORPS OF ENGINEERS,0,0,non-NDBC,0
2025-04-01 00:00:00+00:00,2025,4,44100,,U.S. ARMY CORPS OF ENGINEERS,0,2874,non-NDBC,2874


# Compute yearly totals of number of IOOS messages sent to the GTS.

In [5]:
df.groupby(by=["source"])[['met','wave','total']].sum()

Unnamed: 0_level_0,met,wave,total
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IOOS,83046854,11890510,94937364
NDBC,59173500,13570724,72744224
non-NDBC,375902618,19371792,395274410


In [6]:
df.index.min().strftime('%Y')

'2018'

In [7]:
first_year = df.index.min().strftime('%Y')

ioos_total = df.loc[df['source']=='IOOS','total'].sum()

ioos_pcnt = ioos_total / df['total'].sum()

total = df['total'].sum()

print(f'Since {first_year} IOOS has contributed {ioos_total} messages to the GTS, which composes {(ioos_pcnt)*100:.2f} % of the total messages ({total}) reported by NDBC to the GTS')

Since 2018 IOOS has contributed 94937364 messages to the GTS, which composes 16.86 % of the total messages (562955998) reported by NDBC to the GTS


In [8]:
yearly_totals = df.groupby(by=["source", pd.Grouper(freq="YE")])[['met','wave']].sum()

yearly_totals['total'] = yearly_totals['met'] + yearly_totals['wave']

print(yearly_totals.loc['IOOS'].to_markdown(floatfmt=''))

| time (UTC)                |        met |      wave |      total |
|:--------------------------|-----------:|----------:|-----------:|
| 2018-12-31 00:00:00+00:00 | 10250188.0 | 1586648.0 | 11836836.0 |
| 2019-12-31 00:00:00+00:00 | 11344260.0 | 1682754.0 | 13027014.0 |
| 2020-12-31 00:00:00+00:00 | 10501536.0 | 1410472.0 | 11912008.0 |
| 2021-12-31 00:00:00+00:00 | 10189996.0 | 1610856.0 | 11800852.0 |
| 2022-12-31 00:00:00+00:00 | 11295426.0 | 1616542.0 | 12911968.0 |
| 2023-12-31 00:00:00+00:00 | 12719038.0 | 1796258.0 | 14515296.0 |
| 2024-12-31 00:00:00+00:00 | 12794686.0 | 1726010.0 | 14520696.0 |
| 2025-12-31 00:00:00+00:00 |  3951724.0 |  460970.0 |  4412694.0 |


In [9]:
ioos_year = yearly_totals.loc['IOOS','2024']['total'].iloc[0]

ioos_year

np.int64(14520696)

In [10]:
year = '2024'

all_year = yearly_totals.loc['NDBC',year]['total'] + yearly_totals.loc['non-NDBC',year]['total'] + yearly_totals.loc['IOOS',year]['total']

ioos_year = yearly_totals.loc['IOOS','2024']['total'].iloc[0]

pcnt_year = ioos_year / all_year.iloc[0]

platforms = len(df.loc[(df.index >= '2024-01-01') & (df.index <= '2024-12-31') & (df['source']=='IOOS')].locationID.unique())

print(f'In {year}, the IOOS regions have submitted {ioos_year} messages to the GTS (from {platforms} unique platforms). Which comprises {pcnt_year*100:.2f} % of the total messages sent to the GTS via NDBC.')

In 2024, the IOOS regions have submitted 14520696 messages to the GTS (from 229 unique platforms). Which comprises 17.20 % of the total messages sent to the GTS via NDBC.


# Compute yearly totals of IOOS messages sent to the GTS by region.

In [11]:
yearly_region_totals = df.groupby(by=["source", "region", pd.Grouper(freq="YE")])[['met','wave']].sum()

yearly_region_totals['total'] = yearly_region_totals['met'] + yearly_region_totals['wave']

yearly_region_totals.reset_index('region',inplace=True)

print(yearly_region_totals.loc['IOOS'].to_markdown(floatfmt=''))

| time (UTC)                | region   |     met |   wave |   total |
|:--------------------------|:---------|--------:|-------:|--------:|
| 2018-12-31 00:00:00+00:00 | AOOS     | 2571070 |  27968 | 2599038 |
| 2019-12-31 00:00:00+00:00 | AOOS     | 2810422 |  31584 | 2842006 |
| 2020-12-31 00:00:00+00:00 | AOOS     | 2697606 |   8258 | 2705864 |
| 2021-12-31 00:00:00+00:00 | AOOS     | 2574888 |  24336 | 2599224 |
| 2022-12-31 00:00:00+00:00 | AOOS     | 3675118 |  37414 | 3712532 |
| 2023-12-31 00:00:00+00:00 | AOOS     | 5620044 |  34026 | 5654070 |
| 2024-12-31 00:00:00+00:00 | AOOS     | 5939434 |  31992 | 5971426 |
| 2025-12-31 00:00:00+00:00 | AOOS     | 2044136 |  10966 | 2055102 |
| 2018-12-31 00:00:00+00:00 | CARICOOS |  619422 |  68520 |  687942 |
| 2019-12-31 00:00:00+00:00 | CARICOOS |  719876 |  83582 |  803458 |
| 2020-12-31 00:00:00+00:00 | CARICOOS |  629990 |  93232 |  723222 |
| 2021-12-31 00:00:00+00:00 | CARICOOS |  676958 |  99074 |  776032 |
| 2022-12-31 00:00:0

# Calculate the percentage of IOOS OBS per year

In [12]:
yearly_totals = df.groupby(by=["source", pd.Grouper(freq="YE")])[['met','wave']].sum()

yearly_totals['total'] = yearly_totals['met'] + yearly_totals['wave']

pcnt_obs_year = yearly_totals.loc['IOOS'] / (yearly_totals.loc['IOOS'] + yearly_totals.loc['NDBC'] + yearly_totals.loc['non-NDBC'])

# pcnt_obs_year['total'] =

print(pcnt_obs_year.to_markdown())

| time (UTC)                |      met |     wave |    total |
|:--------------------------|---------:|---------:|---------:|
| 2018-12-31 00:00:00+00:00 | 0.155619 | 0.296562 | 0.166207 |
| 2019-12-31 00:00:00+00:00 | 0.167812 | 0.312697 | 0.178495 |
| 2020-12-31 00:00:00+00:00 | 0.154613 | 0.277295 | 0.16316  |
| 2021-12-31 00:00:00+00:00 | 0.149813 | 0.3107   | 0.161207 |
| 2022-12-31 00:00:00+00:00 | 0.153994 | 0.278716 | 0.163134 |
| 2023-12-31 00:00:00+00:00 | 0.167084 | 0.222074 | 0.172366 |
| 2024-12-31 00:00:00+00:00 | 0.167279 | 0.217845 | 0.172026 |
| 2025-12-31 00:00:00+00:00 | 0.173669 | 0.228349 | 0.178125 |


In [13]:
yearly_totals.loc['IOOS'] + yearly_totals.loc['NDBC'] + yearly_totals.loc['non-NDBC']

# totals['total'] = totals['met']+totals['wave']

# totals['total'].sum()

#totals

Unnamed: 0_level_0,met,wave,total
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-12-31 00:00:00+00:00,65867386,5350148,71217534
2019-12-31 00:00:00+00:00,67601032,5381422,72982454
2020-12-31 00:00:00+00:00,67921642,5086546,73008188
2021-12-31 00:00:00+00:00,68018280,5184606,73202886
2022-12-31 00:00:00+00:00,73349742,5799952,79149694
2023-12-31 00:00:00+00:00,76123506,8088546,84212052
2024-12-31 00:00:00+00:00,76487008,7923096,84410104
2025-12-31 00:00:00+00:00,22754376,2018710,24773086


In [14]:
yearly_totals.loc['IOOS','total'].sum()

np.int64(94937364)

# Total number of unique platforms submitting data to the GTS from the IOOS regions

IOOS location IDs

In [15]:
df.loc[df['source']=='IOOS'].locationID

Unnamed: 0_level_0,locationID
time (UTC),Unnamed: 1_level_1
2018-01-01 00:00:00+00:00,46108
2018-01-01 00:00:00+00:00,AJXA2
2018-01-01 00:00:00+00:00,CDXA2
2018-01-01 00:00:00+00:00,ERXA2
2018-01-01 00:00:00+00:00,GIXA2
...,...
2025-04-01 00:00:00+00:00,SIPF1
2025-04-01 00:00:00+00:00,42098
2025-04-01 00:00:00+00:00,44095
2025-04-01 00:00:00+00:00,44086


In [16]:
n_ioos_stn = len(df.loc[df['source']=='IOOS'].locationID.unique())

All location ID's

In [17]:
n_all_stn = len(df.locationID.unique())

n_all_stn

1039

IOOS stations represent this of the total # stations whose data are delivered by NDBC to the GTS

In [18]:
station_pcnt = n_ioos_stn / n_all_stn

print(f'Since {first_year} IOOS regions have sent {n_ioos_stn} unique stations\' data to the GTS through NDBC. This comprises nearly {station_pcnt*100:.2f}% of all the stations reported ({n_all_stn}) and more than one third of the NOS stations')

Since 2018 IOOS regions have sent 260 unique stations' data to the GTS through NDBC. This comprises nearly 25.02% of all the stations reported (1039) and more than one third of the NOS stations


Focusing on station platform counts (260, I believe) - Can you compute what % that is of the total delivered by NDBC to the GTS? (I didn't notice if you identify the total # stations anywhere).

So, we can say "IOOS stations represent x% of the total # stations whose data are delivered by NDBC to the GTS"

In [19]:
ioos_ndbc_non_NDBC = yearly_totals.loc['IOOS'] + yearly_totals.loc['NDBC'] + yearly_totals.loc['non-NDBC']

yearly_totals.loc['IOOS','total'].sum() / ioos_ndbc_non_NDBC['total'].sum()

np.float64(0.1686408251040608)

# Total number of sponsors submitting data to the GTS via IOOS Regions

In [20]:
df.loc[df['source']=='IOOS'].sponsor

Unnamed: 0_level_0,sponsor
time (UTC),Unnamed: 1_level_1
2018-01-01 00:00:00+00:00,ALASKA OCEAN OBSERVING SYSTEM
2018-01-01 00:00:00+00:00,MARINE EXCHANGE OF ALASKA
2018-01-01 00:00:00+00:00,MARINE EXCHANGE OF ALASKA
2018-01-01 00:00:00+00:00,MARINE EXCHANGE OF ALASKA
2018-01-01 00:00:00+00:00,MARINE EXCHANGE OF ALASKA
...,...
2025-04-01 00:00:00+00:00,FLORIDA INSTITUTE OF TECHNOLOGY
2025-04-01 00:00:00+00:00,GREATER TAMPA BAY MARINE ADVISORY COUNCIL PORTS
2025-04-01 00:00:00+00:00,UNIVERSITY OF NORTH CAROLINA COASTAL STUDIES
2025-04-01 00:00:00+00:00,UNIVERSITY OF NORTH CAROLINA COASTAL STUDIES


In [21]:
len(df.loc[df['source']=='IOOS'].sponsor.unique())

47

# Average number of messages (met+wave) per year

For all the years, including 2025

In [22]:
print(df.groupby(by=["source", pd.Grouper(freq="YE")])[['met','wave']].sum().loc['IOOS'].mean().to_markdown(floatfmt=''))
print(df.groupby(by=["source", pd.Grouper(freq="YE")])[['met','wave']].sum().loc['IOOS'].mean().sum())

|      |           0 |
|:-----|------------:|
| met  | 10380856.75 |
| wave |  1486313.75 |
11867170.5


for 2018-2024, since 2025 is only through March.

In [23]:
print(df.groupby(by=["source", pd.Grouper(freq="YE")])[['met','wave']].sum().loc['IOOS'].loc[:"2024"].mean().to_markdown(floatfmt=''))
print(df.groupby(by=["source", pd.Grouper(freq="YE")])[['met','wave']].sum().loc['IOOS'].loc[:"2024"].mean().sum())
avg = df.groupby(by=["source", pd.Grouper(freq="YE")])[['met','wave']].sum().loc['IOOS'].loc[:"2024"].mean().sum()

print(f'On average, IOOS regions submit a total of {avg:.2f} (~13 million!) messages per year to the GTS.')

|      |                   0 |
|:-----|--------------------:|
| met  | 11299304.285714285  |
| wave |  1632791.4285714286 |
12932095.714285715
On average, IOOS regions submit a total of 12932095.71 (~13 million!) messages per year to the GTS.


## Total number of stations

In [24]:
print(len(df.locationID.unique()))

print(df.locationID.unique())

1039
['46108' 'AJXA2' 'CDXA2' ... '46236' '42354' 'CXLM2']


In [25]:
260/1039

0.2502406159769009

In [26]:
df.locationID

Unnamed: 0_level_0,locationID
time (UTC),Unnamed: 1_level_1
2018-01-01 00:00:00+00:00,46108
2018-01-01 00:00:00+00:00,AJXA2
2018-01-01 00:00:00+00:00,CDXA2
2018-01-01 00:00:00+00:00,ERXA2
2018-01-01 00:00:00+00:00,GIXA2
...,...
2025-04-01 00:00:00+00:00,OCSM2
2025-04-01 00:00:00+00:00,44097
2025-04-01 00:00:00+00:00,FRFN7
2025-04-01 00:00:00+00:00,44100


"IOOS contributes the largest number of NOS platforms to the GTS." For the latter, we'd need to identify "NOS" (not to confuse with an earlier comment in a different issue about defining NOS, in which case I was equating NOS with CO-OPS based on the non-NDBC report labels). In this case,
1. NOS = IOOS-regional (all) + non-NDBC (National ocean service, NOAA NOS PORTS, CBIBS, and NERRS.)

2. Perhaps add another condition where if there are 0s for stations, then omit from the station count?

3. And constrain the station count to only the Calendar Year 2024.

In [27]:
non_ndbc_nos_list = ['CHESAPEAKE BAY INTERPRETIVE BUOY SYSTEM',
'NATIONAL ESTUARINE RESEARCH RESERVE SYSTEM',
'NATIONAL OCEAN SERVICE',
'NOAA NOS PHYSICAL OCEANOGRAPHIC RT SYSTEM PROGRAM']

df_non_ndbc = df.loc[df['source']=='non-NDBC']

nos = df_non_ndbc[df_non_ndbc['sponsor'].isin(non_ndbc_nos_list)]
print('total NOS:',len(nos))
condition = (nos['met'] ==0) & (nos['wave']==0)
print('NOS empty',len(condition.loc[condition]))
nos = nos[~condition]
print('total NOS-empty:',len(nos))


ioos = df.loc[df['source']=='IOOS']
print('total IOOS:',len(ioos))
condition = (ioos['met'] ==0) & (ioos['wave']==0)
print('IOOS empty',len(condition.loc[condition]))
ioos = ioos[~condition]
print('total IOOS-empty:',len(ioos))

ioos_platform_count = len(ioos.locationID.unique())
nos_platform_count = len(nos.locationID.unique())

print(f'{ioos_platform_count} / {nos_platform_count + ioos_platform_count} = **{(ioos_platform_count/(nos_platform_count+ioos_platform_count))*100:.2f}%**')

total NOS: 28523
NOS empty 2627
total NOS-empty: 25896
total IOOS: 16186
IOOS empty 3638
total IOOS-empty: 12548
237 / 575 = **41.22%**


In [28]:
ioos = df.loc[df['source']=='IOOS']
ioos.loc[condition]['locationID'].unique()

array(['MRNA2', '41058', '41115', 'IMGP4', 'PTRP4', 'AUDP4', 'FRDP4',
       'VQSP4', 'YABP4', 'CQUC1', 'HBXC1', 'MYXC1', 'TDPC1', 'MBXC1',
       'BDXC1', 'FPXC1', 'GRBL1', '42043', '42046', '42047', 'LYBT2',
       '42067', '45179', '46246', 'DMNO3', '46118', '46119', '46128',
       '46121', '46122', '46123', '46125', '44024', '51203', '51204',
       '51207', '52211', '42023', 'BGCF1', 'MLSC1', '44022', '44040',
       '44034', 'ARPF1', '51201', '42044', '42045', '46096', '51208',
       'GIXA2', '46124', '46120', 'OKSI2', '45177', '52202', '45023',
       '45020', '45028', '51206', 'SSBN7', 'CDXA2', 'MBLA1', '52201',
       '44039', '45167', '51205', '41052', 'JAKI2', 'FSTI2', '45187',
       '45014', 'RLIT2', '45022', '45024', 'BSCA1', '46108', '51213',
       'MRSL1', '44095', 'SPLL1', '41033', 'AROP4', 'LDLC3', '45170',
       'KATA1', 'CRTA1', 'PPTA1', 'MHPA1', '45025', '52200', 'AJXA2',
       '41037', 'TWCO1', 'ERXA2', '44037', '41064', '51202', '51210',
       'SIPF1', '451

In [30]:
ioos = df.loc[df['source']=='IOOS']
#len(ioos.loc[(ioos['total_messages']!=0)].locationID.unique())

In [31]:
for loc in nos.locationID.unique():
  if len(nos.loc[nos['locationID']==loc].source.unique()) >1:
    print(loc,nos.loc[nos['locationID']==loc].source.unique())

In [32]:
count = 0
for sponsor in non_ndbc_nos_list:
  length = len(nos.loc[nos['sponsor']==sponsor].locationID.unique())
  print(f'{sponsor}: {length}')
  count = count + length

print(f'Total = {count}')
print(f'Total w/ IOOS = {count+ioos_platform_count}')
#len(nos.loc[nos['sponsor']=='CHESAPEAKE BAY INTERPRETIVE BUOY SYSTEM'].locationID.unique())

CHESAPEAKE BAY INTERPRETIVE BUOY SYSTEM: 8
NATIONAL ESTUARINE RESEARCH RESERVE SYSTEM: 35
NATIONAL OCEAN SERVICE: 283
NOAA NOS PHYSICAL OCEANOGRAPHIC RT SYSTEM PROGRAM: 83
Total = 409
Total w/ IOOS = 646


In [33]:
non_ndbc_nos_list = ['CHESAPEAKE BAY INTERPRETIVE BUOY SYSTEM',
'NATIONAL ESTUARINE RESEARCH RESERVE SYSTEM',
'NATIONAL OCEAN SERVICE',
'NOAA NOS PHYSICAL OCEANOGRAPHIC RT SYSTEM PROGRAM']

df_non_ndbc = df.loc[df['source']=='non-NDBC']
#print(len(df_non_ndbc))


filtered = df_non_ndbc[df_non_ndbc['sponsor'].isin(non_ndbc_nos_list)]
print('total NOS:',len(filtered))

nos = pd.concat([filtered,df.loc[df['source']=='IOOS']])
print('total IOOS:',len(nos.loc[nos['source']=='IOOS']))


# drop empty met and wave rows
condition = (nos['met'] ==0) & (nos['wave']==0)

nos = nos[~condition]
print('total IOOS-empty:',len(nos.loc[nos['source']=='IOOS']))
print('total NOS-empty:',len(nos.loc[nos['source']!='IOOS']))

ioos_platform_count = len(nos.loc[nos['source']=='IOOS'].locationID.unique())
nos_platform_count = len(nos.locationID.unique())

print(f'{ioos_platform_count} / {nos_platform_count} = **{(ioos_platform_count/nos_platform_count)*100:.2f}%**')

total NOS: 28523
total IOOS: 16186
total IOOS-empty: 12548
total NOS-empty: 25896
237 / 569 = **41.65%**


In [34]:
for loc in nos.locationID.unique():
  if len(nos.loc[nos['locationID']==loc].source.unique()) >1:
    print(loc,nos.loc[nos['locationID']==loc].source.unique())

BZST2 ['non-NDBC' 'IOOS']
EPTT2 ['non-NDBC' 'IOOS']
MGPT2 ['non-NDBC' 'IOOS']
TXPT2 ['non-NDBC' 'IOOS']
FPST2 ['non-NDBC' 'IOOS']
NUET2 ['non-NDBC' 'IOOS']


In [35]:
nos.loc[nos['sponsor']=='NATIONAL OCEAN SERVICE'].source.unique()

array(['non-NDBC', 'IOOS'], dtype=object)

In [36]:
len(nos.loc[nos['sponsor']=='NATIONAL ESTUARINE RESEARCH RESERVE SYSTEM'].locationID.unique())

35

In [37]:
len(nos.loc[nos['sponsor']=='NATIONAL OCEAN SERVICE'].locationID.unique())

283

In [38]:
len(nos.loc[nos['sponsor']=='NOAA NOS PHYSICAL OCEANOGRAPHIC RT SYSTEM PROGRAM'].locationID.unique())

83

In [39]:
df.loc[df['source']=='non-NDBC'].sponsor.unique()

array([nan, 'BP INC.', 'EPA & MEXICAN GOVERNMENT COOPERATIVE PROGRAM',
       'CHESAPEAKE BAY INTERPRETIVE BUOY SYSTEM',
       'SCRIPPS WAVERIDER COASTAL DATA INFORMATION PROGRAM',
       'EVERGLADES NATIONAL PARK', 'GREAT LAKES RESEARCH LABORATORY',
       'INTEGRATED CORAL OBSERVING NETWORK',
       'LOUISIANA OFFSHORE OIL PORT', 'MOSS LANDING MARINE LABORATORIES',
       'NATIONAL ESTUARINE RESEARCH RESERVE SYSTEM',
       'NATIONAL OCEAN SERVICE',
       'NATIONAL PARK SERVICE - LAKE MEAD NATIONAL REC AREA',
       'NATIONAL RENEWABLE ENERGY LABORATORY',
       'NATIONAL WEATHER SERVICE, ALASKA REGION',
       'NATIONAL WEATHER SERVICE, CENTRAL REGION',
       'NATIONAL WEATHER SERVICE, EASTERN REGION',
       'OCEAN OBSERVATORIES INITIATIVE', 'PETROBRAS', 'SHELL OIL',
       'U.S. ARMY CORPS OF ENGINEERS',
       'WOODS HOLE OCEANOGRAPHIC INSTITUTION', 'VERMONT EPSCOR',
       'NATIONAL PARK SERVICES - SLEEPING BEAR DUNES',
       'ALASKA OCEAN OBSERVING SYSTEM',
       'SUNY PLA

In [40]:
df_ioos = df.loc[(df['source']=='IOOS')]

condition = (df_ioos['met'] ==0) & (df_ioos['wave']==0)

len(df_ioos[~condition].locationID.unique())

237

In [41]:
df.locationID.unique()

condition = (df['met'] ==0) & (df['wave']==0)

zero_obs = df[condition]

no_obs_loc = []

for loc in zero_obs['locationID'].unique():
  if df.loc[(df['locationID'] == loc),'total'].any() == 0:
    no_obs_loc.append(loc)
    print(f'{loc} has no obs!')

print(f'This many platforms do not have observations for the entire record: {len(no_obs_loc)}')

41058 has no obs!
IMGP4 has no obs!
AUDP4 has no obs!
FRDP4 has no obs!
VQSP4 has no obs!
YABP4 has no obs!
CQUC1 has no obs!
HBXC1 has no obs!
MYXC1 has no obs!
TDPC1 has no obs!
MBXC1 has no obs!
BDXC1 has no obs!
FPXC1 has no obs!
GRBL1 has no obs!
42046 has no obs!
LYBT2 has no obs!
45179 has no obs!
DMNO3 has no obs!
46119 has no obs!
51203 has no obs!
51204 has no obs!
46124 has no obs!
PTAC1 has no obs!
MBPA1 has no obs!
42369 has no obs!
42375 has no obs!
42392 has no obs!
42887 has no obs!
SACV4 has no obs!
VERV4 has no obs!
41116 has no obs!
44093 has no obs!
LBRF1 has no obs!
MUKF1 has no obs!
42087 has no obs!
42088 has no obs!
42089 has no obs!
42090 has no obs!
BKTL1 has no obs!
CARL1 has no obs!
CPVM2 has no obs!
DMSF1 has no obs!
FRXM3 has no obs!
GCVF1 has no obs!
JXUF1 has no obs!
LTJF1 has no obs!
MCYF1 has no obs!
OBXC1 has no obs!
PMOA2 has no obs!
PNLM6 has no obs!
PRUR1 has no obs!
PTOA1 has no obs!
PXSC1 has no obs!
RCYF1 has no obs!
ULAM6 has no obs!
WBYA1 has 

In [43]:
df.loc[(df['locationID'] == 'TDPC1') & (df['total'] == 0)]

Unnamed: 0_level_0,Year,Month,locationID,region,sponsor,met,wave,source,total
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-01-01 00:00:00+00:00,2018,1,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
2018-02-01 00:00:00+00:00,2018,2,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
2018-03-01 00:00:00+00:00,2018,3,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
2018-04-01 00:00:00+00:00,2018,4,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
2018-05-01 00:00:00+00:00,2018,5,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
...,...,...,...,...,...,...,...,...,...
2024-12-01 00:00:00+00:00,2024,12,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
2025-01-01 00:00:00+00:00,2025,1,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
2025-02-01 00:00:00+00:00,2025,2,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0
2025-03-01 00:00:00+00:00,2025,3,TDPC1,CeNCOOS,CENTRAL AND NORTHERN CALIFORNIA OCEAN OBSERVIN...,0,0,IOOS,0


To confirm that number, here we search for each of those platform IDs and calculate some statistics. If the statistics are all zero for `met`, `wave`, and `total`, then we don't have observations for all those platforms.

In [44]:
len(df.loc[(df['locationID'].isin(no_obs_loc)) & (df.index>='2024-01-01') & (df.index<='2024-12-31') & (df['source'] == 'IOOS')].locationID.unique())

15

In [47]:
df.to_csv('gts_metrics.csv', index=False)

# From Gemini

Prompt:
1. ```Can you write a few statements about the importance of the data sourced from IOOS in relation to the entire dataset?```
2. ``` What about framing it around the total number of messages sent to the GTS by each source? ```


In [48]:
# Create the 'total_messages' column
df['total_messages'] = df['met'] + df['wave']

# Group by source and sum 'total_messages'
messages_by_source = df.groupby('source')['total_messages'].sum().reset_index()

# Calculate the grand total of messages
grand_total_messages = messages_by_source['total_messages'].sum()

# Calculate the percentage for each source
messages_by_source['percentage'] = (messages_by_source['total_messages'] / grand_total_messages) * 100

# Sort by total messages descending for better presentation
messages_by_source = messages_by_source.sort_values(by='total_messages', ascending=False)

# Print the results
print("Total Messages Sent to GTS (met + wave) by Source:")
print(messages_by_source.to_markdown(index=False, numalign="left", stralign="left", floatfmt=".2f"))

# Store the IOOS percentage for easy reference in the text
ioos_percentage = messages_by_source.loc[messages_by_source['source'] == 'IOOS', 'percentage'].iloc[0]
ioos_total_messages = messages_by_source.loc[messages_by_source['source'] == 'IOOS', 'total_messages'].iloc[0]
non_ndbc_total_messages = messages_by_source.loc[messages_by_source['source'] == 'non-NDBC', 'total_messages'].iloc[0]

Total Messages Sent to GTS (met + wave) by Source:
| source   | total_messages   | percentage   |
|:---------|:-----------------|:-------------|
| non-NDBC | 395274410        | 70.21        |
| IOOS     | 94937364         | 16.86        |
| NDBC     | 72744224         | 12.92        |


3. ```Can you also frame it around source and platformID?```

In [49]:
# Assuming 'platformID' refers to the 'locationID' column as 'platformID' does not exist in the data.
# Group by source and count unique locationIDs
locations_per_source = df.groupby('source')['locationID'].nunique().reset_index()
locations_per_source = locations_per_source.rename(columns={'locationID': 'unique_locations'})

# Merge this with the previously calculated messages_by_source DataFrame
# (Reload messages_by_source calculation for robustness in case kernel state changed)
df['total_messages'] = df['met'] + df['wave']
messages_by_source = df.groupby('source')['total_messages'].sum().reset_index()
source_summary = pd.merge(messages_by_source, locations_per_source, on='source')

# Calculate average messages per location for each source
source_summary['avg_messages_per_location'] = source_summary['total_messages'] / source_summary['unique_locations']

# Sort by total messages descending
source_summary = source_summary.sort_values(by='total_messages', ascending=False)

# Print the summary table
print("Summary by Source (including LocationID count and Avg Messages):")
# Displaying integer values for messages for clarity
print(source_summary.to_markdown(index=False, numalign="left", stralign="left", floatfmt=",.0f"))

Summary by Source (including LocationID count and Avg Messages):
| source   | total_messages   | unique_locations   | avg_messages_per_location   |
|:---------|:-----------------|:-------------------|:----------------------------|
| non-NDBC | 395274410        | 676                | 584,725                     |
| IOOS     | 94937364         | 260                | 365,144                     |
| NDBC     | 72744224         | 155                | 469,318                     |
