# WMO Duckdb 

In [29]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
import pandas as pd
import geopandas as gpd
from shapely import wkt
import fiona
import s3fs
import pyarrow.parquet as pq
import shapely
import os
import json
import re
import duckdb

In [14]:
def to_wkt(polygon_string):
    # split the input string into pairs
    pairs = polygon_string.split(',')

    # transform each pair into 'y x' format
    # transformed_pairs = [' '.join(reversed(pair.split())) for pair in pairs]
    transformed_pairs = [' '.join(pair.split()) for pair in pairs]


    # join the transformed pairs with a comma and a space
    transformed_string = ', '.join(transformed_pairs)

    # return the final WKT string
    return f"POLYGON (({transformed_string}))"

In [15]:
createCmd = '''CREATE TABLE base (id VARCHAR, type VARCHAR, name VARCHAR, url VARCHAR, description VARCHAR, headline VARCHAR, g VARCHAR );
CREATE TABLE dataset (id VARCHAR, type VARCHAR, sameAs VARCHAR, license VARCHAR, citation VARCHAR, keyword VARCHAR, includedInDataCatalog VARCHAR, distribution VARCHAR, region VARCHAR, provider VARCHAR, publisher VARCHAR, creator VARCHAR);
CREATE TABLE sup_time (id VARCHAR, type VARCHAR, time VARCHAR, temporalCoverage VARCHAR, dateModified VARCHAR, datePublished VARCHAR, );

COPY base FROM '/home/fils/src/Projects/OIH/odis-arch/graphOps/extraction/mdp/output/*_baseQuery.parquet';
COPY dataset FROM '/home/fils/src/Projects/OIH/odis-arch/graphOps/extraction/mdp/output/*_dataset.parquet';
COPY sup_time FROM '/home/fils/src/Projects/OIH/odis-arch/graphOps/extraction/mdp/output/*_sup_temporal.parquet';

CREATE TABLE course AS SELECT * FROM read_parquet('/home/fils/src/Projects/OIH/odis-arch/graphOps/extraction/mdp/output/*_course.parquet',  union_by_name=true);
CREATE TABLE person AS SELECT * FROM read_parquet('/home/fils/src/Projects/OIH/odis-arch/graphOps/extraction/mdp/output/*_person.parquet',  union_by_name=true);
CREATE TABLE sup_geo AS SELECT * FROM read_parquet('/home/fils/src/Projects/OIH/odis-arch/graphOps/extraction/mdp/output/*_sup_geo.parquet',  union_by_name=true);
'''


In [16]:

sqlCmd = '''SELECT base_agg.id, base_agg.type_list, base_agg.name_list, dataset_agg.kw_list,
        base_agg.b_url, base_agg.b_desc, base_agg.b_headline, geo_agg.geom_list,
        temporal_agg.tc_list, temporal_agg.dp_list
FROM (
    SELECT id, STRING_AGG(DISTINCT type, ', ') AS type_list, STRING_AGG(DISTINCT name, ', ') AS name_list,
           any_value(url) AS b_URL, any_value(description) AS b_desc, any_value(headline) AS b_headline
    FROM base
    GROUP BY  id
) AS base_agg
JOIN (
    SELECT id, ANY_VALUE(includedInDataCatalog), STRING_AGG(DISTINCT keyword, ', ') AS kw_list
    FROM dataset
    GROUP BY  id
) AS dataset_agg
    ON base_agg.id = dataset_agg.id
JOIN (
    SELECT id,  STRING_AGG(DISTINCT geom, ', ') AS geom_list
    FROM sup_geo
    GROUP BY  id
) AS geo_agg
    ON base_agg.id = geo_agg.id
JOIN (
    SELECT id,  STRING_AGG(DISTINCT temporalCoverage, ', ') AS tc_list,  STRING_AGG(DISTINCT datePublished, ', ') AS dp_list
    FROM sup_time
    GROUP BY  id
) AS temporal_agg
ON   base_agg.id = temporal_agg.id
ORDER By base_agg.id;
'''

In [17]:
con = duckdb.connect()
con.execute(createCmd)  # load from url

<duckdb.duckdb.DuckDBPyConnection at 0x7f4d43ae41b0>

In [18]:
df = con.execute(sqlCmd).fetchdf()

In [19]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12163 entries, 0 to 12162
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          12163 non-null  object
 1   type_list   12163 non-null  object
 2   name_list   12163 non-null  object
 3   kw_list     12163 non-null  object
 4   b_URL       12163 non-null  object
 5   b_desc      12163 non-null  object
 6   b_headline  12163 non-null  object
 7   geom_list   12163 non-null  object
 8   tc_list     12163 non-null  object
 9   dp_list     12163 non-null  object
dtypes: object(10)
memory usage: 950.4+ KB
None


In [20]:
df.head(10)

Unnamed: 0,id,type_list,name_list,kw_list,b_URL,b_desc,b_headline,geom_list,tc_list,dp_list
0,https://catalogue.cioos.ca/dataset/0007ee4e-0c...,https://schema.org/Dataset,Boundary Pass Conductivity Temperature Depth D...,"speed_of_sound_in_sea_water, Subsurfacesalinit...",https://catalogue.cioos.ca/dataset/0007ee4e-0c...,The Sea-Bird Microcat SBE37SMP-ODO 9536 was de...,,48.7662667 -123.0393167 48.7662667 -123.0393167,2018-10-26/2019-05-19,2023-03-14T15:55:57.922767
1,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Capteur d'Oxygène déplo...,"Oxygen, Subsurface temperature, volume_fractio...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"-123.31687331, 49.04314848",2020-03-06/2020-09-24,2024-01-09T22:53:53.192781
2,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Oxygen Sensor Deployed ...,"Oxygène, Oceans, sea_water_temperature, Tempér...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"49.04314848, -123.31687331",2020-03-06/2020-09-24,2023-12-05T06:23:13.414755
3,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Capteur d'Oxygène déplo...,"Subsurface temperature, volume_fraction_of_oxy...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"-123.31687331, 49.04314848",2020-03-06/2020-09-24,2024-02-11T01:21:47.832449
4,https://catalogue.cioos.ca/dataset/003ead14-1f...,https://schema.org/Dataset,Patricia Bay Conductivité/Température/Profonde...,"speed_of_sound_in_sea_water, Subsurfacesalinit...",https://catalogue.cioos.ca/dataset/003ead14-1f...,The Sea-Bird SeaCAT SBE16plus V2 6536 was depl...,,48.65136 -123.4864183333 48.65136 -123.4864183333,2011-10-02/2012-02-23,2023-05-16T20:06:51.177217
5,https://catalogue.cioos.ca/dataset/003ead14-1f...,https://schema.org/Dataset,Patricia Bay Conductivité/Température/Profonde...,"Température sous la surface, sea_water_tempera...",https://catalogue.cioos.ca/dataset/003ead14-1f...,The Sea-Bird SeaCAT SBE16plus V2 6536 was depl...,,48.65136 -123.4864183333 48.65136 -123.4864183333,2011-10-02/2012-02-23,2023-06-09T21:58:39.062861
6,https://catalogue.cioos.ca/dataset/00863729-b5...,https://schema.org/Dataset,Barkley Canyon Upper Slope Capteur d'Oxygène d...,"Température sous la surface, sea_water_tempera...",https://catalogue.cioos.ca/dataset/00863729-b5...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,48.427373 -126.1743 48.427373 -126.1743,2019-05-16/2019-09-07,2023-03-14T16:03:01.039560
7,https://catalogue.cioos.ca/dataset/0092e8bc-e9...,https://schema.org/Dataset,Ferme Cormorant de Cascadia Seaweed Turbidimèt...,"Particulate matter, Matière particulaire, sea_...",https://catalogue.cioos.ca/dataset/0092e8bc-e9...,The Turner Cyclops-7F Fluorometer (S/N 900279)...,,"49.25736, -125.92503",2022-05-25/2023-04-26,2024-01-09T23:09:29.611338
8,https://catalogue.cioos.ca/dataset/0092e8bc-e9...,https://schema.org/Dataset,Ferme Cormorant de Cascadia Seaweed Turbidimèt...,"Particulate matter, Matière particulaire, sea_...",https://catalogue.cioos.ca/dataset/0092e8bc-e9...,The Turner Cyclops-7F Fluorometer (S/N 900279)...,,"49.25736, -125.92503",2022-05-25/2023-04-26,2023-12-05T06:29:23.231319
9,https://catalogue.cioos.ca/dataset/0092e8bc-e9...,https://schema.org/Dataset,Cascadia Seaweed Cormorant Farm Turbidity Mete...,"Particulate matter, Matière particulaire, sea_...",https://catalogue.cioos.ca/dataset/0092e8bc-e9...,The Turner Cyclops-7F Fluorometer (S/N 900279)...,,"49.25736, -125.92503",2022-05-25/2023-04-26,2024-02-11T00:53:55.913757


In [21]:
df['WKT'] = df['geom_list'].apply(to_wkt)


In [22]:
df.head()

Unnamed: 0,id,type_list,name_list,kw_list,b_URL,b_desc,b_headline,geom_list,tc_list,dp_list,WKT
0,https://catalogue.cioos.ca/dataset/0007ee4e-0c...,https://schema.org/Dataset,Boundary Pass Conductivity Temperature Depth D...,"speed_of_sound_in_sea_water, Subsurfacesalinit...",https://catalogue.cioos.ca/dataset/0007ee4e-0c...,The Sea-Bird Microcat SBE37SMP-ODO 9536 was de...,,48.7662667 -123.0393167 48.7662667 -123.0393167,2018-10-26/2019-05-19,2023-03-14T15:55:57.922767,POLYGON ((48.7662667 -123.0393167 48.7662667 -...
1,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Capteur d'Oxygène déplo...,"Oxygen, Subsurface temperature, volume_fractio...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"-123.31687331, 49.04314848",2020-03-06/2020-09-24,2024-01-09T22:53:53.192781,"POLYGON ((-123.31687331, 49.04314848))"
2,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Oxygen Sensor Deployed ...,"Oxygène, Oceans, sea_water_temperature, Tempér...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"49.04314848, -123.31687331",2020-03-06/2020-09-24,2023-12-05T06:23:13.414755,"POLYGON ((49.04314848, -123.31687331))"
3,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Capteur d'Oxygène déplo...,"Subsurface temperature, volume_fraction_of_oxy...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"-123.31687331, 49.04314848",2020-03-06/2020-09-24,2024-02-11T01:21:47.832449,"POLYGON ((-123.31687331, 49.04314848))"
4,https://catalogue.cioos.ca/dataset/003ead14-1f...,https://schema.org/Dataset,Patricia Bay Conductivité/Température/Profonde...,"speed_of_sound_in_sea_water, Subsurfacesalinit...",https://catalogue.cioos.ca/dataset/003ead14-1f...,The Sea-Bird SeaCAT SBE16plus V2 6536 was depl...,,48.65136 -123.4864183333 48.65136 -123.4864183333,2011-10-02/2012-02-23,2023-05-16T20:06:51.177217,POLYGON ((48.65136 -123.4864183333 48.65136 -1...


In [23]:
df.to_parquet('./output/temp.parquet') # needs to be done before geometry conversion/column add

In [24]:
def load_wkt(row):
    try:
        return wkt.loads(row)
    except Exception:
        return None


In [25]:
df['geometry'] = df['WKT'].apply(load_wkt)


IllegalArgumentException: point array must contain 0 or >1 elements

ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
IllegalArgumentException: point array must contain 0 or >1 elements

IllegalArgumentException: point array must contain 0 or >1 elements

IllegalArgumentException: point array must contain 0 or >1 elements

ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
IllegalArgumentException: point array must contain 0 or >1 elements

ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
ParseException: Expected number but encountered ','
ParseException: Expected number

In [26]:
gdf = gpd.GeoDataFrame(df, geometry='geometry')

In [27]:
gdf


Unnamed: 0,id,type_list,name_list,kw_list,b_URL,b_desc,b_headline,geom_list,tc_list,dp_list,WKT,geometry
0,https://catalogue.cioos.ca/dataset/0007ee4e-0c...,https://schema.org/Dataset,Boundary Pass Conductivity Temperature Depth D...,"speed_of_sound_in_sea_water, Subsurfacesalinit...",https://catalogue.cioos.ca/dataset/0007ee4e-0c...,The Sea-Bird Microcat SBE37SMP-ODO 9536 was de...,,48.7662667 -123.0393167 48.7662667 -123.0393167,2018-10-26/2019-05-19,2023-03-14T15:55:57.922767,POLYGON ((48.7662667 -123.0393167 48.7662667 -...,
1,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Capteur d'Oxygène déplo...,"Oxygen, Subsurface temperature, volume_fractio...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"-123.31687331, 49.04314848",2020-03-06/2020-09-24,2024-01-09T22:53:53.192781,"POLYGON ((-123.31687331, 49.04314848))",
2,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Oxygen Sensor Deployed ...,"Oxygène, Oceans, sea_water_temperature, Tempér...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"49.04314848, -123.31687331",2020-03-06/2020-09-24,2023-12-05T06:23:13.414755,"POLYGON ((49.04314848, -123.31687331))",
3,https://catalogue.cioos.ca/dataset/00347450-75...,https://schema.org/Dataset,Strait of Georgia East Capteur d'Oxygène déplo...,"Subsurface temperature, volume_fraction_of_oxy...",https://catalogue.cioos.ca/dataset/00347450-75...,The Sea-Bird SBE 63 Dissolved Oxygen Sensor 63...,,"-123.31687331, 49.04314848",2020-03-06/2020-09-24,2024-02-11T01:21:47.832449,"POLYGON ((-123.31687331, 49.04314848))",
4,https://catalogue.cioos.ca/dataset/003ead14-1f...,https://schema.org/Dataset,Patricia Bay Conductivité/Température/Profonde...,"speed_of_sound_in_sea_water, Subsurfacesalinit...",https://catalogue.cioos.ca/dataset/003ead14-1f...,The Sea-Bird SeaCAT SBE16plus V2 6536 was depl...,,48.65136 -123.4864183333 48.65136 -123.4864183333,2011-10-02/2012-02-23,2023-05-16T20:06:51.177217,POLYGON ((48.65136 -123.4864183333 48.65136 -1...,
...,...,...,...,...,...,...,...,...,...,...,...,...
12158,https://raw.githubusercontent.com/iodepo/odis-...,https://schema.org/Dataset,World Ocean Database - Multi-cast file: 2019/w...,,https://noaa-wod-pds.s3.amazonaws.com/,Data for multiple casts from the World Ocean D...,,31.332000732421875 -68.84600067138672 19.26699...,,,POLYGON ((31.332000732421875 -68.8460006713867...,
12159,https://raw.githubusercontent.com/iodepo/odis-...,https://schema.org/Dataset,World Ocean Database - Multi-cast file: 2020/w...,,https://noaa-wod-pds.s3.amazonaws.com/,Data for multiple casts from the World Ocean D...,,31.131000518798828 -66.23729705810547 27.20498...,,,POLYGON ((31.131000518798828 -66.2372970581054...,
12160,https://raw.githubusercontent.com/iodepo/odis-...,https://schema.org/Dataset,World Ocean Database - Multi-cast file: 2021/w...,,https://noaa-wod-pds.s3.amazonaws.com/,Data for multiple casts from the World Ocean D...,,-180.0 -70.05599975585938 180.0 64.36199951171875,,,POLYGON ((-180.0 -70.05599975585938 180.0 64.3...,
12161,https://raw.githubusercontent.com/iodepo/odis-...,https://schema.org/Dataset,World Ocean Database - Multi-cast file: 2022/w...,,https://noaa-wod-pds.s3.amazonaws.com/,Data for multiple casts from the World Ocean D...,,31.33099937438965 -70.18099975585938 25.157012...,,,POLYGON ((31.33099937438965 -70.18099975585938...,


In [31]:
i = 0
gdf_row_geojson = gdf.iloc[i:i + 1].geometry.to_json()
print(gdf_row_geojson)

{"type": "FeatureCollection", "features": [{"id": "0", "type": "Feature", "properties": {}, "geometry": null, "bbox": null}], "bbox": [NaN, NaN, NaN, NaN]}


  np.nanmin(b[:, 0]),  # minx
  np.nanmin(b[:, 1]),  # miny
  np.nanmax(b[:, 2]),  # maxx
  np.nanmax(b[:, 3]),  # maxy
