# OIH Dashboard pre-processor query

This notebook demonstrates a query approach for the pre-processed resources from the OIH Graph.


In [1]:
import duckdb

## Pre-processed OIH Graph

In [2]:
## load the combined graph
# Public location of the combined parquet; only used by the remote-load variant below.
url = "http://ossapi.oceaninfohub.org/public/combined.parquet"
# Install DuckDB's httpfs extension (used for reading files over HTTP).
duckdb.install_extension("httpfs")

# Open a DuckDB connection (no database file is given) and copy the parquet
# contents into a table named `my_table`.
con = duckdb.connect()
# Remote variant:
# load_sql = "CREATE TABLE my_table AS SELECT * FROM read_parquet('{}')".format(url)
load_sql = "CREATE TABLE my_table AS SELECT * FROM read_parquet('../../secret/combined.parquet')"  # local parquet copy
con.execute(load_sql)


<duckdb.DuckDBPyConnection at 0x7f220018b3f0>

In [3]:

# The table loaded from the Parquet file can now be queried with ordinary SQL.
# Count the rows for every (provider, type) pair. GROUP BY already yields exactly
# one row per pair, so DISTINCT is not needed.
# Note: the provider column is spelled "provder" in the data, so the queries must use that spelling.
# Alternative queries:
# r = con.execute("SELECT DISTINCT provder FROM my_table").fetchdf()
# r = con.execute("SELECT provder, type, ANY_VALUE(s), COUNT(*) AS count FROM my_table GROUP BY provder, type ORDER BY count DESC").fetchdf()
r = con.execute("SELECT provder, type, COUNT(*) AS count FROM my_table GROUP BY provder, type").fetchdf()

print(r)


                provder                      type   count
0          oceanexperts              schmea:Event   20606
1          oceanexperts             schmea:Course     491
2          oceanexperts     schmea:CourseInstance     491
3                   pdh  schemawrong:Organization    3562
4                   pdh       schemawrong:Dataset   32807
5              aquadocs       schmea:CreativeWork  261364
6              aquadocs             schmea:Person  105494
7              aquadocs       schmea:Organization   15198
8      invemardocuments             schmea:Person   13351
9      invemardocuments       schmea:CreativeWork   18647
10       invemarexperts             schmea:Person    1172
11  invemarinstitutions       schmea:Organization     269
12      invemartraining             schmea:Course     452
13      invemartraining       schmea:Organization     452
14      invemartraining       schmea:CreativeWork       1
15       invemarvessels            schmea:Vehicle      85
16       marin

In [4]:
# Frequency of each keyword, most common first. Rows whose keywords value is the
# literal string 'NaN' are left out.
keyword_sql = " SELECT keywords, COUNT(*) AS count FROM my_table WHERE keywords <> 'NaN' GROUP BY keywords order by count desc"
keyword_result = con.execute(keyword_sql)
r = keyword_result.fetchdf()
print(r)


                        keywords  count
0                      Fisheries  14646
1                        Biology   6946
2                         Oceans   4377
3                    Aquaculture   3950
4                     Occurrence   3623
...                          ...    ...
65044                    GLONASS      1
65045  Environmental mutagenesis      1
65046    Evaporation of droplets      1
65047                  génétique      1
65048            Oramiri-Ukwa R.      1

[65049 rows x 2 columns]


In [12]:
# Total number of rows that carry a keyword: count rows per keyword (skipping rows
# whose keywords value is the literal string 'NaN'), then add the per-keyword counts.
# GROUP BY already yields one row per keyword and row order cannot affect SUM, so the
# inner query needs neither DISTINCT nor ORDER BY.
r = con.execute(" SELECT SUM(count) AS total_count FROM ( SELECT keywords, COUNT(*) AS count FROM my_table WHERE keywords <> 'NaN' GROUP BY keywords) AS counts").fetchdf()
print(r)

   total_count
0     407099.0


In [13]:
# Number of rows per resource type, largest group first.
type_sql = " SELECT type, COUNT(*) AS count FROM my_table GROUP BY type order by count desc"
type_result = con.execute(type_sql)
r = type_result.fetchdf()
print(r)

                        type   count
0        schmea:CreativeWork  288427
1              schmea:Person  166284
2        schemawrong:Dataset  108245
3        schmea:Organization   48119
4               schmea:Event   20665
5             schmea:Dataset   20356
6   schemawrong:Organization    5703
7     schmea:ResearchProject    3608
8              schmea:Course    1400
9         schemawrong:Person    1372
10     schmea:CourseInstance    1011
11            schmea:Vehicle     115


## OIH Graph RDF parquet


In [14]:
# Public location of the RDF graph parquet; only used by the remote-load variant below.
url = "http://ossapi.oceaninfohub.org/public/oihgraph_rdf.parquet"
# Install DuckDB's httpfs extension (used for reading files over HTTP).
duckdb.install_extension("httpfs")

# A second, separate DuckDB connection holds the RDF graph, so the table name
# `my_table` can be reused without touching the table created on `con`.
con2 = duckdb.connect()
# Remote variant:
# rdf_load_sql = "CREATE TABLE my_table AS SELECT * FROM read_parquet('{}')".format(url)
rdf_load_sql = "CREATE TABLE my_table AS SELECT * FROM read_parquet('../../secret/oihgraph_rdf.parquet')"  # local parquet copy
con2.execute(rdf_load_sql)



<duckdb.DuckDBPyConnection at 0x7fedaae82730>

In [16]:

# Count how many rows use each predicate in the RDF graph table, most frequent first.
# GROUP BY already returns one row per predicate, so DISTINCT is not needed.
r = con2.execute("SELECT predicate, COUNT(*) AS count FROM my_table GROUP BY predicate ORDER BY count DESC").fetchdf()
print(r)

                                             predicate   count
0    <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>  634761
1                            <https://schema.org/name>  317029
2                        <https://schema.org/keywords>  306362
3                             <https://schema.org/url>  189343
4                     <https://schema.org/description>  135034
..                                                 ...     ...
139                <https://schema.org/productionDate>       6
140                   <https://schema.org/contactType>       3
141                      <http://schema.org/legalName>       3
142                   <https://schema.org/relatedLink>       3
143                   <https://schema.org/EventStatus>       1

[144 rows x 2 columns]
