# Test AIS GeoParquet with DuckDB

In [None]:
path = 'az://kartaistorage.blob.core.windows.net/skygeo/kystverket/ais_geoparquet_open_data/*/*.parquet'

# Explore the large hive partitioned geoparquet dataset at the given path using
# duckdb: run a few representative queries and print the wall-clock time and
# the result of each one.
import duckdb
import time


def run_timed_query(con, label, sql):
    """Execute *sql* on *con*, print its elapsed time and result, and return the result.

    Args:
        con: An open DuckDB connection.
        label: Prefix used in the printed lines, e.g. ``"Query 1"``.
        sql: The SQL text to execute.

    Returns:
        The query result as produced by ``fetchdf()``.
    """
    # perf_counter() is monotonic, so the measured interval is not distorted
    # by system clock adjustments the way time.time() can be.
    started = time.perf_counter()
    frame = con.execute(sql).fetchdf()
    elapsed = time.perf_counter() - started
    print(f"{label} execution time: {elapsed:.2f} seconds")
    print(f"{label} results:")
    print(frame)
    return frame


con = duckdb.connect()
try:
    # Query 1: first 10 rows, all columns.
    result1 = run_timed_query(
        con, "Query 1", f"SELECT * FROM read_parquet('{path}') LIMIT 10"
    )
    ## Query 1 execution time: 1.07 seconds

    # Query 2: total row count across all matched files.
    result2 = run_timed_query(
        con, "Query 2", f"SELECT COUNT(*) FROM read_parquet('{path}')"
    )
    ## Query 2 execution time: 19.97 seconds

    # Query 3: 10 distinct mmsi values.
    result3 = run_timed_query(
        con, "Query 3", f"SELECT DISTINCT mmsi FROM read_parquet('{path}') LIMIT 10"
    )
    ## Query 3 execution time: 102.28 seconds

    # Query 4 (row count per mmsi, top 10) is disabled in this cell.
    # result4 = run_timed_query(con, "Query 4", f"SELECT mmsi, COUNT(*) as cnt FROM read_parquet('{path}') GROUP BY mmsi ORDER BY cnt DESC LIMIT 10")
    ## Query 4 execution time: 101.91 seconds

    # Query 5: first 10 rows for a single mmsi.
    result5 = run_timed_query(
        con, "Query 5", f"SELECT * FROM read_parquet('{path}') WHERE mmsi=257828000 LIMIT 10"
    )
    ## Query 5 execution time: 2.29 seconds
finally:
    # Close the connection even when one of the queries above raises.
    con.close()


Query 1 execution time: 1.07 seconds
Query 1 results:
              date_time_utc       mmsi dsrc      imo     ship_name  ship_type  \
0 2024-01-01 23:37:19+01:00  352002289    G  9944144  NORD VOLANTE         89   
1 2024-01-01 23:37:08+01:00  352002289    G  9944144  NORD VOLANTE         89   
2 2024-01-01 23:36:59+01:00  352002289    G  9944144  NORD VOLANTE         89   
3 2024-01-01 23:36:59+01:00  352002289    S  9944144  NORD VOLANTE         89   
4 2024-01-01 23:36:48+01:00  352002289    G  9944144  NORD VOLANTE         89   
5 2024-01-01 23:36:19+01:00  352002289    G  9944144  NORD VOLANTE         89   
6 2024-01-01 23:36:37+01:00  352002289    G  9944144  NORD VOLANTE         89   
7 2024-01-01 23:36:29+01:00  352002289    G  9944144  NORD VOLANTE         89   
8 2024-01-01 23:35:59+01:00  352002289    G  9944144  NORD VOLANTE         89   
9 2024-01-01 23:36:08+01:00  352002289    G  9944144  NORD VOLANTE         89   

  callsign  maneuvre       lon        lat  ...  speed 

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Query 2 execution time: 19.97 seconds
Query 2 results:
   count_star()
0    1196747539


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Query 3 execution time: 102.28 seconds
Query 3 results:
        mmsi
0  259690000
1  259032000
2  257088050
3  257085210
4  259655000
5  258999000
6  258242000
7  311000623
8  259121000
9  257952600


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Query 5 execution time: 2.29 seconds
Query 5 results:
              date_time_utc       mmsi dsrc      imo ship_name  ship_type  \
0 2024-01-01 01:41:40+01:00  257828000    G  8765280  INSPIRER         99   
1 2024-01-01 03:54:49+01:00  257828000    G  8765280  INSPIRER         99   
2 2024-01-01 06:08:09+01:00  257828000    G  8765280  INSPIRER         99   
3 2024-01-01 07:14:50+01:00  257828000    G  8765280  INSPIRER         99   
4 2024-01-01 08:21:00+01:00  257828000    G  8765280  INSPIRER         99   
5 2024-01-01 05:52:50+01:00  257828000    G  8765280  INSPIRER         99   
6 2024-01-01 06:59:29+01:00  257828000    G  8765280  INSPIRER         99   
7 2024-01-01 06:11:59+01:00  257828000    G  8765280  INSPIRER         99   
8 2024-01-01 08:24:49+01:00  257828000    G  8765280  INSPIRER         99   
9 2024-01-01 02:54:39+01:00  257828000    G  8765280  INSPIRER         99   

  callsign  maneuvre       lon        lat  ...  speed    cog  true_heading  \
0    LAON8         0

In [15]:
path = 'az://kartaistorage.blob.core.windows.net/skygeo/kystverket/ais_geoparquet_open_data/*/*.parquet'

# Explore the large hive partitioned geoparquet dataset at the given path using
# duckdb: count rows per mmsi and show the 10 mmsi values with the most rows.
import duckdb
import time

# The with-block closes the connection on exit, including when the query raises.
with duckdb.connect() as con:
    # perf_counter() is monotonic, so the measured interval is not distorted
    # by system clock adjustments the way time.time() can be.
    started = time.perf_counter()
    result4 = con.execute(
        f"SELECT mmsi, COUNT(*) as cnt FROM read_parquet('{path}') GROUP BY mmsi ORDER BY cnt DESC LIMIT 10"
    ).fetchdf()
    elapsed = time.perf_counter() - started

print(f"Query 4 execution time: {elapsed:.2f} seconds")
print("Query 4 results:")
print(result4)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Query 4 execution time: 101.91 seconds
Query 4 results:
        mmsi      cnt
0  257568600  2253655
1  259475000  2159590
2  259651000  2030989
3  258585000  2006580
4  258063000  1964550
5  257161000  1963712
6  257089530  1694440
7  257109000  1651505
8  257500000  1629688
9  257565600  1557699
