# Konvertering fra parquet til geoparquet


In [33]:
import pandas as pd
import pyarrow as pr
import pyarrow.parquet as pq
import geopandas as pg
from shapely.geometry import Point
import folium

In [19]:
# Load the source parquet file into a pandas DataFrame.
# The relative path assumes the notebook is run from the directory containing `data/`.
file = pd.read_parquet('data/hais_2024-01-01.snappy.parquet')

In [24]:
# List the column names of the loaded frame.
print(file.columns)

Index(['date_time_utc', 'mmsi', 'longitude', 'latitude', 'status',
       'course_over_ground', 'speed_over_ground', 'rate_of_turn', 'maneuvre',
       'imo', 'callsign', 'ship_name', 'ship_type', 'length', 'draught',
       'data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry'],
      dtype='object')


In [28]:
# Build the point geometries from the longitude/latitude columns in a single
# vectorized call rather than constructing one shapely Point per row in a
# Python-level loop.
geometry = pg.points_from_xy(file['longitude'], file['latitude'])

# Declare the coordinate reference system so it is stored in the GeoParquet
# metadata instead of being left undefined.
# NOTE(review): assumes longitude/latitude are WGS84 degrees (EPSG:4326) — confirm.
gdf = pg.GeoDataFrame(file, geometry=geometry, crs='EPSG:4326')

# Write the GeoParquet file, then read it back to check the round trip.
gdf.to_parquet('data/converted_parquet_file.geoparquet')
gdf_read = pg.read_parquet('data/converted_parquet_file.geoparquet')
print(gdf_read.head(10))

        date_time_utc       mmsi  longitude   latitude  status  \
0 2024-01-01 23:03:31  257038950   8.382945  58.247200       0   
1 2024-01-01 23:03:39  257038950   8.382945  58.247200       0   
2 2024-01-01 23:03:51  257038950   8.382945  58.247200       0   
3 2024-01-01 23:04:01  257038950   8.382943  58.247200       0   
4 2024-01-01 23:04:10  257038950   8.382940  58.247204       0   
5 2024-01-01 23:04:21  257038950   8.382943  58.247200       0   
6 2024-01-01 23:04:31  257038950   8.382945  58.247204       0   
7 2024-01-01 23:04:39  257038950   8.382947  58.247204       0   
8 2024-01-01 23:04:50  257038950   8.382947  58.247208       0   
9 2024-01-01 23:05:01  257038950   8.382947  58.247204       0   

   course_over_ground  speed_over_ground  rate_of_turn  maneuvre  imo  \
0               360.0                0.0          -128         0    0   
1               360.0                0.0          -128         0    0   
2               360.0                0.1          -128

In [9]:
# List the columns of the frame read back from the GeoParquet file.
print(gdf_read.columns)

Index(['date_time_utc', 'mmsi', 'longitude', 'latitude', 'status',
       'course_over_ground', 'speed_over_ground', 'rate_of_turn', 'maneuvre',
       'imo', 'callsign', 'ship_name', 'ship_type', 'length', 'draught',
       'data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry'],
      dtype='object')


In [23]:
# Reference position to search from (latitude/longitude in degrees).
specific_lat = 40.0
specific_lon = -70.0
reference_point = Point(specific_lon, specific_lat)

# idxmin() returns an index *label*, so the row must be fetched with .loc;
# .iloc only selects the same row while the index is the default 0..n-1 range.
# NOTE(review): distance() is computed directly on the stored coordinates, so
# for lon/lat data the result is in degrees, not metres — confirm this is
# accurate enough for picking the nearest record.
closest_point = gdf_read.loc[gdf_read.distance(reference_point).idxmin()]
print(closest_point)

# folium.Map expects `location` (singular); the misspelled `locations` keyword
# did not set the map centre, so the map was not centred on the ship.
m = folium.Map(location=[closest_point.latitude, closest_point.longitude], zoom_start=10)
folium.Marker(
    [closest_point.latitude, closest_point.longitude],
    popup=f"Ship: {closest_point.ship_name}\nMMSI: {closest_point.mmsi}"
).add_to(m)
m

date_time_utc               2024-01-01 09:45:28
mmsi                                  352002289
longitude                              6.543423
latitude                               57.47652
status                                        0
course_over_ground                        262.1
speed_over_ground                          11.7
rate_of_turn                                  7
maneuvre                                      0
imo                                     9944144
callsign                                 3E3506
ship_name                          NORD VOLANTE
ship_type                                    89
length                                      183
draught                                   11.30
data_source                                   G
ais_class                                     A
hex_7                        608154189368918015
hex_14                       639679386753068359
geometry              POINT (6.543423 57.47652)
Name: 283571, dtype: object


# Partisjonering

In [42]:
# Read the GeoParquet file with plain pandas.
# NOTE(review): pandas presumably returns the geometry column as raw encoded
# bytes rather than shapely objects — confirm that is intended for the
# partitioning steps that follow.
geoparquet_file = pd.read_parquet('data/converted_parquet_file.geoparquet')

In [43]:
# Split date_time_utc into a calendar-date column and an hour-of-day column.
geoparquet_file = geoparquet_file.assign(
    date=lambda frame: frame['date_time_utc'].dt.date,
    hour=lambda frame: frame['date_time_utc'].dt.hour,
)

In [44]:
# Preview the first 10 rows of the frame.
print(geoparquet_file.head(10))

        date_time_utc       mmsi  longitude   latitude  status  \
0 2024-01-01 23:03:31  257038950   8.382945  58.247200       0   
1 2024-01-01 23:03:39  257038950   8.382945  58.247200       0   
2 2024-01-01 23:03:51  257038950   8.382945  58.247200       0   
3 2024-01-01 23:04:01  257038950   8.382943  58.247200       0   
4 2024-01-01 23:04:10  257038950   8.382940  58.247204       0   
5 2024-01-01 23:04:21  257038950   8.382943  58.247200       0   
6 2024-01-01 23:04:31  257038950   8.382945  58.247204       0   
7 2024-01-01 23:04:39  257038950   8.382947  58.247204       0   
8 2024-01-01 23:04:50  257038950   8.382947  58.247208       0   
9 2024-01-01 23:05:01  257038950   8.382947  58.247204       0   

   course_over_ground  speed_over_ground  rate_of_turn  maneuvre  imo  ...  \
0               360.0                0.0          -128         0    0  ...   
1               360.0                0.0          -128         0    0  ...   
2               360.0                0.

In [35]:
# Konverterer speed_over_ground til string
# geoparquet_file['speed_over_ground_str'] = geoparquet_file['speed_over_ground'].astype(str)

In [32]:
# Convert the pandas DataFrame to a pyarrow Table.
table = pr.Table.from_pandas(geoparquet_file)

In [37]:
# Partitioning: write the table as a partitioned parquet dataset under
# data/utdata_mappe, with one directory level per partition column
# (date, then hour, then ship_name).
# NOTE(review): ship_name is likely high-cardinality, so this may create a very
# large number of small files/directories and could hit pyarrow's partition
# limit — confirm this layout is intended.
# NOTE(review): re-running this cell presumably adds new files next to the
# existing ones rather than replacing them — verify before re-executing.
pq.write_to_dataset(
    table,
    root_path='data/utdata_mappe',
    partition_cols=['date', 'hour', 'ship_name']
)

# Filtrering