# Konvertering fra Parquet til GeoParquet


In [138]:
import pandas as pd
import pyarrow as pr
import pyarrow.parquet as pq
import geopandas as pg
from shapely.geometry import Point
from shapely import wkb
import folium
import pyarrow.dataset as ds
import os
import time

In [139]:
# Load the raw parquet file for 2024-01-01 into a pandas DataFrame.
file = pd.read_parquet('data/hais_2024-01-01.snappy.parquet')
# Preview the first 10 rows as a quick sanity check of the load.
print(file.head(10))

        date_time_utc       mmsi  longitude   latitude  status  \
0 2024-01-01 23:03:31  257038950   8.382945  58.247200       0   
1 2024-01-01 23:03:39  257038950   8.382945  58.247200       0   
2 2024-01-01 23:03:51  257038950   8.382945  58.247200       0   
3 2024-01-01 23:04:01  257038950   8.382943  58.247200       0   
4 2024-01-01 23:04:10  257038950   8.382940  58.247204       0   
5 2024-01-01 23:04:21  257038950   8.382943  58.247200       0   
6 2024-01-01 23:04:31  257038950   8.382945  58.247204       0   
7 2024-01-01 23:04:39  257038950   8.382947  58.247204       0   
8 2024-01-01 23:04:50  257038950   8.382947  58.247208       0   
9 2024-01-01 23:05:01  257038950   8.382947  58.247204       0   

   course_over_ground  speed_over_ground  rate_of_turn  maneuvre  imo  \
0               360.0                0.0          -128         0    0   
1               360.0                0.0          -128         0    0   
2               360.0                0.1          -128

In [140]:
# List the column labels of the loaded frame.
print(file.columns)

Index(['date_time_utc', 'mmsi', 'longitude', 'latitude', 'status',
       'course_over_ground', 'speed_over_ground', 'rate_of_turn', 'maneuvre',
       'imo', 'callsign', 'ship_name', 'ship_type', 'length', 'draught',
       'data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry'],
      dtype='object')


In [141]:
# Build the point geometries from the longitude/latitude columns in a single
# vectorised call instead of creating one shapely Point per row in a Python loop.
geometry = pg.points_from_xy(file['longitude'], file['latitude'])
# Declare the coordinate reference system so it is stored in the GeoParquet
# metadata; without it the file is written with an unknown CRS.
# EPSG:4326 matches the CRS this notebook assigns to the same data when mapping it.
gdf = pg.GeoDataFrame(file, geometry=geometry, crs="EPSG:4326")
gdf.to_parquet('data/converted_parquet_file.geoparquet')
# Read the file back with geopandas to verify the round trip.
gdf_read = pg.read_parquet('data/converted_parquet_file.geoparquet')
print(gdf_read.head(10))

        date_time_utc       mmsi  longitude   latitude  status  \
0 2024-01-01 23:03:31  257038950   8.382945  58.247200       0   
1 2024-01-01 23:03:39  257038950   8.382945  58.247200       0   
2 2024-01-01 23:03:51  257038950   8.382945  58.247200       0   
3 2024-01-01 23:04:01  257038950   8.382943  58.247200       0   
4 2024-01-01 23:04:10  257038950   8.382940  58.247204       0   
5 2024-01-01 23:04:21  257038950   8.382943  58.247200       0   
6 2024-01-01 23:04:31  257038950   8.382945  58.247204       0   
7 2024-01-01 23:04:39  257038950   8.382947  58.247204       0   
8 2024-01-01 23:04:50  257038950   8.382947  58.247208       0   
9 2024-01-01 23:05:01  257038950   8.382947  58.247204       0   

   course_over_ground  speed_over_ground  rate_of_turn  maneuvre  imo  \
0               360.0                0.0          -128         0    0   
1               360.0                0.0          -128         0    0   
2               360.0                0.1          -128

In [142]:
# List the column labels of the frame read back from the GeoParquet file.
print(gdf_read.columns)

Index(['date_time_utc', 'mmsi', 'longitude', 'latitude', 'status',
       'course_over_ground', 'speed_over_ground', 'rate_of_turn', 'maneuvre',
       'imo', 'callsign', 'ship_name', 'ship_type', 'length', 'draught',
       'data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry'],
      dtype='object')


In [143]:
# Reference position to search from.
specific_lat = 40.0
specific_lon = -70.0
# shapely points are (x, y), i.e. (longitude, latitude).
reference_point = Point(specific_lon, specific_lat)
# idxmin() returns an index *label*, so the row is looked up with .loc; the
# positional .iloc only gives the same row while the index is a default 0..n-1 range.
# NOTE(review): distance() is computed directly on the coordinate values, so
# "closest" here is in degrees rather than metres — confirm that is acceptable.
closest_point = gdf_read.loc[gdf_read.distance(reference_point).idxmin()]
print(closest_point)

# folium.Map centres on `location` (singular). The previous `locations=` keyword
# is not the centre parameter, so the map did not open on the vessel.
m = folium.Map(location=[closest_point.latitude, closest_point.longitude], zoom_start=10)
folium.Marker(
    [closest_point.latitude, closest_point.longitude],
    popup=f"Ship: {closest_point.ship_name}\nMMSI: {closest_point.mmsi}"
).add_to(m)
m

date_time_utc               2024-01-01 09:45:28
mmsi                                  352002289
longitude                              6.543423
latitude                               57.47652
status                                        0
course_over_ground                        262.1
speed_over_ground                          11.7
rate_of_turn                                  7
maneuvre                                      0
imo                                     9944144
callsign                                 3E3506
ship_name                          NORD VOLANTE
ship_type                                    89
length                                      183
draught                                   11.30
data_source                                   G
ais_class                                     A
hex_7                        608154189368918015
hex_14                       639679386753068359
geometry              POINT (6.543423 57.47652)
Name: 283571, dtype: object


# Partisjonering

In [144]:
# Read the GeoParquet file back with plain pandas (not geopandas).
# NOTE(review): the geometry column presumably stays WKB-encoded bytes this way;
# the map section later decodes bytes with shapely.wkb — confirm.
geoparquet_file = pd.read_parquet('data/converted_parquet_file.geoparquet')

In [145]:
# Derive the two partition keys from the timestamp column: the calendar date
# and the hour of day. Columns are added in the order date, hour.
timestamp_parts = geoparquet_file['date_time_utc'].dt
for part_name in ('date', 'hour'):
    geoparquet_file[part_name] = getattr(timestamp_parts, part_name)

In [146]:
# Preview the first 10 rows, now including the derived date and hour columns.
print(geoparquet_file.head(10))

        date_time_utc       mmsi  longitude   latitude  status  \
0 2024-01-01 23:03:31  257038950   8.382945  58.247200       0   
1 2024-01-01 23:03:39  257038950   8.382945  58.247200       0   
2 2024-01-01 23:03:51  257038950   8.382945  58.247200       0   
3 2024-01-01 23:04:01  257038950   8.382943  58.247200       0   
4 2024-01-01 23:04:10  257038950   8.382940  58.247204       0   
5 2024-01-01 23:04:21  257038950   8.382943  58.247200       0   
6 2024-01-01 23:04:31  257038950   8.382945  58.247204       0   
7 2024-01-01 23:04:39  257038950   8.382947  58.247204       0   
8 2024-01-01 23:04:50  257038950   8.382947  58.247208       0   
9 2024-01-01 23:05:01  257038950   8.382947  58.247204       0   

   course_over_ground  speed_over_ground  rate_of_turn  maneuvre  imo  ...  \
0               360.0                0.0          -128         0    0  ...   
1               360.0                0.0          -128         0    0  ...   
2               360.0                0.

In [147]:
# Converts speed_over_ground to a string column (currently disabled)
# geoparquet_file['speed_over_ground_str'] = geoparquet_file['speed_over_ground'].astype(str)

In [148]:
# Convert the pandas DataFrame to a pyarrow Table.
table = pr.Table.from_pandas(geoparquet_file)

In [149]:
# Write the table as a partitioned dataset with one directory level per
# column: date, hour and ship_name.
# existing_data_behavior='delete_matching' clears each partition directory
# before writing into it. With the default behaviour every re-run of this cell
# adds another set of uniquely named files next to the old ones, so the dataset
# ends up containing duplicated rows.
pq.write_to_dataset(
    table,
    root_path='data/utdata_mappe',
    partition_cols=['date', 'hour', 'ship_name'],
    existing_data_behavior='delete_matching',
)

# Filtrering

In [150]:
# Create a dataset object that points at the partitioned directory
# (hive-style partitioning, parquet files).
dataset = ds.dataset("data/utdata_mappe", format="parquet", partitioning="hive")

In [151]:
# Load only the ship_name column from the dataset into pandas.
# The code that listed the unique ship names is commented out below, so this
# cell currently prints nothing.
table_test = dataset.to_table(columns=["ship_name"])
df = table_test.to_pandas()
#unique_ship_names = df['ship_name'].unique()
#print(unique_ship_names)

In [152]:
# Filter expression: date "2024-01-01", hour 12 and ship name "ALAND".
# All three fields are the columns the dataset was partitioned on.
filter_expr = (
    (ds.field("date") == "2024-01-01") &
    (ds.field("hour") == 12) &
    (ds.field("ship_name") == "ALAND")
)

In [153]:
# Read the rows matching filter_expr into a pyarrow Table.
filtered_table = dataset.to_table(filter=filter_expr)
#print(filtered_table)

In [154]:
# Convert the filtered table to a pandas DataFrame and preview the first rows.
df = filtered_table.to_pandas()
print(df.head())

        date_time_utc       mmsi  longitude   latitude  status  \
0 2024-01-01 12:00:02  249581000   9.050812  58.061813       0   
1 2024-01-01 12:00:05  249581000   9.050935  58.061870       0   
2 2024-01-01 12:00:09  249581000   9.051050  58.061913       0   
3 2024-01-01 12:00:15  249581000   9.051320  58.062040       0   
4 2024-01-01 12:00:32  249581000   9.051927  58.062310       0   

   course_over_ground  speed_over_ground  rate_of_turn  maneuvre      imo  \
0                50.9                5.5           -19         0  9487380   
1                49.1                6.0            13         0  9487380   
2                53.1                5.5            13         0  9487380   
3                48.5                5.7           -16         0  9487380   
4                49.1                5.4            20         0  9487380   

   ... length  draught  data_source ais_class               hex_7  \
0  ...    100     5.20            G         A  608155167698714623   
1 

# Visualiser på kart

In [155]:
def _to_geometry(value):
    """Decode WKB bytes into a shapely geometry; return any other value unchanged."""
    if isinstance(value, bytes):
        return wkb.loads(value)
    return value


# Applied element by element; values that are not bytes are left untouched,
# so running the cell again does not decode twice.
df['geometry'] = df['geometry'].apply(_to_geometry)

In [156]:
# Show the (rows, columns) shape of the filtered frame.
print(df.shape)

(527, 22)


In [157]:
# Put every filtered position on the map as its own marker
# (probably more than is needed).
gdf = pg.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")
lats = gdf.geometry.y
lons = gdf.geometry.x
# Centre the map on the mean position of all points.
map_center = [lats.mean(), lons.mean()]
m = folium.Map(location=map_center, zoom_start=10)
#marker_cluster = MarkerCluster().add_to(m)

# Walk the needed columns in parallel instead of materialising a row object per point.
for lat, lon, ship, timestamp in zip(lats, lons, gdf['ship_name'], gdf['date_time_utc']):
    marker = folium.Marker(
        location=[lat, lon],
        popup=f"{ship}<br>{timestamp}"
    )
    marker.add_to(m)#.add_to(marker_cluster)

m

In [158]:
# Show only the first and the last position, plus the track between them.

# Order the positions by time first: "first", "last", the drawn track and the
# elapsed time are only meaningful in chronological order, and the row order
# coming out of the dataset read is not sorted here. A stable sort keeps the
# existing order of rows that share a timestamp.
track = gdf.sort_values('date_time_utc', kind='stable')

m1 = folium.Map(location=map_center, zoom_start=10)
subset = track.iloc[[0, -1]]
for idx, row in subset.iterrows():
    folium.Marker(
        location=[row.geometry.y, row.geometry.x],
        popup=f"{row['ship_name']}<br>{row['date_time_utc']}"
    ).add_to(m1)

# The line runs through every position of the track, not just the two end points.
all_coords = [[point.y, point.x] for point in track.geometry]

# Only the first and last rows are needed for the elapsed time, so the whole
# frame is no longer converted to a list of dicts.
first_record = track.iloc[0]
last_record = track.iloc[-1]

# Time difference between the first and the last position.
time_diff = last_record['date_time_utc'] - first_record['date_time_utc']

# Attach the popup before adding the line to the map, and add it exactly once.
polyline = folium.PolyLine(
    locations=all_coords,
    color='blue',
    weight=5,
    opacity=0.8
)
polyline.add_child(folium.Popup(f"Tid brukt: {time_diff}", parse_html=True))
polyline.add_to(m1)

m1

In [159]:
# To-do list
# 1. Create a separate file for conversion and partitioning only? (Data owner?)
# 2. Let the (user) filter as desired? Use input?

# Optimalisering

In [160]:
# Optimisation notes kept as a bare string literal; being the last expression
# of the cell, Jupyter echoes its repr as the cell output.
'''
Dokumentert optimalisering
Hvor stor filen var i forhold til hvor stor den har blitt
Hastighet
Muligheten for bruker å laste ned filtrert fil
'''

'\nDokumentert optimalisering\nHvor stor filen var i forhold til hvor stor den har blitt\nHastighet\nMuligheten for bruker å laste ned filtrert fil\n'

In [161]:
# Helper that computes the total size of a directory tree
def get_directory_size(directory):
    """Return the combined size in bytes of every file below ``directory``.

    The tree is traversed with ``os.walk``, so files in nested
    sub-directories are included. A tree without files gives 0.
    """
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, names in os.walk(directory)
        for name in names
    )

In [162]:
# Total size in bytes of all files in the partitioned dataset directory.
original_size = get_directory_size("data/utdata_mappe")

In [163]:
# Start the timer, then convert the already-filtered pyarrow table to pandas.
# NOTE(review): the timer is stopped in the following cell after the file is
# written, so the measurement spans two cells and does not include the
# dataset filtering itself — confirm this is the intended scope.
start_time = time.time()
df_filtered = filtered_table.to_pandas()

In [164]:
# Save the filtered data to a Parquet file on the local machine.
output_path = "data/nedlastet_fil/filtered_output.parquet"
# Make sure the target folder exists first; on a fresh checkout the directory
# is missing and the write below would fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_filtered.to_parquet(output_path)
# Stop the timer that was started before the conversion to pandas.
end_time = time.time()

In [165]:
# Elapsed wall-clock time in seconds between start_time and end_time.
download_time = end_time - start_time

In [166]:
# Size in bytes of the Parquet file written to output_path.
filtered_file_size = os.path.getsize(output_path)

In [167]:
# Throughput in bytes per second. A very fast write can yield an elapsed time
# of zero (two clock readings with the same value), so fall back to infinity
# instead of raising ZeroDivisionError.
download_speed = filtered_file_size / download_time if download_time > 0 else float("inf")

In [168]:
# 4. Print a summary of the measurements: dataset size, filtered file size,
# elapsed time and the resulting throughput.
print(f"Original partisjonert data størrelse: {original_size} bytes")
print(f"Filtrert fil størrelse: {filtered_file_size} bytes")
print(f"Det tok {download_time:.2f} sekunder å laste ned/lagre den filtrerte filen.")
print(f"Nedlastingshastighet: {download_speed:.2f} bytes/sekund")

Original partisjonert data størrelse: 50727323 bytes
Filtrert fil størrelse: 42047 bytes
Det tok 0.02 sekunder å laste ned/lagre den filtrerte filen.
Nedlastingshastighet: 1735242.49 bytes/sekund
