Deler filen opp i batches og strømmer en og en batch i 2 sekunder om gangen

In [3]:
import duckdb
import time

# Oppretter DuckDB-tilkobling
con = duckdb.connect()

# Leser fra parquet-filen
query = """
    SELECT *
    FROM read_parquet('/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet')
"""

# Definer batch-størrelse
batch_size = 10000
offset = 0

while True:
    # Bygg spørring for neste del av data
    batch_query = f"{query} LIMIT {batch_size} OFFSET {offset}"
    batch = con.execute(batch_query).fetchdf()
    
    # Sjekker om den har nådd slutten
    if len(batch) == 0:
        print("Ingen flere data. Strømming avsluttes.")
        break
    
    # Viser info om batchen for å vise at data “kommer inn”
    print(f"\nStreamer batch fra offset {offset}:")
    # viserkun de første 5 kolonnene
    print(batch.iloc[:, :5].head())
    
    # For å simulere datastrømming venter den i 2 sek
    time.sleep(2)
    
    # Neste offset
    offset += batch_size

# Lukk tilkoblingen
con.close()


Streamer batch fra offset 0:
        date_time_utc       mmsi  longitude   latitude  status
0 2024-12-01 05:58:44  257565700   7.476903  58.020340       5
1 2024-12-01 05:30:48  257565700   7.476915  58.020353       5
2 2024-12-01 05:16:51  257565700   7.476912  58.020327       5
3 2024-12-01 05:12:51  257565700   7.476917  58.020340       5
4 2024-12-01 05:06:46  257565700   7.476915  58.020327       5

Streamer batch fra offset 10000:
        date_time_utc       mmsi  longitude   latitude  status
0 2024-12-01 18:26:39  258012180   8.034753  58.146902      15
1 2024-12-01 18:26:28  258012180   8.034750  58.146897      15
2 2024-12-01 18:26:18  258012180   8.034743  58.146895      15
3 2024-12-01 18:26:09  258012180   8.034735  58.146895      15
4 2024-12-01 18:25:58  258012180   8.034722  58.146900      15

Streamer batch fra offset 20000:
        date_time_utc       mmsi  longitude   latitude  status
0 2024-12-01 22:36:21  258258500   8.778727  58.461798       0
1 2024-12-01 22:36:1

Sjekker om filen inneholder en geometry kolonne

In [9]:
import pandas as pd

def check_geometry_in_parquet(file_path):
    try:
        # Les Parquet-filen inn i en Pandas DataFrame
        df = pd.read_parquet(file_path)

        # Skriv ut kolonnenavnene for å se hva som er tilgjengelig
        print("Kolonner i filen:", df.columns.tolist())

        # Sjekk om 'geometry' er en av kolonnene
        if 'geometry' in df.columns:
            print("Filen inneholder en 'geometry'-kolonne.")
        else:
            print("Filen inneholder IKKE en 'geometry'-kolonne.")
    except Exception as e:
        print("Noe gikk galt ved lesing av filen:", e)

if __name__ == "__main__":
    # Sett filbanen til din Parquet-fil
    file_path = "/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet"  # Endre til riktig bane
    check_geometry_in_parquet(file_path)


Kolonner i filen: ['date_time_utc', 'mmsi', 'longitude', 'latitude', 'status', 'course_over_ground', 'speed_over_ground', 'rate_of_turn', 'maneuvre', 'imo', 'callsign', 'ship_name', 'ship_type', 'length', 'draught', 'data_source', 'ais_class', 'hex_7', 'hex_14', 'geometry']
Filen inneholder en 'geometry'-kolonne.


Leser filen sine 5 første rader

In [14]:
import geopandas as gpd
import pyarrow.parquet as pq

# Lese Parquet-fil
gdf = gpd.read_parquet("/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet")
df.head()



Unnamed: 0,date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
0,2024-12-01 05:58:44,257565700,7.476903,58.02034,5,215.0,0.0,-9.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910703,b'\x01\x01\x00\x00\x00a\xd0\xb6XY\xe8\x1d@{fI\...
1,2024-12-01 05:30:48,257565700,7.476915,58.020353,5,3.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910327,b'\x01\x01\x00\x00\x00*\xe3\xdfg\\\xe8\x1d@{[\...
2,2024-12-01 05:16:51,257565700,7.476912,58.020327,5,55.2,0.1,15.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910663,b'\x01\x01\x00\x00\x00\xd1\x1f\xbb\x87[\xe8\x1...
3,2024-12-01 05:12:51,257565700,7.476917,58.02034,5,134.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910671,b'\x01\x01\x00\x00\x00_xF\xd7\\\xe8\x1d@{fI\x8...
4,2024-12-01 05:06:46,257565700,7.476915,58.020327,5,26.6,0.0,10.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910663,b'\x01\x01\x00\x00\x00*\xe3\xdfg\\\xe8\x1d@\xe...


Setter en limit til kun 100 rader, den henter dermet kun de første 100 radene i filen og setter det i et kart


In [19]:
import duckdb
import pandas as pd
import folium
from IPython.display import display

# Opprett DuckDB-tilkobling
con = duckdb.connect()

# Leser fra parquet-filen
query = """
    SELECT *
    FROM read_parquet('/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet')
"""

# Definer batch-størrelse til 100 rader
batch_size = 100
offset = 0

# Bygg spørring for å hente 100 rader (én batch)
batch_query = f"{query} LIMIT {batch_size} OFFSET {offset}"
batch = con.execute(batch_query).fetchdf()

# Sjekk om vi har fått noen data
if len(batch) == 0:
    print("Ingen data funnet.")
else:
    print("Streamer 100 rader:")
    display(batch.head())  # Viser de første radene i en tabell

    # Hvis filen inneholder geodata, for eksempel med 'latitude' og 'longitude'-kolonner,
    # kan vi lage et enkelt Folium-kart:
    if "latitude" in batch.columns and "longitude" in batch.columns:
        avg_lat = batch["latitude"].mean()
        avg_lon = batch["longitude"].mean()
        m = folium.Map(location=[avg_lat, avg_lon], zoom_start=7)

        for _, row in batch.iterrows():
            lat = row["latitude"]
            lon = row["longitude"]
            if not pd.isna(lat) and not pd.isna(lon):
                folium.Marker(
                    location=[lat, lon],
                    popup=str(row.iloc[:5].to_dict())
                ).add_to(m)

        display(m)
    else:
        print("Ingen 'latitude'/'longitude'-kolonner funnet.")

# Lukk tilkoblingen
con.close()


Streamer 100 rader:


Unnamed: 0,date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
0,2024-12-01 05:58:44,257565700,7.476903,58.02034,5,215.0,0.0,-9.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910703,"[1, 1, 0, 0, 0, 97, 208, 182, 88, 89, 232, 29,..."
1,2024-12-01 05:30:48,257565700,7.476915,58.020353,5,3.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910327,"[1, 1, 0, 0, 0, 42, 227, 223, 103, 92, 232, 29..."
2,2024-12-01 05:16:51,257565700,7.476912,58.020327,5,55.2,0.1,15.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910663,"[1, 1, 0, 0, 0, 209, 31, 187, 135, 91, 232, 29..."
3,2024-12-01 05:12:51,257565700,7.476917,58.02034,5,134.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910671,"[1, 1, 0, 0, 0, 95, 120, 70, 215, 92, 232, 29,..."
4,2024-12-01 05:06:46,257565700,7.476915,58.020327,5,26.6,0.0,10.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910663,"[1, 1, 0, 0, 0, 42, 227, 223, 103, 92, 232, 29..."


In [23]:
import duckdb
import pandas as pd
from IPython.display import display

# Opprett DuckDB-tilkobling
con = duckdb.connect()

# Definer tidsintervallet
start_time = "2024-12-01 05:12:51"
end_time = "2024-12-01 05:30:48"

# SQL-spørring for å filtrere data basert på tidspunkt
query = f"""
    SELECT *
    FROM read_parquet('/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet')
 WHERE CAST(date_time_utc AS TIMESTAMP) BETWEEN '2024-12-01 05:12:51' AND '2024-12-01 05:30:48'

"""

# Utfør spørringen og hent dataen
filtered_data = con.execute(query).fetchdf()

# Sjekk om vi fikk noen rader
if filtered_data.empty:
    print("Ingen data funnet i det spesifiserte tidsintervallet.")
else:
    print("Filtrerte data:")
    display(filtered_data.head())  # Viser de første radene i en tabell

# Lukk tilkoblingen
con.close()


Filtrerte data:


Unnamed: 0,date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
0,2024-12-01 05:30:48,257565700,7.476915,58.020353,5,3.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910327,"[1, 1, 0, 0, 0, 42, 227, 223, 103, 92, 232, 29..."
1,2024-12-01 05:16:51,257565700,7.476912,58.020327,5,55.2,0.1,15.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910663,"[1, 1, 0, 0, 0, 209, 31, 187, 135, 91, 232, 29..."
2,2024-12-01 05:12:51,257565700,7.476917,58.02034,5,134.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910671,"[1, 1, 0, 0, 0, 95, 120, 70, 215, 92, 232, 29,..."
3,2024-12-01 05:29:45,257004200,8.385035,58.246423,5,360.0,0.1,-128.0,0.0,0,LG9963,RC LILLESAND,51,12,0.0,G,A,608155174829031423,639680372217695079,"[1, 1, 0, 0, 0, 119, 161, 185, 78, 35, 197, 32..."
4,2024-12-01 05:26:46,257004200,8.385097,58.24643,5,360.0,0.1,-128.0,0.0,0,LG9963,RC LILLESAND,51,12,0.0,G,A,608155174829031423,639680372217695071,"[1, 1, 0, 0, 0, 103, 255, 176, 99, 43, 197, 32..."


In [26]:
import duckdb
import pandas as pd
from IPython.display import display

# Opprett DuckDB-tilkobling
con = duckdb.connect()

# Definer tidsintervallet
start_time = "2024-12-01 05:14:51"
end_time = "2024-12-01 05:30:48"

# SQL-spørring for å filtrere data basert på tidspunkt
query = f"""
    SELECT *
    FROM read_parquet('/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet')
    WHERE CAST(date_time_utc AS TIMESTAMP)
    BETWEEN TIMESTAMP '{start_time}' AND TIMESTAMP '{end_time}'
"""

# Utfør spørringen og hent dataen
filtered_data = con.execute(query).fetchdf()

# Sjekk om vi fikk noen rader
if filtered_data.empty:
    print("Ingen data funnet i det spesifiserte tidsintervallet.")
else:
    print(f"Fant {len(filtered_data)} rader i tidsintervallet:")
    display(filtered_data)  # Viser hele den filtrerte tabellen

# Lukk tilkoblingen
con.close()


Fant 211 rader i tidsintervallet:


Unnamed: 0,date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
0,2024-12-01 05:30:48,257565700,7.476915,58.020353,5,3.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910327,"[1, 1, 0, 0, 0, 42, 227, 223, 103, 92, 232, 29..."
1,2024-12-01 05:16:51,257565700,7.476912,58.020327,5,55.2,0.1,15.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910663,"[1, 1, 0, 0, 0, 209, 31, 187, 135, 91, 232, 29..."
2,2024-12-01 05:29:45,257004200,8.385035,58.246423,5,360.0,0.1,-128.0,0.0,0,LG9963,RC LILLESAND,51,12,0.0,G,A,608155174829031423,639680372217695079,"[1, 1, 0, 0, 0, 119, 161, 185, 78, 35, 197, 32..."
3,2024-12-01 05:26:46,257004200,8.385097,58.246430,5,360.0,0.1,-128.0,0.0,0,LG9963,RC LILLESAND,51,12,0.0,G,A,608155174829031423,639680372217695071,"[1, 1, 0, 0, 0, 103, 255, 176, 99, 43, 197, 32..."
4,2024-12-01 05:23:45,257004200,8.385032,58.246440,5,360.0,0.1,-128.0,0.0,0,LG9963,RC LILLESAND,51,12,0.0,G,A,608155174829031423,639680372217695087,"[1, 1, 0, 0, 0, 202, 63, 167, 222, 34, 197, 32..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,2024-12-01 05:15:40,258258500,8.778707,58.461893,0,229.6,0.0,0.0,0.0,0,LDLN,RESCUE STORMBULL,51,17,1.0,G,A,608155183335079935,639680380713776863,"[1, 1, 0, 0, 0, 57, 193, 171, 163, 178, 142, 3..."
207,2024-12-01 05:15:30,258258500,8.778708,58.461892,0,227.6,0.0,0.0,0.0,0,LDLN,RESCUE STORMBULL,51,17,1.0,G,A,608155183335079935,639680380713776863,"[1, 1, 0, 0, 0, 16, 242, 180, 219, 178, 142, 3..."
208,2024-12-01 05:15:20,258258500,8.778707,58.461890,0,226.5,0.0,0.0,0.0,0,LDLN,RESCUE STORMBULL,51,17,1.0,G,A,608155183335079935,639680380713776863,"[1, 1, 0, 0, 0, 57, 193, 171, 163, 178, 142, 3..."
209,2024-12-01 05:15:10,258258500,8.778708,58.461890,0,226.6,0.0,0.0,0.0,0,LDLN,RESCUE STORMBULL,51,17,1.0,G,A,608155183335079935,639680380713776863,"[1, 1, 0, 0, 0, 16, 242, 180, 219, 178, 142, 3..."


Filrerer filen basert på tidsstempler og viser de filtrerte dataene i et kart

In [47]:
import duckdb
import pandas as pd
import folium
from IPython.display import display

# Opprett DuckDB-tilkobling
con = duckdb.connect()

# Definer tidsintervallet
start_time = "2024-12-01 05:12:51"
end_time = "2024-12-01 05:14:00"

# SQL-spørring for å filtrere data basert på tidspunkt
query = f"""
    SELECT *
    FROM read_parquet('/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet')
    WHERE CAST(date_time_utc AS TIMESTAMP)
    BETWEEN TIMESTAMP '{start_time}' AND TIMESTAMP '{end_time}'
"""

# Utfør spørringen og hent dataen
filtered_data = con.execute(query).fetchdf()

# Lukk tilkoblingen
con.close()


# Sjekk om vi fikk noen rader
if filtered_data.empty:
    print("Ingen data funnet i det spesifiserte tidsintervallet.")
else:
    print(f"Fant {len(filtered_data)} rader i tidsintervallet:")
    display(filtered_data)  # Viser hele den filtrerte tabellen

    # Sjekk om datasettet inneholder 'latitude' og 'longitude'
    if "latitude" in filtered_data.columns and "longitude" in filtered_data.columns:
        avg_lat = filtered_data["latitude"].mean()
        avg_lon = filtered_data["longitude"].mean()

        # Lag et Folium-kart med gjennomsnittlig senterpunkt
        m = folium.Map(location=[avg_lat, avg_lon], zoom_start=10)

        # Legg til punktene i kartet
        for _, row in filtered_data.iterrows():
            lat, lon = row["latitude"], row["longitude"]
            if not pd.isna(lat) and not pd.isna(lon):  # Sjekk at verdiene ikke er NaN
                ship_name = row.get("ship_name", "Ukjent skip")  # Erstatt med riktig kolonnenavn for skipets navn
                popup_text = f"Timestamp: {row['date_time_utc']}<br>Ship: {ship_name}"

                folium.Marker(
                    location=[lat, lon],
                    popup=popup_text,
                ).add_to(m)

        # Vis kartet
        display(m)
    else:
        print("Ingen 'latitude'/'longitude'-kolonner funnet i datasettet.")


Fant 15 rader i tidsintervallet:


Unnamed: 0,date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
0,2024-12-01 05:12:51,257565700,7.476917,58.02034,5,134.2,0.0,16.0,0.0,0,LF5484,RESCUE 157 BILL,51,14,0.0,G,A,608154275335372799,639679472713910671,"[1, 1, 0, 0, 0, 95, 120, 70, 215, 92, 232, 29,..."
1,2024-12-01 05:13:58,258012180,8.034652,58.146895,15,360.0,0.0,-128.0,0.0,0,LF7245,SAR NICOLAI JARLSBY,51,9,0.6,G,A,608155171775578111,639680369155664551,"[1, 1, 0, 0, 0, 123, 231, 196, 220, 189, 17, 3..."
2,2024-12-01 05:13:47,258012180,8.034647,58.146887,15,360.0,0.0,-128.0,0.0,0,LF7245,SAR NICOLAI JARLSBY,51,9,0.6,G,A,608155171775578111,639680369155664783,"[1, 1, 0, 0, 0, 52, 59, 255, 52, 189, 17, 32, ..."
3,2024-12-01 05:13:38,258012180,8.034633,58.14686,15,360.0,0.0,-128.0,0.0,0,LF7245,SAR NICOLAI JARLSBY,51,9,0.6,G,A,608155171775578111,639680369155664791,"[1, 1, 0, 0, 0, 53, 103, 183, 117, 187, 17, 32..."
4,2024-12-01 05:13:28,258012180,8.03462,58.14684,15,360.0,0.0,-128.0,0.0,0,LF7245,SAR NICOLAI JARLSBY,51,9,0.6,G,A,608155171775578111,639680369155662183,"[1, 1, 0, 0, 0, 54, 147, 111, 182, 185, 17, 32..."
5,2024-12-01 05:13:18,258012180,8.034605,58.146835,15,360.0,0.0,-128.0,0.0,0,LF7245,SAR NICOLAI JARLSBY,51,9,0.6,G,A,608155171775578111,639680369155662183,"[1, 1, 0, 0, 0, 97, 142, 30, 191, 183, 17, 32,..."
6,2024-12-01 05:13:08,258012180,8.034602,58.146838,15,360.0,0.0,-128.0,0.0,0,LF7245,SAR NICOLAI JARLSBY,51,9,0.6,G,A,608155171775578111,639680369155662183,"[1, 1, 0, 0, 0, 180, 44, 12, 79, 183, 17, 32, ..."
7,2024-12-01 05:12:58,258012180,8.034608,58.146845,15,360.0,0.0,-128.0,0.0,0,LF7245,SAR NICOLAI JARLSBY,51,9,0.6,G,A,608155171775578111,639680369155662183,"[1, 1, 0, 0, 0, 209, 9, 219, 46, 184, 17, 32, ..."
8,2024-12-01 05:14:00,258258500,8.778695,58.461892,0,221.7,0.0,0.0,0.0,0,LDLN,RESCUE STORMBULL,51,17,1.0,G,A,608155183335079935,639680380713776863,"[1, 1, 0, 0, 0, 17, 30, 109, 28, 177, 142, 33,..."
9,2024-12-01 05:13:50,258258500,8.778688,58.461893,0,221.4,0.0,0.0,0.0,0,LDLN,RESCUE STORMBULL,51,17,1.0,G,A,608155183335079935,639680380713776863,"[1, 1, 0, 0, 0, 243, 64, 158, 60, 176, 142, 33..."


Partisjonering

In [57]:
import duckdb
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Opprett DuckDB-tilkobling
con = duckdb.connect()

# Les inn data fra en Parquet-fil
query = """
    SELECT *
    FROM read_parquet('/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet')
"""
data = con.execute(query).fetchdf()

# Lukk tilkoblingen
con.close()

# Konverter Pandas DataFrame til PyArrow Table
table = pa.Table.from_pandas(data)

# Definer output path
output_path = '/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/PartisjonertData'

# Skriv til partisjonerte Parquet-filer
pq.write_to_dataset(
    table,
    root_path=output_path,
    partition_cols=['ship_name']  # Partisjonering basert på 'ship_name'
)

print(f"Data er skrevet til {output_path} i partisjonerte Parquet-filer.")


Data er skrevet til /Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/PartisjonertData i partisjonerte Parquet-filer.


Får lest fil, men geometri er i binær, må konvertere før partisjonering

In [60]:
import pyarrow.parquet as pq

# Definer filstien til den spesifikke partisjonen
partition_path = '/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/PartisjonertData/ship_name=RC%20LILLESAND'

# Les dataene fra partisjonen som en PyArrow Table
table = pq.read_table(partition_path)
display
# Konverter PyArrow Table til Pandas DataFrame for videre behandling
df = table.to_pandas()

# Vis DataFrame
print(df)

          date_time_utc       mmsi  longitude   latitude  status  \
0   2024-12-01 05:59:48  257004200   8.384973  58.246438       5   
1   2024-12-01 05:56:48  257004200   8.384982  58.246453       5   
2   2024-12-01 05:53:46  257004200   8.385055  58.246447       5   
3   2024-12-01 05:50:46  257004200   8.385032  58.246472       5   
4   2024-12-01 05:47:45  257004200   8.384997  58.246442       5   
..                  ...        ...        ...        ...     ...   
461 2024-12-01 18:42:15  257004200   8.385000  58.245000       5   
462 2024-12-01 18:39:15  257004200   8.385000  58.245000       5   
463 2024-12-01 18:39:15  257004200   8.385000  58.245000       5   
464 2024-12-01 18:36:15  257004200   8.383333  58.245000       5   
465 2024-12-01 18:36:15  257004200   8.383333  58.245000       5   

     course_over_ground  speed_over_ground  rate_of_turn  maneuvre  imo  \
0                 360.0                0.1        -128.0       0.0    0   
1                 360.0          

Prøver å konvertere fra binær

In [61]:
import duckdb
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import geopandas as gpd
from shapely import wkb

# Opprett DuckDB-tilkobling
con = duckdb.connect()

# Les inn data fra Parquet-fil (som inneholder geometrien i binær format)
query = """
    SELECT *
    FROM read_parquet('/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/hais_2024-12-01.snappy.parquet.geo.parquet')
"""
data = con.execute(query).fetchdf()

# Lukk tilkoblingen
con.close()

# Konverter Pandas DataFrame til GeoPandas DataFrame for å håndtere geometrien
# Anta at geometrien er lagret i en kolonne som heter 'geometry' som binære WKB-data
gdf = gpd.GeoDataFrame(data, geometry=gpd.GeoSeries.from_wkb(data['geometry']))

# Konverter geometrien fra binær WKB til WKT (tekstformat)
gdf['geometry'] = gdf['geometry'].apply(lambda x: x.wkt if x is not None else None)

# Nå kan du konvertere GeoDataFrame til PyArrow Table
table = pa.Table.from_pandas(gdf)

# Definer output path
output_path = '/Users/johannehaakenstad/Bachelor-Filer/Github-KartAI/Johanne/data/PartisjonertData'

# Skriv til partisjonerte Parquet-filer uten binær geometri
pq.write_to_dataset(
    table,
    root_path=output_path,
    partition_cols=['ship_name']  # Partisjonering basert på 'ship_name'
)

print(f"Data er skrevet til {output_path} i partisjonerte Parquet-filer uten binær geometri.")


TypeError: Expected bytes or string, got bytearray