# Filtrering av data

In [4]:
import geopandas as gpd

In [5]:
def convert_geoparquet(filepath, output_path="output.geoparquet"):
    """Read a Parquet file as a GeoDataFrame and save a GeoParquet copy.

    Parameters
    ----------
    filepath : str or path-like
        Parquet file to read with ``gpd.read_parquet``.
    output_path : str, path-like or None, default "output.geoparquet"
        Destination of the GeoParquet copy. The default keeps the previous
        behaviour (a file named ``output.geoparquet`` relative to the working
        directory). Pass ``None`` to skip writing altogether.

    Returns
    -------
    geopandas.GeoDataFrame
        The data that was read from ``filepath``.
    """
    # Read the Parquet file
    df = gpd.read_parquet(filepath)

    # Defensive guard: wrap the result if the reader handed back a plain frame.
    # The geometry column is expected to be called "geometry".
    if not isinstance(df, gpd.GeoDataFrame):
        df = gpd.GeoDataFrame(df, geometry="geometry")

    # Save as a GeoParquet file unless the caller opted out
    if output_path is not None:
        df.to_parquet(output_path, engine="pyarrow")
    return df
# Load the input parquet file into df_gpd. Side effect: convert_geoparquet also
# writes a GeoParquet copy to "output.geoparquet" (relative to the working directory).
df_gpd = convert_geoparquet("parquet_files/hais_2024-01-01.snappy.parquet")

## Hente spesifikke kolonner

In [6]:
def check_col_exists(df, col):
    """Tell whether ``col`` is one of the column labels of ``df``.

    Parameters
    ----------
    df : object exposing a ``columns`` attribute (e.g. a DataFrame)
    col : label to look up

    Returns
    -------
    bool
        Result of the membership test ``col in df.columns``.
    """
    column_labels = df.columns
    is_present = col in column_labels
    return is_present

In [7]:
def filtering_col(df, cols):
    """Return a copy of ``df`` restricted to the requested columns that exist.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Frame to select columns from.
    cols : str or iterable of column labels
        Columns to keep. A single string is treated as one column name.
        Repeated names are used once (first occurrence decides the order),
        so the result never contains duplicated column labels because of
        a repeated entry in ``cols``.

    Returns
    -------
    DataFrame
        Independent copy holding only the requested columns that are present
        in ``df``. A message is printed for every requested column that is
        missing; missing columns are skipped rather than raising.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(cols, str):
        cols = [cols]

    valid_cols = []
    # dict.fromkeys drops repeated names while preserving first-seen order.
    for col in dict.fromkeys(cols):
        if col in df.columns:
            valid_cols.append(col)
        else:
            print(f"The column {col} don't exists in the dataframe")

    # .copy() so later edits to the result never touch the source frame.
    return df[valid_cols].copy()
    

In [8]:
# Keep only the listed columns. 'ship_name' used to be listed twice, which
# produced two identical 'ship_name' columns in the result; it is listed once now.
# NOTE(review): if the second entry was meant to be a different column, add it here.
filtering_df = filtering_col(df_gpd, ['ship_name', 'length', 'draught', 'data_source'])

## Hente spesifikke rader
- Splitte dataframe i mindre deler, kjøre parallelle søk og så sette dem sammen

In [9]:
def filtering_rad(df, search_word):
    """Return the rows of ``df`` in which any cell contains ``search_word``.

    The comparison is case-insensitive and literal: ``search_word`` is never
    interpreted as a regular expression, so characters such as ``(`` or ``.``
    are matched as written. Each cell is compared through its string form.
    Missing values (NaN/None) never match, even though their string form
    would be "nan" or "None".

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Frame to search.
    search_word : object
        Value to look for; converted with ``str`` and lower-cased.

    Returns
    -------
    DataFrame
        The matching rows of ``df``. When nothing matches, a message is
        printed and an empty frame with the same columns is returned.
    """
    # Convert search word to lowercase for case-insensitive search
    search_word = str(search_word).lower()

    # Build the row mask one column at a time: each column is handled by
    # vectorised string operations instead of one Python call per row.
    # Columns are addressed by position so duplicated labels are harmless.
    matches = None
    for position in range(df.shape[1]):
        column = df.iloc[:, position]
        hits = (
            column.astype(str)
            .str.lower()
            .str.contains(search_word, regex=False, na=False)
        )
        # astype(str) may render missing cells as "nan"/"None"; exclude them.
        hits = hits & column.notna()
        matches = hits if matches is None else (matches | hits)

    # Check if there are any matches (matches is None when df has no columns)
    if matches is None or not matches.any():
        print(f"'{search_word}' is not in the dataframe")
        return df.iloc[0:0]  # Return an empty DataFrame instead of None

    return df[matches]  # Return only matching rows

In [16]:
# Search only the first 300 rows of df_gpd for 'OPTIMERA'. filtering_rad lower-cases
# both the search word and the cell text, so the match is case-insensitive and
# looks at every column.
# Note: this rebinds filtering_df, replacing the column-filtered frame assigned earlier.
filtering_df = filtering_rad(df_gpd.head(300), 'OPTIMERA')
filtering_df

Unnamed: 0,date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
0,2024-01-01 23:03:31,257038950,8.382945,58.247200,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
1,2024-01-01 23:03:39,257038950,8.382945,58.247200,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
2,2024-01-01 23:03:51,257038950,8.382945,58.247200,0,360.0,0.1,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
3,2024-01-01 23:04:01,257038950,8.382943,58.247200,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
4,2024-01-01 23:04:10,257038950,8.382940,58.247204,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2024-01-01 19:40:49,257038950,8.382938,58.247250,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382471,POINT (8.38294 58.24725)
296,2024-01-01 19:41:01,257038950,8.382940,58.247250,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382471,POINT (8.38294 58.24725)
297,2024-01-01 19:41:10,257038950,8.382938,58.247253,0,360.0,0.1,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382471,POINT (8.38294 58.24725)
298,2024-01-01 19:41:19,257038950,8.382936,58.247253,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382471,POINT (8.38294 58.24725)


In [14]:
# Display df_gpd (the unfiltered frame returned by convert_geoparquet) as the cell output.
df_gpd

Unnamed: 0,date_time_utc,mmsi,longitude,latitude,status,course_over_ground,speed_over_ground,rate_of_turn,maneuvre,imo,callsign,ship_name,ship_type,length,draught,data_source,ais_class,hex_7,hex_14,geometry
0,2024-01-01 23:03:31,257038950,8.382945,58.247200,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
1,2024-01-01 23:03:39,257038950,8.382945,58.247200,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
2,2024-01-01 23:03:51,257038950,8.382945,58.247200,0,360.0,0.1,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
3,2024-01-01 23:04:01,257038950,8.382943,58.247200,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
4,2024-01-01 23:04:10,257038950,8.382940,58.247204,0,360.0,0.0,-128,0,0,LF6173,OPTIMERA,70,15,0.00,G,A,608155174778699775,639680372160382879,POINT (8.38294 58.2472)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467836,2024-01-01 13:18:38,304819000,6.546460,57.849327,0,309.2,12.1,-127,0,9858424,V2HF7,STARNES,70,190,7.20,G,A,608154296541773823,639679493918304095,POINT (6.54646 57.84933)
467837,2024-01-01 13:18:44,304819000,6.545890,57.849580,0,310.4,12.2,0,0,9858424,V2HF7,STARNES,70,190,7.20,G,A,608154296541773823,639679493918294231,POINT (6.54589 57.84958)
467838,2024-01-01 13:18:52,304819000,6.545229,57.849850,0,307.7,12.2,0,0,9858424,V2HF7,STARNES,70,190,7.20,G,A,608154296541773823,639679493918314767,POINT (6.54523 57.84985)
467839,2024-01-01 13:19:03,304819000,6.544426,57.850197,0,307.3,12.1,0,0,9858424,V2HF7,STARNES,70,190,7.20,G,A,608154296541773823,639679493917722039,POINT (6.54443 57.8502)
