In [None]:
import os
from pathlib import Path
from math import radians, sin, cos, sqrt, atan2

import pandas as pd
import numpy as np

In [8]:
DIR_DATA = Path(os.path.dirname(os.path.abspath(''))).resolve() / "data"
DIR_SOURCE = DIR_DATA / "raw"
DIR_OUTPUT = DIR_DATA / "processed" / "TOPP"
DIR_OUTPUT.mkdir(parents=True, exist_ok=True)
FILE_SHARKS = "TRACKING-OF-PELAGIC-PREDATORS"
print(f"Datasets directory:\n{DIR_DATA}")
print(f"Found sharks data? {Path(DIR_SOURCE / FILE_SHARKS).is_dir()}")

Datasets directory:
/home/isekar/Documents/projects/NASA_SpaceApps_2025/Project/data
Found sharks data? True


In [None]:
def haversine_distance(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    r = 6371000 
    return c * r

def calculate_bearing(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dLon = lon2 - lon1
    
    x = sin(dLon) * cos(lat2)
    y = cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(dLon)
    
    initial_bearing = atan2(x, y)
    
    initial_bearing = np.degrees(initial_bearing)
    compass_bearing = (initial_bearing + 360) % 360
    return compass_bearing

In [10]:
white = pd.read_csv(DIR_SOURCE / FILE_SHARKS / "GreatWhite.csv")
salmon = pd.read_csv(DIR_SOURCE / FILE_SHARKS / "SalmonShark.csv")

white["Species"] = "GreatWhite"
salmon["Species"] = "Salmon"

sharks = pd.merge(left=white, right=salmon, how='outer').drop(
    columns=["Unnamed: 0", 
             "Satellite SST", 
             "Observed SST", 
             "Observed Depth", 
             "Bathymetry Depth"]).sort_values(by='Date')


print(sharks.isna().sum())

print(sharks)

Date         0
Latitude     0
Longitude    0
SharkID      0
Species      0
dtype: int64
                        Date  Latitude  Longitude  SharkID     Species
84902   01-Apr-2001 00:00:00    22.100   -152.750  1900004  GreatWhite
84967   01-Apr-2001 12:00:00    22.200   -152.725  1900004  GreatWhite
100029  01-Apr-2004 00:00:00    45.175   -129.425  1703006      Salmon
45829   01-Apr-2004 00:00:00    38.100   -131.875  1903009  GreatWhite
44241   01-Apr-2004 00:00:00    35.200   -152.525  1903007  GreatWhite
...                      ...       ...        ...      ...         ...
108801  31-Oct-2012 08:00:00    45.000   -124.975  1911038  GreatWhite
92546   31-Oct-2012 12:00:00    25.125   -154.600  1911053  GreatWhite
101734  31-Oct-2012 12:00:00    22.100   -136.125  1911027  GreatWhite
108803  31-Oct-2012 12:00:00    45.250   -125.125  1911038  GreatWhite
108805  31-Oct-2012 14:28:00    45.400   -125.225  1911038  GreatWhite

[109136 rows x 5 columns]


In [11]:
sharks = sharks.reset_index().drop_duplicates(
    subset=["Species", "SharkID", "Date"]
    ).set_index(
        ["Species", "SharkID", "Date"]
        ).drop(
            columns=['index']
            ).sort_values(
                ['Species', 'SharkID', 'Date'])

sharks['prev_lat'] = sharks.groupby('SharkID')['Latitude'].shift(1)
sharks['prev_lon'] = sharks.groupby('SharkID')['Longitude'].shift(1)

mask = sharks['prev_lat'].notna()
sharks.loc[mask, 'distance_traveled'] = sharks[mask].apply(
    lambda row: haversine_distance(
        row['prev_lon'], 
        row['prev_lat'], 
        row['Longitude'], 
        row['Latitude']),
    axis=1
)

sharks.loc[mask, 'orientation'] = sharks[mask].apply(
    lambda row: calculate_bearing(
        row['prev_lon'], 
        row['prev_lat'], 
        row['Longitude'], 
        row['Latitude']),
    axis=1
)

sharks = sharks.dropna(subset=['distance_traveled', 'orientation']).copy()
sharks['x'] = sharks['orientation'].apply(sin)
sharks['y'] = sharks['orientation'].apply(cos)

sharks = sharks.drop(columns=["Longitude", "Latitude", "orientation"])

sharks = sharks.rename(
    columns={"prev_lat": "lat", 
            "prev_lon": "lon"})

sharks = sharks.rename_axis(["Species", "SharkID", "time"])

sharks = sharks.set_index('lon', append=True)
sharks = sharks.set_index('lat', append=True)

sharks = sharks.sort_values(
    ['Species', 'SharkID', "time", "lon", "lat"])

sharks = sharks.astype(np.float32)

print(sharks)

print(sharks.count())

                                                         distance_traveled  \
Species    SharkID time                 lon      lat                         
GreatWhite 1900004 01-Apr-2001 12:00:00 -152.750 22.100       1.141369e+04   
                   01-Dec-2000 00:00:00 -152.725 22.200       2.516905e+06   
                   01-Dec-2000 01:36:50 -130.300 33.225       1.087473e+04   
                   01-Dec-2000 06:00:00 -130.225 33.150       3.465991e+04   
                   01-Dec-2000 08:00:00 -130.050 32.875       1.815638e+04   
...                                                                    ...   
Salmon     1707017 31-Oct-2007 00:00:00 -146.425 59.375       8.168946e+04   
                   31-Oct-2007 02:02:48 -145.850 60.050       0.000000e+00   
                   31-Oct-2007 02:59:48 -145.850 60.050       0.000000e+00   
                   31-Oct-2007 12:00:00 -145.850 60.050       0.000000e+00   
                   31-Oct-2007 15:15:48 -145.850 60.050       1.

In [12]:
sharks = sharks.to_hdf(DIR_OUTPUT / "SHARKS.h5", key='sharks_df', mode='w')