# R2R Navigation Data Processor

This notebook processes R2R NAV files containing ship navigation data. It can:
- Download R2R NAV files from provided URLs
- Parse time, latitude, and longitude data
- Calculate track lengths in kilometers, miles, and nautical miles
- Provide statistics for individual files and totals

In [None]:
import pandas as pd
from datetime import datetime
import requests
from pathlib import Path
import os
import math
from geopy.distance import geodesic
from tqdm.notebook import tqdm

In [None]:
def download_file(url, output_dir='data', timeout=60):
    """Download a file from a URL and save it locally.

    The local file name is the last ``/``-separated segment of ``url``. When
    a file with that name already exists in ``output_dir`` the download is
    skipped and the existing path is returned.

    Args:
        url: Address of the file to fetch.
        output_dir: Directory the file is stored in. It is created, together
            with any missing parent directories, when it does not exist.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the call forever.

    Returns:
        Path of the local file, as a string.

    Raises:
        ValueError: If ``url`` ends with ``/`` and so carries no file name.
        requests.HTTPError: If the server answers with an error status.
            Other ``requests`` exceptions (connection errors, timeouts)
            propagate unchanged.
    """
    basename = url.split('/')[-1]
    if not basename:
        # Without this check the "file" would be output_dir itself, which
        # always exists and would be returned as if it were a download.
        raise ValueError(f"Cannot derive a file name from URL: {url}")

    Path(output_dir).mkdir(parents=True, exist_ok=True)
    filename = os.path.join(output_dir, basename)

    if os.path.exists(filename):
        print(f"File {filename} already exists, skipping download")
        return filename

    # Stream into a temporary sibling and rename only on success, so an
    # interrupted transfer never leaves a truncated file that a later run
    # would mistake for a finished download.
    partial = filename + '.part'
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial, filename)
    finally:
        # Only still present when the download failed part-way.
        if os.path.exists(partial):
            os.remove(partial)

    return filename

In [None]:
def read_r2rnav(filepath):
    """Read an R2R NAV file into a pandas DataFrame with validation.

    Lines starting with ``//`` or ``#`` are skipped as comments. Every other
    line is stripped and split on tabs; its first three fields are taken as
    datetime, longitude and latitude (any further fields are ignored). Rows
    that fail validation are dropped, and all problems are printed together
    as a warning block after the file has been read.

    Args:
        filepath: Path of the NAV file to read.

    Returns:
        DataFrame with the columns ``datetime`` (converted with
        ``pd.to_datetime``), ``longitude`` and ``latitude`` (converted with
        ``pd.to_numeric``), one row per valid line in file order.

    Raises:
        ValueError: If the file contains no valid data rows.
    """
    data = []
    invalid_lines = []

    def is_valid_datetime(dt_str):
        # A successful parse is not sufficient: pd.to_datetime turns
        # missing-value markers such as "NaT" into NaT without raising.
        try:
            parsed = pd.to_datetime(dt_str)
        except Exception:
            return False
        return not pd.isna(parsed)

    def is_in_range(value_str, low, high):
        # NaN fails both comparisons, so it is rejected as well.
        try:
            return low <= float(value_str) <= high
        except ValueError:
            return False

    with open(filepath, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            if line.startswith(('//', '#')):
                continue

            parts = line.strip().split('\t')
            if len(parts) < 3:
                # Blank lines are harmless; anything else is a malformed row
                # that should not disappear without a trace.
                if line.strip():
                    invalid_lines.append(
                        f"Line {line_number}: Expected at least 3 tab-separated fields, found {len(parts)}"
                    )
                continue

            dt, lon, lat = parts[:3]

            # Validate each field; the first failure decides the message
            if not is_valid_datetime(dt):
                invalid_lines.append(f"Line {line_number}: Invalid datetime format: {dt}")
                continue
            if not is_in_range(lon, -180, 180):
                invalid_lines.append(f"Line {line_number}, {dt}: Invalid longitude value: {lon}")
                continue
            if not is_in_range(lat, -90, 90):
                invalid_lines.append(f"Line {line_number}, {dt}: Invalid latitude value: {lat}")
                continue

            data.append([dt, lon, lat])

    # If any invalid lines were found, print warnings
    if invalid_lines:
        print(f"\nWarnings while reading {os.path.basename(filepath)}:")
        for warning in invalid_lines:
            print(warning)
        print()

    if not data:
        raise ValueError(f"No valid data found in {filepath}")

    # Convert to DataFrame
    df = pd.DataFrame(data, columns=['datetime', 'longitude', 'latitude'])

    # Convert types (fields are still strings at this point)
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['longitude'] = pd.to_numeric(df['longitude'])
    df['latitude'] = pd.to_numeric(df['latitude'])

    return df

In [None]:
def calculate_track_length(df):
    """Calculate the total track length from a DataFrame of coordinates.

    The length is the sum of the ``geodesic`` distances between each pair of
    consecutive rows, taken in the order the rows appear in ``df``.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns. Frames
            with fewer than two rows yield a length of zero and their
            columns are not inspected.

    Returns:
        Dict with the total length under the keys ``kilometers``, ``miles``
        and ``nautical_miles``.
    """
    total_distance_km = 0

    if len(df) >= 2:
        # Pull the coordinates out once; indexing df.iloc[i][col] inside the
        # loop builds a new Series for every single lookup.
        points = list(zip(df['latitude'].to_numpy(), df['longitude'].to_numpy()))
        total_distance_km = sum(
            geodesic(start, end).kilometers
            for start, end in zip(points, points[1:])
        )

    # Convert to different units
    total_distance_mi = total_distance_km * 0.621371
    total_distance_nm = total_distance_km * 0.539957

    return {
        'kilometers': total_distance_km,
        'miles': total_distance_mi,
        'nautical_miles': total_distance_nm
    }

In [None]:
# TODO: also return start/end times and elapsed time.
#  Nav data *should* usually cover at least the portion of the cruise timeframe
#  from leaving port to returning, so we will have to think about how to tell
#  time spent sitting in port apart from time spent sitting on station during
#  an ROV dive. Maybe that is straightforward enough?

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate distance using Haversine formula (great-circle distance).
    Assumes spherical Earth with a radius of 6371 km.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns distance in kilometers.
    """
    EARTH_RADIUS_KM = 6371.0

    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Mathematically a lies in [0, 1], but rounding can push it marginally
    # past 1 for near-antipodal points, which would make asin raise.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))

    return EARTH_RADIUS_KM * c

def filter_by_min_distance(df, min_distance_m=10.0):
    """
    Keep only points that have moved at least min_distance from previous kept point.
    Useful for removing GPS jitter while stationary.

    The first row is always kept. Each later row is compared against the most
    recently kept row (not against its immediate predecessor), using the
    haversine distance converted to meters.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns
        min_distance_m: Minimum distance in meters between consecutive points

    Returns:
        Filtered DataFrame. It is always a copy, so modifying the result
        never touches the caller's frame.
    """
    if len(df) <= 1:
        # Copy here too, so the result is independent of the input on every path.
        return df.copy()

    # Extract the coordinates once instead of building a row Series through
    # df.iloc[...] four times per iteration.
    lats = df["latitude"].to_numpy()
    lons = df["longitude"].to_numpy()

    kept_indices = [0]  # Always keep first point
    last_kept = 0

    for i in range(1, len(df)):
        # Argument order for haversine_distance is (lat1, lon1, lat2, lon2)
        dist = (
            haversine_distance(lats[last_kept], lons[last_kept], lats[i], lons[i])
            * 1000
        )  # Convert to meters

        if dist >= min_distance_m:
            kept_indices.append(i)
            last_kept = i

    return df.iloc[kept_indices].copy()

In [None]:
def process_single_r2rnav_file(filepath):
    """Read one R2R NAV file and summarise its track.

    Args:
        filepath: Path of the NAV file.

    Returns:
        Tuple ``(stats, point_count)``. ``stats`` is the dict produced by
        ``calculate_track_length``; for a non-empty track it additionally
        carries ``elapsed_hours`` (rounded to one decimal), ``start_time``
        and ``end_time``. ``point_count`` is the number of rows left after
        any filtering.
    """
    track = read_r2rnav(filepath)

    # Files with "1min" in their name get a minimum-distance filter.
    # 7.5m (approx 0.25 knots) filters out GPS jitter while stationary.
    is_one_minute_file = '1min' in os.path.basename(filepath)
    if is_one_minute_file:
        track = filter_by_min_distance(track, min_distance_m=7.5)

    stats = calculate_track_length(track)
    point_count = len(track)

    if track.empty:
        return stats, point_count

    first_fix = track['datetime'].min()
    last_fix = track['datetime'].max()
    duration_seconds = (last_fix - first_fix).total_seconds()

    stats['elapsed_hours'] = round(duration_seconds / 3600, 1)
    stats['start_time'] = first_fix
    stats['end_time'] = last_fix

    return stats, point_count

def download_and_process_r2rnav(url, cruise_id, output_dir='data-local'):
    """Fetch an R2R NAV file and compute its track statistics.

    This is a best-effort wrapper: any exception raised while downloading or
    processing is reported with ``print`` and turned into a ``(None, 0)``
    result rather than propagated.

    Args:
        url: Address of the NAV file.
        cruise_id: Label used only in the error message.
        output_dir: Directory the downloaded file is stored in.

    Returns:
        The ``(stats, point_count)`` tuple from
        ``process_single_r2rnav_file``, or ``(None, 0)`` on failure.
    """
    try:
        local_path = download_file(url, output_dir)
        outcome = process_single_r2rnav_file(local_path)
    except Exception as exc:
        print(f"Error processing {cruise_id}: {exc}")
        return None, 0
    return outcome

def read_simple_csv_nav(filepath):
    """Read a simple CSV nav file (stub).

    Not implemented yet: the function currently ignores the file content and
    always returns an empty DataFrame.

    Args:
        filepath: Path of the CSV file; used only in the error message.

    Returns:
        An empty ``pd.DataFrame``.
    """
    # TODO: Implement reading of simple CSV format
    # Expected format: timestamp, latitude, longitude
    empty_frame = pd.DataFrame()

    try:
        # Placeholder for the real parser, e.g. pd.read_csv(filepath)
        ...
    except Exception as exc:
        print(f"Error reading CSV {filepath}: {exc}")

    return empty_frame

def process_files(file_paths):
    """Process multiple R2R NAV files and return statistics.

    Each file goes through ``process_single_r2rnav_file`` so that batch runs
    and single-file runs share one pipeline (read, optional minimum-distance
    filter for "1min" files, track length, time span).

    Args:
        file_paths: Iterable of paths to NAV files.

    Returns:
        Tuple ``(results_df, total_stats)``. ``results_df`` has one row per
        file with the columns ``file``, ``start_time``, ``end_time`` (both
        formatted as ``%Y-%m-%dT%H:%M`` strings), ``elapsed_hours``,
        ``points``, ``kilometers``, ``miles`` and ``nautical_miles``; the
        columns are present even when ``file_paths`` is empty.
        ``total_stats`` maps each distance unit to its sum over all files.

    Raises:
        Whatever ``process_single_r2rnav_file`` raises for a file; the first
        failing file aborts the batch.
    """
    distance_units = ('kilometers', 'miles', 'nautical_miles')
    result_columns = [
        'file', 'start_time', 'end_time', 'elapsed_hours', 'points',
        *distance_units,
    ]
    time_format = '%Y-%m-%dT%H:%M'

    results = []
    total_stats = dict.fromkeys(distance_units, 0)

    iterator = tqdm(file_paths, desc="Processing files", unit="file")

    for file_path in iterator:
        file_name = os.path.basename(file_path)
        iterator.set_description(f"Processing {file_name}")

        stats, point_count = process_single_r2rnav_file(file_path)

        results.append({
            'file': file_name,
            'start_time': stats['start_time'].strftime(time_format),
            'end_time': stats['end_time'].strftime(time_format),
            'elapsed_hours': stats['elapsed_hours'],
            'points': point_count,
            **{unit: stats[unit] for unit in distance_units},
        })

        # Update totals
        for unit in distance_units:
            total_stats[unit] += stats[unit]

    return pd.DataFrame(results, columns=result_columns), total_stats

## Example: Process Local Files

You can process local R2R NAV files by providing their paths:

In [None]:
# Example with local files: build a list of NAV file paths, run them through
# process_files, then show the per-file table and the summed distances.

# use this for local files (every .r2rnav file in the current working directory)
#local_files = [f for f in os.listdir() if f.endswith('.r2rnav')]

# use this for test files (every .r2rnav file in the 'tests' directory)
#local_files = [os.path.join('tests', f) for f in os.listdir('tests') if f.endswith('.r2rnav')]

# NOTE: setting to empty so we don't process any files; the guarded block
# below only runs once one of the lists above is enabled.
local_files = []
if local_files:
    results_df, totals = process_files(local_files)

    print("\nResults per file:")
    # presumably the notebook's built-in rich display helper - it is not
    # defined or imported in this file
    display(results_df)

    print("\nTotal distances:")
    for unit, distance in totals.items():
        # Prints the current wall-clock time ahead of each unit's total.
        # NOTE(review): no per-unit work happens here, and the URL example
        # prints no timestamp - this looks like leftover debugging; confirm.
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        print(f"Total distance in {unit}: {distance:.2f}")


## Example: Download and Process Files from URLs

You can also download and process files from URLs:

In [None]:
# Example with URLs: download each listed file, then process whatever was
# downloaded successfully.
urls = [
    # Add your URLs here
    # "http://example.com/file1.r2rnav",
    # "http://example.com/file2.r2rnav"
]

if urls:
    # Download files into download_file's default output directory. A failed
    # download is reported and skipped so the remaining URLs are still tried.
    downloaded_files = []
    for url in tqdm(urls, desc="Downloading files"):
        try:
            filename = download_file(url)
            downloaded_files.append(filename)
        except Exception as e:
            print(f"Error downloading {url}: {e}")

    # Process downloaded files (skipped entirely when every download failed)
    if downloaded_files:
        results_df, totals = process_files(downloaded_files)

        print("\nResults per file:")
        # presumably the notebook's built-in rich display helper - it is not
        # defined or imported in this file
        display(results_df)

        print("\nTotal distances:")
        for unit, distance in totals.items():
            print(f"Total distance in {unit}: {distance:.2f}")