# Ocean Data Processing

In [1]:
#IMPORT REQUIRED MODULES
import os
import xarray as xr
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
import numpy as np
import json
import math
import gsw # Gibbs-SeaWater (GSW) Oceanographic Toolbox 
import csv

## OMG ALAMO Floats (F9250 & F9313)

The following code processes the data recorded by two OMG ALAMO floats positioned within Disko Bay, F9250 & F9313 (https://podaac.jpl.nasa.gov/dataset/OMG_L1_FLOAT_ALAMO). The data are provided as JSON files - to begin, the number of JSON files provided within each directory is printed. The JSON files are then processed in order to sample oceanic variables at a specified depth. For the purposes of this study, oceanic variables were sampled at a depth of 240 m within Disko Bay. Variables such as temperature, salinity, pressure and depth are extracted, with the date, longitude and latitude of each dive also recorded from the 'DiveStart' metadata. It should be noted that depth is not measured directly and is therefore calculated from pressure, using the Gibbs-SeaWater (GSW) Oceanographic Toolbox (https://www.teos-10.org/software.htm#1). 

In [4]:
# Define a function to calculate the number of JSON files within a specified directory.
def count_json_files(directory):
    """Return how many files ending in ".json" exist under *directory*.

    The directory tree is walked recursively, so files in sub-directories are
    counted as well. The suffix check is case-sensitive. A directory that does
    not exist yields 0, because ``os.walk`` then produces no entries.
    """
    root = os.path.normpath(directory)
    return sum(
        1
        for _, _, names in os.walk(root)
        for name in names
        if name.endswith(".json")
    )

# Root directories holding the JSON files of the two ALAMO floats.
ALAMO_F9250_directory = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_alamo_f9250/'
ALAMO_F9313_directory = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_alamo_f9313/'

# Report how many JSON files each directory contains (one line per float).
f9250_json_count = count_json_files(ALAMO_F9250_directory)
print(f"Number of JSON files in ALAMO_F9250 directory: {f9250_json_count}")
f9313_json_count = count_json_files(ALAMO_F9313_directory)
print(f"Number of JSON files in ALAMO_F9313 directory: {f9313_json_count}")

Number of JSON files in ALAMO_F9250 directory: 90
Number of JSON files in ALAMO_F9313 directory: 94


In [5]:
# Sample every ALAMO JSON file in `directory` at the depth closest to `desired_depth`
# and write one row per usable dive to a CSV inside that same directory.

# Specify the input directory, desired sampling depth and output filename.
directory = ALAMO_F9250_directory
desired_depth = 240
output_filename = 'ALAMO_F9250_240m.csv'

# Column order of the output CSV.
output_columns = ['date', 'lon', 'lat', 'temperature', 'salinity', 'pressure', 'depth']

# Fixed latitude passed to gsw.z_from_p for the pressure-to-depth conversion.
latitude = 69.2

# One single-row DataFrame per usable dive. The rows are concatenated once after the
# loop: growing a DataFrame with pd.concat on every iteration copies all previous rows
# each time, and concatenating onto an empty frame is deprecated in recent pandas.
sampled_rows = []

# Within the specified directory, open each JSON file in read mode.
for filename in os.listdir(directory):
    if not filename.endswith(".json"):
        continue
    json_path = os.path.join(directory, filename)
    with open(json_path, 'r') as openfile:
        CTD_data = json.load(openfile)
    CTD_data_dict = CTD_data[0]

    # Only the first dive and its first GPS entry are used. Files without any dive
    # or GPS entry are skipped instead of raising IndexError.
    dives = CTD_data_dict.get('dives', [])
    if not dives:
        continue
    first_dive = dives[0]
    gps_fixes = first_dive.get('trajectory', {}).get('gps', [])
    if not gps_fixes:
        continue
    first_fix = gps_fixes[0]

    # Skip the file when the longitude or latitude is missing (None) or zero.
    lat = first_fix.get('lat')
    lon = first_fix.get('lon')
    if lat is None or lon is None or lat == 0 or lon == 0:
        continue

    # Binned temperature, salinity and pressure of the ascending profile.
    binned = first_dive.get('science', {}).get('ascending', {}).get('binned', {})
    temperature = binned.get('temperature', [])
    salinity = binned.get('salinity', [])
    pressure = np.array(binned.get('pressure', []))

    # All three variables must be present and of equal length (i.e. no missing values);
    # otherwise they cannot be combined row by row.
    n_samples = len(temperature)
    if n_samples == 0 or len(salinity) != n_samples or len(pressure) != n_samples:
        continue

    # Calculate depth based on pressure, using the gsw package. The result of
    # z_from_p is negated so that depth is positive downwards.
    depth = -1 * gsw.z_from_p(pressure, latitude)

    # One row per binned sample; position and date are repeated on every row.
    df = pd.DataFrame(
        {
            'date': [first_fix.get('datetime')] * n_samples,
            'lon': [lon] * n_samples,
            'lat': [lat] * n_samples,
            'temperature': temperature,
            'salinity': salinity,
            'pressure': pressure,
            'depth': depth,
        },
        columns=output_columns,
    )

    # Extract the row of data sampled closest to the specified desired depth.
    # NumPy's argsort places NaN last, so a NaN depth is only chosen if every depth is NaN.
    closest_position = np.argsort(np.abs(depth - desired_depth), kind='stable')[0]
    sampled_rows.append(df.iloc[[closest_position]])

# Combine the sampled rows; fall back to an empty table with the expected columns.
if sampled_rows:
    combined_df = pd.concat(sampled_rows, ignore_index=True)
else:
    combined_df = pd.DataFrame(columns=output_columns)

# Convert the 'date' column to the desired format
combined_df['date'] = pd.to_datetime(combined_df['date']).dt.strftime('%d/%m/%Y')
output_csv = os.path.join(directory, output_filename)  # Use the predefined variable for the output filename

# Save the combined DataFrame to an output CSV.
combined_df.to_csv(output_csv, index=False)
print(f"Combined data saved to {output_csv}")

Combined data saved to R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_alamo_f9250/ALAMO_F9250_240m.csv


## OMG APEX Float (F9184)
The following code processes the data recorded by OMG APEX float F9184 (https://podaac.jpl.nasa.gov/dataset/OMG_L1_FLOAT_APEX). This float was deployed in Disko Bay in September 2020. The float profiled autonomously, with data from each dive saved to a 'science log' CSV. To begin, the science logs are filtered, with profiles only kept if pressure, temperature and salinity data were provided. Using the filtered CSVs, depth is then calculated based upon pressure, using the Gibbs-SeaWater (GSW) Oceanographic Toolbox module. The longitude and latitude of each profile are extracted using the first GPS coordinate recorded (i.e. the start location). Each profile is then sampled at the desired depth (240 m), with a threshold of 1 m used. If there are multiple values within 1 m, the data provided at the closest depth to that desired is extracted.


In [11]:
# Filter each science log CSV down to the rows whose first column is one of the
# target message types, and write them under a fixed seven-column header.

# Define the directory containing the science log csvs.
input_directory = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/science_logs/'

# Define the directory to output the filtered csvs, containing temperature, pressure and salinity data.
output_folder = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/filtered_science_logs/'

# Rows are exported based on their first ('MESSAGE') column: only rows carrying at
# least pressure, temperature and salinity data are kept.
target_strings = ['LGR_CP_PTS', 'LGR_CP_PTSC', 'LGR_CP_PTSCI']

# Define the columns to label in each output csv.
desired_columns = ['MESSAGE', 'TIME', 'PRESSURE', 'TEMP', 'SALINITY', 'CONDUCTIVITY', 'INTERNAL_TEMP']
n_columns = len(desired_columns)

# Create the output folder if needed so that opening the output files cannot fail
# on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# LOOP THROUGH CSV FILES IN THE INPUT DIRECTORY
for filename in os.listdir(input_directory):
    if not filename.endswith('.csv'):
        continue
    input_filepath = os.path.join(input_directory, filename)
    output_filename = f'filtered_{filename}'
    output_filepath = os.path.join(output_folder, output_filename)
    with open(input_filepath, 'r') as infile, open(output_filepath, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        # Write the new column names
        writer.writerow(desired_columns)
        # Iterate through rows and write only if the first column matches the target strings
        for row in reader:
            if row and row[0] in target_strings:
                # Keep the first seven fields in order. A matching row with fewer
                # fields is padded with empty strings instead of raising IndexError,
                # so every output row has the same width as the header.
                padding = [''] * (n_columns - len(row))
                writer.writerow(row[:n_columns] + padding)

print("The CSV files have been filtered and saved.")

The CSV files have been filtered and saved.


In [3]:
# Append a DEPTH column, computed from pressure, to every filtered science log CSV.

# Define the directory containing the filtered science log csvs.
input_directory = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/filtered_science_logs/'

# Define the directory to output the filtered csvs, containing temperature, pressure and salinity data, with depth added.
output_directory = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/filtered_science_logs_depth/'

# Latitude for Disko Bay (approximately 69.2 degrees North)
latitude = 69.2

# Position of the pressure value within each row (third column).
PRESSURE_COLUMN = 2

# Create the output directory if needed so that opening the output files cannot fail.
os.makedirs(output_directory, exist_ok=True)

# Loop through filtered CSV files in the input directory
for filename in os.listdir(input_directory):
    if not (filename.startswith('filtered_') and filename.endswith('.csv')):
        continue
    input_filepath = os.path.join(input_directory, filename)
    output_filepath = os.path.join(output_directory, filename)
    with open(input_filepath, 'r') as infile, open(output_filepath, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        header = next(reader)
        n_input_columns = len(header)
        header.append('DEPTH')
        writer.writerow(header)
        for row in reader:
            if not row:
                continue  # blank line: nothing to convert
            # Pad short rows to the header width so DEPTH always lands in its own column.
            row.extend([''] * (n_input_columns - len(row)))
            try:
                pressure_dbar = float(row[PRESSURE_COLUMN])  # Pressure should be in dbar
            except (ValueError, IndexError):
                # Missing or non-numeric pressure: keep the row but leave DEPTH blank.
                depth = ''
            else:
                # The result of z_from_p is negated so that depth is positive downwards.
                depth = -1 * float(gsw.z_from_p(pressure_dbar, latitude))
            row.append(depth)
            writer.writerow(row)

print(" Depth has been calculated based on pressure. The updated CSVs have been saved.")

 Depth has been calculated based on pressure. The updated CSVs have been saved.


In [16]:
# Using the original science logs, record the first GPS row of each profile (i.e. the start location).

original_folder_path = "R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/science_logs/" # Path to the folder containing the original CSV files
output_file = "R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/f9184_lon_lat.csv" # Path for a csv containing the extracted longitude and latitude of each profile.
output_column_headings = ["FILENAME", "TIME", "LAT", "LON"] # Column headings for the output CSV
extracted_rows = [] # One entry per science log in which a GPS row was found

# Visit every CSV file in the original folder
for filename in os.listdir(original_folder_path):
    if not filename.endswith(".csv"):
        continue
    csv_file_path = os.path.join(original_folder_path, filename)

    with open(csv_file_path, 'r') as file:
        # First row whose first field is 'gps' (ignoring case and surrounding
        # whitespace); None when the file contains no such row.
        first_gps_row = next(
            (row for row in csv.reader(file) if row and row[0].strip().lower() == 'gps'),
            None,
        )

    if first_gps_row is not None:
        # Keep everything between the first and last field of the GPS row
        # (presumably TIME, LAT and LON, matching the headings above - TODO confirm).
        extracted_rows.append([filename] + first_gps_row[1:-1])

# Write the heading row followed by the extracted rows to the output CSV file
with open(output_file, 'w', newline='') as out_csv:
    csv_writer = csv.writer(out_csv)
    csv_writer.writerows([output_column_headings] + extracted_rows)

print("The longitude and latitude locations of each profile have been extracted and saved.")

The longitude and latitude locations of each profile have been extracted and saved.


In [20]:
# Extract the profile data within 1 m of the desired depth. If there are multiple values within 1 m, the data provided at the closest depth to that desired is extracted.
input_directory = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/filtered_science_logs_depth/' # Define the directory of the CSV inputs
save_directory = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/' # Define the directory for the CSV

# Read the csv file previously outputted containing the filename, lat and lon columns
second_csv_file = 'R:/JAKOBSHAVN/CODE/github/jakobshavn_isbrae/data/omg_apex_f9184/f9184_lon_lat.csv'
second_csv = pd.read_csv(second_csv_file)

# Define the desired depth at which variables should be sampled, and the tolerance (1m)
desired_depth = 240
depth_tolerance = 1

# Location lookup used for the merge below. The 'filename' key built in the loop has
# its '.csv' extension removed, so any trailing '.csv' is stripped from FILENAME as
# well; otherwise the keys never match and LAT/LON come out empty for every profile.
location_lookup = second_csv[['FILENAME', 'LAT', 'LON']].copy()
location_lookup['FILENAME'] = location_lookup['FILENAME'].astype(str).str.replace(r'\.csv$', '', regex=True)

# Create a combined CSV with variables extracted at the value closest to the desired depth.
dfs = []
for filename in os.listdir(input_directory):
    if not filename.endswith('.csv'):
        continue
    input_filename = filename.replace('filtered_', '').replace('.csv', '')
    df = pd.read_csv(os.path.join(input_directory, filename))

    # Rows whose depth lies within the tolerance of the desired depth (bounds inclusive).
    depth_mask = (df['DEPTH'] >= (desired_depth - depth_tolerance)) & \
                 (df['DEPTH'] <= (desired_depth + depth_tolerance))
    relevant_data = df[depth_mask]

    if relevant_data.empty:
        print(f"No data found within threshold for {filename}")
        continue

    # Of the rows within tolerance, keep the one closest to the desired depth.
    closest_index = (relevant_data['DEPTH'] - desired_depth).abs().idxmin()
    closest_depth_row = relevant_data.loc[closest_index].copy()
    closest_depth_row['filename'] = input_filename
    dfs.append(closest_depth_row)

if dfs:
    # Each list entry is one row (a Series); stack them into a table.
    combined_data = pd.concat(dfs, axis=1).T
    depth_str = str(desired_depth)
    depth_tolerance_str = str(depth_tolerance)
    # Attach the start location of each profile.
    combined_data = pd.merge(combined_data, location_lookup, left_on='filename', right_on='FILENAME', how='left')
    combined_data = combined_data.drop(['FILENAME'], axis=1)
    combined_data['DATE'] = pd.to_datetime(combined_data['TIME'], format='%Y%m%dT%H%M%S').dt.strftime('%d/%m/%Y')
    column_order = ['filename', 'LAT', 'LON', 'MESSAGE', 'TIME', 'DATE', 'PRESSURE', 'TEMP', 'SALINITY', 'CONDUCTIVITY', 'INTERNAL_TEMP', 'DEPTH']
    combined_data = combined_data[column_order]
    output_file = os.path.join(save_directory, f'OMG_APEX_F9184_{depth_str}m_{depth_tolerance_str}m.csv')
    combined_data.to_csv(output_file, index=False)
    # Report success only when a CSV was actually written.
    print(f"Data from OMG APEX F9184 sampled at {desired_depth} m and saved to a CSV.")
else:
    print(f"No data found within {depth_tolerance} meters of the desired depth.")

No data found within threshold for filtered_OMG_APEX_F9184_Dive_001.20200913T134432.science_log.csv
No data found within threshold for filtered_OMG_APEX_F9184_Dive_002.20200914T102956.science_log.csv
No data found within threshold for filtered_OMG_APEX_F9184_Dive_010.20200927T111350.science_log.csv
No data found within threshold for filtered_OMG_APEX_F9184_Dive_016.20201014T201700.science_log.csv
No data found within threshold for filtered_OMG_APEX_F9184_Dive_023.20201104T021208.science_log.csv
No data found within threshold for filtered_OMG_APEX_F9184_Dive_025.20201109T215120.science_log.csv
Data from OMG APEX F9184 sampled at {desired_depth} and saved to a CSV.


## Greenland Ecosystem Monitoring 