# Subocean and CTD data merging + correction
The goal of this notebook is to show how the SubOcean data is currently being processed, and to provide a basis for discussing open questions.


In [1]:
import numpy as np
import pandas as pd
import os 
import glob
import matplotlib.pyplot as plt
import xarray as xr
import json
from pathlib import Path

In [2]:
# Root folders of the CTD (Level1) and SubOcean (raw) data. The station subfolder
# selected further down is joined onto both of these paths.
# NOTE(review): these are absolute paths on one user's machine; the notebook will
# not run elsewhere without editing them.
ctd_path = "C:/Users/cruz/Documents/SENSE/CTD_processing/data/Level1/Forel-GroupedStn"
subocean_path = "C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn"

In [3]:
# os.scandir yields entries in arbitrary order. Sort the paths so that the
# positional selection made later (folderlist[<index>]) picks the same station
# on every run and on every machine.
subfolders = sorted(f.path for f in os.scandir(ctd_path) if f.is_dir())

# Keep only the last path component, whichever separator ("/" or "\") was used.
folderlist = [folder.replace("\\", "/").rsplit("/", 1)[-1] for folder in subfolders]

print("Available subfolders:")
for sub_folder in folderlist:
    print(f"- {sub_folder}")
 

Available subfolders:
- StnF0
- StnF1
- StnF2
- StnF3
- StnF4
- StnF5


## Select one of these subfolders, one correction at a time

In [4]:
# Station to process, chosen by its position in folderlist (index 1 is the
# second folder listed above). Change the index to process another station.
subfolder = folderlist[1]

In [5]:
# Folder holding the CTD export of the selected station.
directory_ctd = os.path.join(ctd_path, subfolder)

# glob returns matches in no guaranteed order; sort so "the first CSV" is stable.
ctd_candidates = sorted(glob.glob(directory_ctd + "/*.csv"))
if not ctd_candidates:
    # Fail with a message that names the folder instead of a bare IndexError.
    raise FileNotFoundError(f"No CTD .csv file found in {directory_ctd}")

# Path (str) of the CTD file used in the rest of the notebook.
ctd_file = ctd_candidates[0]

In [6]:
# Show everything found in the CTD station folder (visual check only; the result is not stored).
glob.glob(directory_ctd + "/*")

['C:/Users/cruz/Documents/SENSE/CTD_processing/data/Level1/Forel-GroupedStn\\StnF1\\2024-07-04T024636 SBE0251267.csv']

In [7]:

# Folder holding the raw SubOcean exports of the selected station.
directory_subocean = os.path.join(subocean_path, subfolder)

# glob returns matches in no guaranteed order; sort so positional selection is stable.
subocean_files = sorted(glob.glob(directory_subocean + "/*.txt"))
subocean_logs = sorted(glob.glob(directory_subocean + "/*.log"))
print(subocean_files)
print(subocean_logs)

# Keep only the files whose path contains "SubOceanExperiment".
subocean_files = [path for path in subocean_files if "SubOceanExperiment" in path]

# Position of the experiment to process among the station's experiment files.
experiment_index = 1
subocean_file = subocean_files[experiment_index]

# Use the log that shares the selected experiment's file name, instead of
# indexing the (unfiltered) log list independently, which could pair the data
# with the log of a different experiment.
subocean_log = os.path.splitext(subocean_file)[0] + ".log"
if subocean_log not in subocean_logs:
    raise FileNotFoundError(f"No log file found for {subocean_file}")

['C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn\\StnF1\\SubOceanExperiment2024-07-04T10-39-29.txt', 'C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn\\StnF1\\SubOceanExperiment2024-07-04T15-39-33.txt']
['C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn\\StnF1\\SubOceanExperiment2024-07-04T10-39-29.log', 'C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn\\StnF1\\SubOceanExperiment2024-07-04T15-39-33.log']


In [8]:
# Load the tab-separated SubOcean export.
subocean_df = pd.read_csv(subocean_file, sep='\t')

# Row position of the highest hydrostatic pressure: rows before it form the
# downward profile, rows from it onwards (deepest sample included) the upward one.
max_pressure_subocean = subocean_df["Hydrostatic pressure (bar)"].argmax()
subocean_df_downard = subocean_df.iloc[:max_pressure_subocean]
subocean_df_upward = subocean_df.iloc[max_pressure_subocean:]

# Output paths: same file name with a direction suffix, and "raw" replaced by
# "Level0" in the path.
subocean_file_downward, subocean_file_upward = (
    subocean_file.replace(".txt", f"_{direction}.txt").replace("raw", "Level0")
    for direction in ("downward", "upward")
)

In [60]:
# Export the downward and upward profiles as tab-separated files.
for profile_df, profile_path in (
    (subocean_df_downard, subocean_file_downward),
    (subocean_df_upward, subocean_file_upward),
):
    # The Level0 folder may not exist yet; to_csv does not create it.
    os.makedirs(os.path.dirname(profile_path) or ".", exist_ok=True)
    profile_df.to_csv(profile_path, sep='\t', index=False)

Open the two datasets (subocean and ctd data)

In [61]:
# Display the path of the CTD file selected above.
ctd_file

'C:/Users/cruz/Documents/SENSE/CTD_processing/data/Level1/Forel-GroupedStn\\StnF1\\2024-07-04T024636 SBE0251267.csv'

In [9]:
# Load the CTD CSV, then convert the DataFrame to an xarray Dataset.
ctd_df = pd.read_csv(ctd_file)
ctd_ds = ctd_df.to_xarray()

In [10]:
# Name of the pressure column in the CTD file; it is renamed to "Pres" below.
# NOTE: this cell rebinds ctd_ds, so running it twice fails (the original
# column name no longer exists after the first run).
CTD_pressure_col = "pressure_dbar"
ctd_ds = ctd_ds.rename_vars({CTD_pressure_col:"Pres"})

In [12]:
# Display the maximum CTD pressure (visual check only; the result is not stored).
ctd_ds["Pres"].max()

# Section 1: Preprocessing

# Formatting for A2PS

We work with xarray because it is better suited to multidimensional data and interpolation, but it needs some standardization of the variable names.

We would like to match the CTD data with the SubOcean data; for that, we choose pressure as the matching coordinate. The two profiles might not have been recorded at the same time, but they can still be used.

In [64]:
# Make 'Pres' the dimension/coordinate of the CTD dataset and drop the original
# row index.
ctd_ds = (
    ctd_ds
    .swap_dims({'index': 'Pres'})
    .set_coords('Pres')
    .drop_vars('index')
)

We do not want a two way profile, so we separate the downward and upward profile

In [65]:
# Position of the highest CTD pressure; the samples before it form the downward profile.
max_pressure_ctd = ctd_ds["Pres"].argmax()
ctd_ds_downard = ctd_ds.isel(Pres=slice(None, max_pressure_ctd.values))

# Average the samples that share the same pressure value.
ctd_ds_downard_unique = ctd_ds_downard.groupby("Pres").mean()

# NOTE(review): the 0.21 factor presumably converts oxygen saturation into an
# oxygen percentage (O2 fraction of air) - confirm.
ctd_ds_downard_unique["Oxygen_percent"] = ctd_ds_downard_unique["oxygen_saturation_percent"]*0.21

# Rename the variables to the names written to the formatted CTD file.
ctd_ds_downard_unique = ctd_ds_downard_unique.rename_vars(
    {
        "Pres": "PrdE",
        "temperature_C": "Tv2C",
        "salinity_psu": "Sal2",
        "Oxygen_percent": "Sbeox2PS",
    }
)

In [66]:
# Table with the four columns written to the formatted CTD file (fresh 0..n-1 index).
ctd_A2PS = (
    ctd_ds_downard_unique[["Tv2C", "Sal2", "Sbeox2PS", "PrdE"]]
    .to_dataframe()
    .reset_index(drop=True)
)

# One row per pressure value, sorted by increasing pressure.
ctd_A2PS_not_duplicated = (
    ctd_A2PS
    .drop_duplicates(subset='PrdE', keep='first')
    .sort_values(by=["PrdE"], ascending=True)
)

# File name only. os.path.basename handles the platform's separators, whereas
# splitting on "\\" left the whole path in place for "/"-separated paths.
formatted_ctd_file = os.path.basename(ctd_file).replace(".csv", "_downard_formatted.asc")

# The formatted CTD file is stored in the station's SubOcean Level0 folder.
subocean_L0 = directory_subocean.replace("raw", "Level0")
CTD_file_path_downward = subocean_L0 + "/" + formatted_ctd_file

# The Level0 folder may not exist yet; to_csv does not create it.
os.makedirs(subocean_L0, exist_ok=True)
ctd_A2PS_not_duplicated.to_csv(CTD_file_path_downward, sep='\t', index=False)

In [67]:

# Upward CTD profile: everything from the highest-pressure sample (included) to the end.
ctd_ds_upward = ctd_ds.isel(Pres=slice(max_pressure_ctd.values, None))

Create the pressure grid

In [68]:
# Only build an upward CTD file when the upward profile holds more than one sample.
if len(ctd_ds_upward.Pres) > 1:
    # Average the samples that share the same pressure value.
    ctd_ds_upward_unique = ctd_ds_upward.groupby("Pres").mean()

    # NOTE(review): the 0.21 factor presumably converts oxygen saturation into an
    # oxygen percentage (O2 fraction of air) - confirm.
    ctd_ds_upward_unique["Oxygen_percent"] = ctd_ds_upward_unique["oxygen_saturation_percent"]*0.21

    # Rename the variables to the names written to the formatted CTD file.
    ctd_ds_upward_unique = ctd_ds_upward_unique.rename_vars(
        {
            "Pres": "PrdE",
            "temperature_C": "Tv2C",
            "salinity_psu": "Sal2",
            "Oxygen_percent": "Sbeox2PS",
        }
    )

    # Table with the four exported columns (fresh 0..n-1 index).
    ctd_A2PS = (
        ctd_ds_upward_unique[["Tv2C", "Sal2", "Sbeox2PS", "PrdE"]]
        .to_dataframe()
        .reset_index(drop=True)
    )

    # One row per pressure value, sorted by increasing pressure.
    ctd_A2PS_not_duplicated = (
        ctd_A2PS
        .drop_duplicates(subset='PrdE', keep='first')
        .sort_values(by=["PrdE"], ascending=True)
    )

    # File name only. os.path.basename handles the platform's separators, whereas
    # splitting on "\\" left the whole path in place for "/"-separated paths.
    formatted_ctd_file = os.path.basename(ctd_file).replace(".csv", "_upward_formatted.asc")
    subocean_L0 = directory_subocean.replace("raw", "Level0")
    CTD_file_path_upward = subocean_L0 + "/" + formatted_ctd_file

    # The Level0 folder may not exist yet; to_csv does not create it.
    os.makedirs(subocean_L0, exist_ok=True)
    ctd_A2PS_not_duplicated.to_csv(CTD_file_path_upward, sep='\t', index=False)
else:
    # No usable upward CTD profile. subocean_L0 and formatted_ctd_file still hold
    # the values set when the downward file was written, so the upward SubOcean
    # log will reference the downward CTD file (same result as before, now explicit).
    # NOTE(review): confirm this fallback is intended.
    CTD_file_path_upward = subocean_L0 + "/" + formatted_ctd_file

In [69]:
import json

# Read the JSON file
def update_experiment_title(json_path, ctd_filepath, new_title):
    """
    Write a copy of a JSON experiment log with updated title and CTD settings.

    The log is read from ``json_path``. The updated content is written to the
    path obtained from ``new_title`` by replacing ".txt" with ".log" and "raw"
    with "Level0"; the folder of that path is created if it does not exist.

    Args:
        json_path (str): Path to the JSON log file to read.
        ctd_filepath (str): Value stored under the "CTD filepath" key.
        new_title (str): Path of the profile ".txt" file. Its file name is
            stored as the experiment title, and the output path is derived
            from it.
    """
    # Read JSON file
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Use the file name only as title. Both "/" and "\" are treated as
    # separators so the result does not depend on how the path was built.
    title = new_title.replace("\\", "/").rsplit("/", 1)[-1]

    # Update both title fields and the CTD settings
    data["Title of the experiment"] = title
    data["Default title of the experiment"] = title
    data["CTD filepath"] = ctd_filepath
    data["CTD interpolation type"] = "Pressure"

    outpath = new_title.replace(".txt", ".log").replace("raw", "Level0")

    # open(..., 'w') does not create missing folders.
    out_dir = os.path.dirname(outpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write the updated JSON to the output path
    with open(outpath, 'w') as f:
        json.dump(data, f, indent=4)

# Write one log per direction, both based on the selected raw log. Each call
# stores the matching CTD file path and uses the profile file name as title.
json_path = subocean_log
update_experiment_title(json_path, CTD_file_path_upward, subocean_file_upward)
update_experiment_title(json_path, CTD_file_path_downward, subocean_file_downward)