# Subocean and CTD data merging + correction
The goal of this notebook is to show how the SubOcean data is currently being processed, and to provide a basis for asking questions about that processing.


In [1]:
import numpy as np
import pandas as pd
import os 
import glob
import matplotlib.pyplot as plt
import xarray as xr
import json
from pathlib import Path

In [2]:
# Root folders holding one sub-folder per station: CTD data and SubOcean data.
# NOTE(review): absolute, machine-specific paths — adjust them before running elsewhere.
ctd_path = "C:/Users/cruz/Documents/SENSE/CTD_processing/data/Level1/Forel-GroupedStn"
subocean_path = "C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn"

In [3]:
# Collect the station folders found directly under the CTD root.
# os.scandir yields entries in arbitrary order, so sort them: a station is later
# picked by its position in `folderlist`, which must be stable between runs.
subfolders = sorted(f.path for f in os.scandir(ctd_path) if f.is_dir())

print("Available subfolders:")
folderlist = []
for folder in subfolders:
    # Keep only the last path component, whichever separator ("/" or "\") is used.
    sub_folders = folder.replace("\\", "/").split("/")[-1]
    print(f"- {sub_folders}")
    folderlist.append(sub_folders)
 

Available subfolders:
- StnF0
- StnF1
- StnF2
- StnF3
- StnF4
- StnF5


## Select one of these subfolders, one correction at a time

In [4]:
# Station to process: the entry at position 3 of `folderlist`
# (change the index to process another station).
subfolder = folderlist[3]

In [5]:
# Folder of the selected station inside the CTD tree.
directory_ctd = os.path.join(ctd_path, subfolder)
# Use the first .csv file returned by glob for that folder as the CTD file.
ctd_file = glob.glob(directory_ctd + "/*.csv")[0]

In [6]:
# Show everything stored in the selected CTD station folder.
glob.glob(f"{directory_ctd}/*")

['C:/Users/cruz/Documents/SENSE/CTD_processing/data/Level1/Forel-GroupedStn\\StnF3\\20240706_1548_idronaut.csv']

In [7]:

directory_subocean = os.path.join(subocean_path, subfolder)
# sorted() makes the "[0]" picks below independent of the order glob returns.
subocean_file = sorted(glob.glob(directory_subocean + "/*.txt"))
subocean_log = sorted(glob.glob(directory_subocean + "/*.log"))
print(subocean_file)
print(subocean_log)
# The folder can hold other .txt files, so keep only the SubOcean experiment
# files. Match on the file name, not on the whole path, so that a directory
# name can never make an unrelated file pass the filter.
subocean_file = [x for x in subocean_file if "SubOceanExperiment" in os.path.basename(x)]
subocean_file = subocean_file[0]
subocean_log = subocean_log[0]

['C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn\\StnF3\\20240706_1548_idronaut.txt', 'C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn\\StnF3\\SubOceanExperiment2024-07-06T15-20-18.txt']
['C:/Users/cruz/Documents/SENSE/SubOcean/data/raw/Forel-GroupedStn\\StnF3\\SubOceanExperiment2024-07-06T15-20-18.log']


In [8]:
subocean_df = pd.read_csv(subocean_file, sep='\t')
# Position of the highest hydrostatic pressure: rows before it form the
# downward profile, rows from it onwards form the upward profile.
max_pressure_subocean = subocean_df["Hydrostatic pressure (bar)"].argmax()
subocean_df_downard = subocean_df.iloc[:max_pressure_subocean]
subocean_df_upward = subocean_df.iloc[max_pressure_subocean:]
# Output paths: same file name with a direction suffix, "raw" replaced by "Level0".
subocean_file_downward, subocean_file_upward = (
    subocean_file.replace(".txt", f"_{direction}.txt").replace("raw", "Level0")
    for direction in ("downward", "upward")
)

In [33]:
# Export the downward profile as a tab-separated file.
# The target folder is derived from the raw path and may not exist yet;
# create it first so the write does not fail on a fresh Level0 tree.
os.makedirs(os.path.dirname(subocean_file_downward) or ".", exist_ok=True)
subocean_df_downard.to_csv(subocean_file_downward, sep='\t', index=False)

In [None]:

# Export the upward profile as a tab-separated file.
# The target folder is derived from the raw path and may not exist yet;
# create it first so the write does not fail on a fresh Level0 tree.
os.makedirs(os.path.dirname(subocean_file_upward) or ".", exist_ok=True)
subocean_df_upward.to_csv(subocean_file_upward, sep='\t', index=False)
# Display the downward profile for a visual check.
subocean_df_downard

Unnamed: 0,Date,Time,Date calibrated,Time calibrated,[CH4] dissolved with water vapour (ppm),[CH4] dissolved with water vapour (nmol/L),[CH4] dissolved with constant dry gas flow (ppm),[CH4] dissolved with constant dry gas flow (nmol/L),[C2H6] dissolved (ppm),Delta 13 CH4 (per-mille),...,Error Standard,Ringdown Time (microSec),Box Temperature (Degree Celsius),Box Pressure (mbar),PWM Cellule Temperature,PWM Cellule Pressure,Laser Temperature (Degree Celsius),Laser Flux,Norm Signal,Value Max
0,2024/07/06,15:20:19,2024/07/06,15:20:35,7.03563,7.80050,81.77140,90.66110,0.057837,163.4310,...,0.0145,13.165,25.323,4.319,34.01,31.99,15.897,54.871,1.778,0.7042
1,2024/07/06,15:20:20,2024/07/06,15:20:36,7.70552,8.54322,89.59380,99.33390,0.028910,377.1150,...,0.0129,13.165,25.323,3.831,34.01,31.99,15.897,54.871,1.778,0.7041
2,2024/07/06,15:20:21,2024/07/06,15:20:37,7.54147,8.36133,87.66680,97.19740,0.048186,-66.1569,...,0.0107,13.135,25.324,3.392,34.57,31.99,15.893,54.871,1.778,0.7040
3,2024/07/06,15:20:22,2024/07/06,15:20:38,7.45277,8.26299,86.60620,96.02150,0.077123,-63.5581,...,0.0132,13.150,25.324,3.733,33.91,31.99,15.893,54.871,1.778,0.7039
4,2024/07/06,15:20:23,2024/07/06,15:20:39,7.11825,7.89210,82.76630,91.76410,0.062631,-29.2682,...,0.0179,13.135,25.326,4.197,33.95,31.99,15.900,54.871,1.778,0.7039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4132,2024/07/06,16:29:16,2024/07/06,16:29:41,14.84290,16.45650,14.43630,16.00570,3.395850,859.1270,...,0.0294,12.370,18.828,3.636,39.79,31.99,15.890,54.852,1.823,0.6793
4133,2024/07/06,16:29:17,2024/07/06,16:29:42,13.89610,15.40680,13.60190,15.08060,4.094250,727.6880,...,0.0346,12.354,18.828,4.002,39.79,31.99,15.895,54.852,1.823,0.6795
4134,2024/07/06,16:29:18,2024/07/06,16:29:43,7.46543,8.27702,7.14801,7.92510,2.198070,840.3070,...,0.0289,12.324,18.830,4.197,39.62,31.99,15.893,54.856,1.824,0.6797
4135,2024/07/06,16:29:19,2024/07/06,16:29:44,11.77920,13.05970,12.53700,13.89990,1.701090,-213.5220,...,0.0383,12.324,18.834,3.440,39.62,31.99,15.890,54.852,1.824,0.6802


In [31]:
# Display the upward profile for a visual check.
subocean_df_upward

Unnamed: 0,Date,Time,Date calibrated,Time calibrated,[CH4] dissolved with water vapour (ppm),[CH4] dissolved with water vapour (nmol/L),[CH4] dissolved with constant dry gas flow (ppm),[CH4] dissolved with constant dry gas flow (nmol/L),[C2H6] dissolved (ppm),Delta 13 CH4 (per-mille),...,Error Standard,Ringdown Time (microSec),Box Temperature (Degree Celsius),Box Pressure (mbar),PWM Cellule Temperature,PWM Cellule Pressure,Laser Temperature (Degree Celsius),Laser Flux,Norm Signal,Value Max
4137,2024/07/06,16:29:21,2024/07/06,16:29:46,7.43452,8.24275,8.00657,8.87700,3.43299,-443.505,...,0.0358,12.354,18.832,4.343,39.80,31.99,15.896,54.852,1.825,0.6795
4138,2024/07/06,16:29:22,2024/07/06,16:29:47,6.39018,7.08489,7.22629,8.01189,2.27634,-1580.250,...,0.0371,12.335,18.830,4.197,39.29,31.99,15.891,54.849,1.825,0.6789
4139,2024/07/06,16:29:23,2024/07/06,16:29:48,6.00602,6.65896,6.71524,7.44529,2.26744,954.239,...,0.0391,12.349,18.830,3.758,39.29,31.99,15.895,54.849,1.825,0.6793
4140,2024/07/06,16:29:24,2024/07/06,16:29:49,14.15870,15.69800,14.06270,15.59150,3.93578,266.508,...,0.0373,12.361,18.830,10.346,39.12,31.99,15.896,54.852,1.826,0.6786
4141,2024/07/06,16:29:25,2024/07/06,16:29:50,9.63003,10.67690,11.13200,12.34220,2.72465,802.182,...,0.0440,12.371,18.832,4.172,40.84,31.99,15.891,54.856,1.830,0.6790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6584,2024/07/06,17:10:11,2024/07/06,17:10:27,,,,,,,...,0.0043,14.416,19.534,3.977,43.37,31.99,15.896,54.856,2.428,1.0490
6585,2024/07/06,17:10:12,2024/07/06,17:10:28,,,,,,,...,0.0040,14.408,19.541,4.148,42.00,31.99,15.897,54.856,2.428,1.0491
6586,2024/07/06,17:10:13,2024/07/06,17:10:29,,,,,,,...,0.0055,14.404,19.548,4.612,43.38,31.99,15.896,54.856,2.428,1.0487
6587,2024/07/06,17:10:14,2024/07/06,17:10:30,,,,,,,...,0.0043,14.404,19.555,4.197,43.21,31.99,15.893,54.856,2.428,1.0486


Open the two datasets (subocean and ctd data)

In [11]:
# Read the CTD table, then build the xarray Dataset the rest of the notebook works on.
ctd_df = pd.read_csv(ctd_file)
ctd_ds = xr.Dataset.from_dataframe(ctd_df)

In [12]:
# Name of the pressure column in the CTD file; it is renamed to "Pres" below.
CTD_pressure_col = "pressure_dbar"
ctd_ds = ctd_ds.rename_vars(name_dict={CTD_pressure_col: "Pres"})

# Section 1: Preprocessing

# Formatting for A2PS

We work with xarray because it is better suited to multidimensional data and interpolation, but it needs some standardization of variable names.

We would like to match the CTD and the SubOcean data; to do so, we choose pressure as the matching coordinate. The two profiles might not have been taken at the same time, but they can still be used.

In [13]:
# Make 'Pres' the dimension and a coordinate of the CTD dataset, then drop the original 'index' variable
ctd_ds = (
    ctd_ds
    .swap_dims({'index': 'Pres'})
    .set_coords('Pres')
    .drop_vars('index')
)

We do not want a two-way profile, so we separate the downward and upward profiles.

In [14]:
# Position of the highest pressure; everything before it is the downward cast.
max_pressure_ctd = ctd_ds["Pres"].argmax()
ctd_ds_downard = ctd_ds.isel(Pres=slice(None, max_pressure_ctd.values))
# Average samples sharing the same pressure, derive the oxygen variable and
# switch to the short variable names (PrdE, Tv2C, Sal2, Sbeox2PS) exported later.
# NOTE(review): 0.21 presumably converts oxygen saturation to an O2 percentage
# (21 % O2 in air) — confirm.
ctd_ds_downard_unique = (
    ctd_ds_downard
    .groupby("Pres")
    .mean()
    .assign(Oxygen_percent=lambda ds: ds["oxygen_saturation_percent"] * 0.21)
    .rename_vars({
        "Pres": "PrdE",
        "temperature_C": "Tv2C",
        "salinity_psu": "Sal2",
        "Oxygen_percent": "Sbeox2PS",
    })
)

In [15]:
# Flat table with the four exported columns; the pressure index is discarded
# because pressure is already available as the "PrdE" column.
ctd_A2PS = ctd_ds_downard_unique[["Tv2C", "Sal2", "Sbeox2PS", "PrdE"]].to_dataframe()
ctd_A2PS = ctd_A2PS.reset_index(drop=True)
# Keep one row per pressure value, ordered by increasing pressure.
ctd_A2PS_not_duplicated = ctd_A2PS.drop_duplicates(subset='PrdE', keep='first')
ctd_A2PS_not_duplicated = ctd_A2PS_not_duplicated.sort_values(by=["PrdE"], ascending=True)
# os.path.basename strips the directory whatever separator the path uses
# (splitting on "\\" alone only works for Windows-style paths).
formatted_ctd_file = os.path.basename(ctd_file.replace(".csv", "_downard_formatted.asc"))
subocean_L0 = directory_subocean.replace("raw", "Level0")
# The Level0 station folder may not exist yet; to_csv would fail without it.
os.makedirs(subocean_L0, exist_ok=True)
CTD_file_path_downward = subocean_L0 + "/" + formatted_ctd_file
ctd_A2PS_not_duplicated.to_csv(CTD_file_path_downward, sep='\t', index=False)

In [16]:
# Upward cast: everything from the highest-pressure sample onwards.
ctd_ds_upward = ctd_ds.isel(Pres=slice(max_pressure_ctd.values, None))
if len(ctd_ds_upward.Pres) > 1:
    # Same processing as for the downward cast: average samples sharing a
    # pressure, derive the oxygen variable, switch to the short variable names.
    ctd_ds_upward_unique = ctd_ds_upward.groupby("Pres").mean()
    ctd_ds_upward_unique["Oxygen_percent"] = ctd_ds_upward_unique["oxygen_saturation_percent"]*0.21
    ctd_ds_upward_unique = ctd_ds_upward_unique.rename_vars({"Pres":"PrdE"})
    ctd_ds_upward_unique = ctd_ds_upward_unique.rename_vars({"temperature_C":"Tv2C"})
    ctd_ds_upward_unique = ctd_ds_upward_unique.rename_vars({"salinity_psu":"Sal2"})
    ctd_ds_upward_unique = ctd_ds_upward_unique.rename_vars({"Oxygen_percent":"Sbeox2PS"})
    ctd_A2PS = ctd_ds_upward_unique[["Tv2C", "Sal2", "Sbeox2PS", "PrdE"]].to_dataframe()
    ctd_A2PS = ctd_A2PS.reset_index(drop=True)
    # Keep one row per pressure value, ordered by increasing pressure.
    ctd_A2PS_not_duplicated = ctd_A2PS.drop_duplicates(subset='PrdE', keep='first')
    ctd_A2PS_not_duplicated = ctd_A2PS_not_duplicated.sort_values(by=["PrdE"], ascending=True)
    # os.path.basename strips the directory whatever separator the path uses
    # (splitting on "\\" alone only works for Windows-style paths).
    formatted_ctd_file = os.path.basename(ctd_file.replace(".csv", "_upward_formatted.asc"))
    subocean_L0 = directory_subocean.replace("raw", "Level0")
    # The Level0 station folder may not exist yet; to_csv would fail without it.
    os.makedirs(subocean_L0, exist_ok=True)
    CTD_file_path_upward = subocean_L0 + "/" + formatted_ctd_file
    ctd_A2PS_not_duplicated.to_csv(CTD_file_path_upward, sep='\t', index=False)
else:
    # No usable upward CTD cast (at most one sample): fall back to the downward
    # CTD file. The previous code produced the same path implicitly, through
    # variables left over from the downward cell.
    # NOTE(review): confirm that this fallback is the intended behaviour.
    CTD_file_path_upward = CTD_file_path_downward

In [17]:
import json

# Read the JSON file
def update_experiment_title(json_path, ctd_filepath, new_title):
    """
    Write a copy of a JSON experiment log with updated title and CTD settings.

    The JSON document is read from ``json_path``. Its title fields are set to
    the file name of ``new_title``, "CTD filepath" is set to ``ctd_filepath``
    and "CTD interpolation type" to "Pressure". The result is written to the
    path obtained from ``new_title`` by replacing ".txt" with ".log" and "raw"
    with "Level0"; missing parent folders of that path are created.

    Args:
        json_path (str): Path to the JSON file to read
        ctd_filepath (str): Value stored under "CTD filepath"
        new_title (str): Path of the profile file; its file name becomes the
            experiment title and the output path is derived from it
    """
    # Read JSON file
    with open(json_path, 'r') as f:
        data = json.load(f)

    # File name only, whichever separator ("/" or "\") the path uses.
    title = new_title.replace("\\", "/").rsplit("/", 1)[-1]

    # Update both title fields
    data["Title of the experiment"] = title
    data["Default title of the experiment"] = title
    data["CTD filepath"] = ctd_filepath
    data["CTD interpolation type"] = "Pressure"

    outpath = new_title.replace(".txt", ".log").replace("raw", "Level0")
    # The output folder may not exist yet; open(..., 'w') would fail without it.
    os.makedirs(os.path.dirname(outpath) or ".", exist_ok=True)
    # Write the updated JSON to the derived output path
    with open(outpath, 'w') as f:
        json.dump(data, f, indent=4)

# Write one updated log per profile direction (upward first, then downward),
# each one pointing at the CTD file prepared for that direction.
json_path = subocean_log
for ctd_file_for_profile, profile_file in (
    (CTD_file_path_upward, subocean_file_upward),
    (CTD_file_path_downward, subocean_file_downward),
):
    update_experiment_title(json_path, ctd_file_for_profile, profile_file)