In [1]:
import netCDF4 as nc
from netCDF4 import Dataset
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# gsw oceanic toolbox: http://www.teos-10.org/pubs/Getting_Started.pdf
import gsw
from scipy.io import loadmat
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt

import tqdm
import sys
import os
sys.path.append('../..')
from src.features.processing_func import mld
from src.features.processing_func import check_coords, calc_N2_kappa, calc_hab, arctic_calchab, calc_N2_kappa_sorted, mld
from src.features.calc_seaice import calc_SIC
from src.utils.directories import get_parent_directory
from src.features.feature_generation import processing_functions

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [2]:
# Project root; every interim NetCDF path below is resolved relative to it.
parent_dir = get_parent_directory()

# For each cruise: build the path to its interim NetCDF file, then open it
# with xarray. The path variables are kept because they are plain strings that
# later cells may still want to reference.

# ArcticMix
arctic_mix = os.path.join(parent_dir, "data/interim/arctic_mix.nc")
arctic_ds = xr.open_dataset(arctic_mix)

# ASBO-TEACOSI
asbo_nc = os.path.join(parent_dir, "data/interim/ASBO-TEACOSI_ds.nc")
asbo_ds = xr.open_dataset(asbo_nc)

# MOSAiC
mosaic_nc = os.path.join(parent_dir, "data/interim/mosaic_ds.nc")
mosaic_ds = xr.open_dataset(mosaic_nc)

# N-ICE
nice_nc = os.path.join(parent_dir, "data/interim/nice_ds.nc")
nice_ds = xr.open_dataset(nice_nc)

# HM
HM_nc = os.path.join(parent_dir, "data/interim/HM_ds.nc")
HM_ds = xr.open_dataset(HM_nc)

# Barneo 2007 / 2008
barneo2007_nc = os.path.join(parent_dir, "data/interim/barneo2007_ds.nc")
barneo2007_ds = xr.open_dataset(barneo2007_nc)
barneo2008_nc = os.path.join(parent_dir, "data/interim/barneo2008_ds.nc")
barneo2008_ds = xr.open_dataset(barneo2008_nc)

# KB2018616 / KH2018709
KB2018616_nc = os.path.join(parent_dir, "data/interim/KB2018616.nc")
KB2018616_ds = xr.open_dataset(KB2018616_nc)
KH2018709_nc = os.path.join(parent_dir, "data/interim/KH2018709.nc")
KH2018709_ds = xr.open_dataset(KH2018709_nc)

# ASCOS
ascos_nc = os.path.join(parent_dir, "data/interim/ascos_ds.nc")
ascos_ds = xr.open_dataset(ascos_nc)

In [3]:
# xarray is also imported in the first cell; repeated here so this cell can be
# run on its own.
import xarray as xr

# Rename `longitude`/`latitude` to the short `lon`/`lat` names, then store the
# same data again under the long names so that both spellings are available.
arctic_ds = arctic_ds.rename(longitude="lon", latitude="lat")
arctic_ds["latitude"] = arctic_ds["lat"]
arctic_ds["longitude"] = arctic_ds["lon"]

In [28]:
# Same treatment as ArcticMix: switch to the short `lon`/`lat` names and keep
# copies under `longitude`/`latitude` so both spellings exist on the dataset.
asbo_ds = asbo_ds.rename(longitude="lon", latitude="lat")
asbo_ds["latitude"] = asbo_ds["lat"]
asbo_ds["longitude"] = asbo_ds["lon"]

In [4]:
# Bathymetry (GEBCO 2022 subset).
# Note: despite its name, `GEBCO_ds` is the file *path*; the opened dataset is
# `bathy_ds`. The file name suggests the grid covers 63-80N, 170-130W only.
GEBCO_ds = os.path.join(
    parent_dir,
    "data/external/GEBCO/gebco_2022_n80.0_s63.0_w-170.0_e-130.0.nc",
)
bathy_ds = xr.open_dataset(GEBCO_ds)

In [5]:
# Sea-ice fraction (HadISST).
# `SI_HadISST` holds the path to the NetCDF file; `Hadi_SI` is the opened dataset.
SI_HadISST = os.path.join(
    parent_dir,
    "data/external/SI-area/HadISST_ice.nc",
)
Hadi_SI = xr.open_dataset(SI_HadISST)

## Add all features and combine all datasets into one dataframe
The features and plots are explained below.

In [6]:
# Tag every dataset with a `cruise` variable holding the name of its campaign.
# NOTE(review): KH2018709 is labelled "Nansen Legacy 2019" although its
# identifier contains 2018 - confirm that the label is intended.
for _dataset, _cruise_name in (
    (arctic_ds, "ArcticMix"),
    (nice_ds, "NICE-2015"),
    (mosaic_ds, "Mosaic"),
    (asbo_ds, "ASBO"),
    (HM_ds, "Haakon Mosby"),
    (barneo2007_ds, "IPY Barneo 2007"),
    (barneo2008_ds, "IPY Barneo 2008"),
    (KB2018616_ds, "Nansen Legacy 2018"),
    (KH2018709_ds, "Nansen Legacy 2019"),
    (ascos_ds, "ASCOS"),
):
    _dataset["cruise"] = _cruise_name

# Do not leave the loop helpers behind in the notebook namespace.
del _dataset, _cruise_name

In [7]:
# Columns passed to `processing_functions` for every cruise (this list
# includes `time`).
selected_columns = [
    "depth",
    "profile",
    "cruise",
    "latitude",
    "longitude",
    "S",
    "T",
    "log_eps",
    "log_N2",
    "dTdz",
    "dSdz",
    "hab",
    "Tu",
    "Tu_label",
    "time",
    "Rsubrho",
    "sea_ice_concentration",
    "MLDJ",
    "MLDI",
]

In [8]:
# Per the original note, the Mosaic dataset only includes the log of epsilon,
# stored under `eps`. Copy it to `log_eps`, the column name listed in
# `selected_columns`.
mosaic_ds["log_eps"] = mosaic_ds.eps

In [10]:
# N-ICE: run the shared `processing_functions` step, flatten the resulting
# dataset into a table with a fresh integer index, and cache it as a pickle.
nice_ds = processing_functions(nice_ds, selected_columns, Hadi_SI, bathy_ds)
nice_df = nice_ds.to_dataframe()
nice_df = nice_df.reset_index()
nice_df.to_pickle(
    os.path.join(parent_dir, "data/interim/nice_df.pkl")
)

In [8]:
# Mosaic: run the shared `processing_functions` step, flatten the resulting
# dataset into a table with a fresh integer index, and cache it as a pickle.
mosaic_ds = processing_functions(mosaic_ds, selected_columns, Hadi_SI, bathy_ds)
mosaic_df = mosaic_ds.to_dataframe()
mosaic_df = mosaic_df.reset_index()
mosaic_df.to_pickle(
    os.path.join(parent_dir, "data/interim/mosaic_df.pkl")
)

In [10]:
# Haakon Mosby: run the shared `processing_functions` step, flatten the
# resulting dataset into a table with a fresh integer index, and cache it.
HM_ds = processing_functions(HM_ds, selected_columns, Hadi_SI, bathy_ds)
HM_df = HM_ds.to_dataframe()
HM_df = HM_df.reset_index()
HM_df.to_pickle(
    os.path.join(parent_dir, "data/interim/HM_df.pkl")
)

In [29]:
# ASBO: run the shared `processing_functions` step, flatten the resulting
# dataset into a table with a fresh integer index, and cache it as a pickle.
asbo_ds = processing_functions(asbo_ds, selected_columns, Hadi_SI, bathy_ds)
asbo_df = asbo_ds.to_dataframe()
asbo_df = asbo_df.reset_index()
asbo_df.to_pickle(
    os.path.join(parent_dir, "data/interim/asbo_df.pkl")
)

In [10]:
# Barneo 2007/2008 and the two Nansen Legacy cruises: for each dataset, run the
# shared `processing_functions` step, flatten the processed dataset into a
# table with a fresh integer index, and cache that table as a pickle.

barneo2007_ds = processing_functions(barneo2007_ds, selected_columns, Hadi_SI, bathy_ds)
barneo2007_df = barneo2007_ds.to_dataframe().reset_index()
barneo2007_df.to_pickle(os.path.join(parent_dir, "data/interim/barneo2007_df.pkl"))

# Fix: the processed result used to be assigned to `barneo2008_df` and was then
# immediately overwritten by the *unprocessed* `barneo2008_ds.to_dataframe()`,
# so the pickle for Barneo 2008 never contained the processed data. Assign back
# to `barneo2008_ds`, exactly like every other cruise.
barneo2008_ds = processing_functions(barneo2008_ds, selected_columns, Hadi_SI, bathy_ds)
barneo2008_df = barneo2008_ds.to_dataframe().reset_index()
barneo2008_df.to_pickle(os.path.join(parent_dir, "data/interim/barneo2008_df.pkl"))

KB2018616_ds = processing_functions(KB2018616_ds, selected_columns, Hadi_SI, bathy_ds)
KB2018616_df = KB2018616_ds.to_dataframe().reset_index()
KB2018616_df.to_pickle(os.path.join(parent_dir, "data/interim/KB2018616_df.pkl"))

KH2018709_ds = processing_functions(KH2018709_ds, selected_columns, Hadi_SI, bathy_ds)
KH2018709_df = KH2018709_ds.to_dataframe().reset_index()
KH2018709_df.to_pickle(os.path.join(parent_dir, "data/interim/KH2018709_df.pkl"))

In [11]:
# ASCOS: run the shared `processing_functions` step, flatten the resulting
# dataset into a table with a fresh integer index, and cache it as a pickle.
ascos_ds = processing_functions(ascos_ds, selected_columns, Hadi_SI, bathy_ds)
ascos_df = ascos_ds.to_dataframe()
ascos_df = ascos_df.reset_index()
ascos_df.to_pickle(
    os.path.join(parent_dir, "data/interim/ascos_df.pkl")
)

In [9]:
# ArcticMix: same pipeline as the other cruises, but with an extra positional
# `True` passed to `processing_functions`.
# NOTE(review): the meaning of that flag is defined in
# src.features.feature_generation - confirm there.
arctic_ds = processing_functions(
    arctic_ds, selected_columns, Hadi_SI, bathy_ds, True
)
arctic_df = arctic_ds.to_dataframe()
arctic_df = arctic_df.reset_index()
arctic_df.to_pickle(
    os.path.join(parent_dir, "data/interim/arctic_df.pkl")
)

## Open saved dataframes

In [10]:
def _read_project_pickle(relative_path):
    """Load a pickled DataFrame stored at `relative_path` under `parent_dir`."""
    # Only use this for pickles written by this project; unpickling untrusted
    # files can execute arbitrary code.
    return pd.read_pickle(os.path.join(parent_dir, relative_path))


# Reload the per-cruise tables cached by the processing cells above.
arctic_df = _read_project_pickle("data/interim/arctic_df.pkl")  # no lon and lat
nice_df = _read_project_pickle("data/interim/nice_df.pkl")
mosaic_df = _read_project_pickle("data/interim/mosaic_df.pkl")
ascos_df = _read_project_pickle("data/interim/ascos_df.pkl")
asbo_df = _read_project_pickle("data/interim/asbo_df.pkl")  # no lat and lon
HM_df = _read_project_pickle("data/interim/HM_df.pkl")
barneo2007_df = _read_project_pickle("data/interim/barneo2007_df.pkl")
barneo2008_df = _read_project_pickle("data/interim/barneo2008_df.pkl")
KB2018616_df = _read_project_pickle("data/interim/KB2018616_df.pkl")
KH2018709_df = _read_project_pickle("data/interim/KH2018709_df.pkl")

In [13]:
# Overwrite the sea-ice concentration of every ArcticMix row with 0.
# NOTE(review): presumably because the ArcticMix profiles are treated as
# ice-free - confirm; this replaces whatever the column held before.
arctic_df["sea_ice_concentration"] = 0

# Keep the summary as the LAST expression of the cell: a notebook only renders
# the final expression, so calling describe() first silently discarded its
# output. Placed here it is displayed and reflects the column set above.
arctic_df.describe()

In [17]:
# Stack the per-cruise tables (all of their columns), drop every row that
# contains a missing value, and store the result.
# NOTE(review): the last cell of this notebook writes to the same
# 1406_ML.pkl path and therefore replaces the file produced here.
combined_df = pd.concat(
    [
        nice_df,
        arctic_df,
        mosaic_df,
        HM_df,
        asbo_df,
        barneo2007_df,
        barneo2008_df,
        KB2018616_df,
        KH2018709_df,
        ascos_df,
    ]
)
combined_nona = combined_df.dropna()
combined_nona.to_pickle(
    os.path.join(parent_dir, "data/processed/ml_ready/1406_ML.pkl")
)

In [30]:
# Per-cruise tables, in the order in which they are concatenated.
dfs = [
    nice_df,
    arctic_df,
    mosaic_df,
    HM_df,
    asbo_df,
    barneo2007_df,
    barneo2008_df,
    KB2018616_df,
    KH2018709_df,
    ascos_df,
]

In [31]:
# Columns kept in the final table. This list is the same as the one used for
# processing except that it does not contain `time`.
selected_columns = [
    "depth",
    "profile",
    "cruise",
    "latitude",
    "longitude",
    "S",
    "T",
    "log_eps",
    "log_N2",
    "dTdz",
    "dSdz",
    "hab",
    "Tu",
    "Tu_label",
    "Rsubrho",
    "sea_ice_concentration",
    "MLDJ",
    "MLDI",
]

In [33]:
# Restrict every per-cruise table to `selected_columns`, stack the subsets,
# drop all rows with a missing value, and write the final table to disk.
combined_df = pd.concat(
    [cruise_frame[selected_columns] for cruise_frame in dfs]
)
combined_nona = combined_df.dropna()
combined_nona.to_pickle(
    os.path.join(parent_dir, "data/processed/ml_ready/1406_ML.pkl")
)