# ARGO study

This notebook demonstrates how we can use the concept of [data depth](https://arxiv.org/abs/1207.4988) to identify central and outlying points in the ARGO data.

Quite a bit of processing is needed to prepare the data for these analyses.  You should review the get_data.py, prep.py, and read.py scripts to understand how this is done.

In [None]:
import numpy as np
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from statsmodels.nonparametric.smoothers_lowess import lowess
import pandas as pd
from read import *

The temperature profiles are in the columns of the array 'temp', and the salinity profiles are in the columns of the array 'psal'.  Each profile is a vector of 100 values, corresponding to the pressures in the vector 'pressure'.

In [None]:
# Show the dimensions of the temperature and salinity arrays, one per line.
print(temp.shape, psal.shape, sep="\n")

The columns of 'temp' and 'psal' are paired (they are collected at the same location and time). 

In [None]:
# Trace one paired profile (column 133): temperature on the horizontal
# axis against salinity on the vertical axis.
plt.xlabel("Temperature (C)")
plt.ylabel("Salinity")
plt.grid(True)
plt.plot(temp[:, 133], psal[:, 133])

The mean temperature and mean salinity at each pressure level are plotted below.  These plots are not very informative about the overall distribution of the data.

In [None]:
# Clear the current figure, then draw the across-profile mean
# temperature at each pressure level.
plt.clf()
plt.plot(pressure, temp.mean(axis=1))
plt.grid(True)
plt.ylabel("Mean temperature (C)", size=15)
plt.xlabel("Pressure (dbar)", size=15)

In [None]:
# Clear the current figure, then draw the across-profile mean
# salinity at each pressure level.
plt.clf()
plt.plot(pressure, psal.mean(axis=1))
plt.grid(True)
plt.ylabel("Mean salinity (psal)", size=15)
plt.xlabel("Pressure (dbar)", size=15)

The depth calculations are somewhat expensive so we subset the data for speed.

In [None]:
# Draw 5000 distinct column positions at random.  No seed is set in this
# cell, so the subset differs between runs unless one is set elsewhere.
ii = np.random.choice(temp.shape[1], 5000, replace=False)

# Restrict the profiles and their metadata to the sampled columns.
tempx, psalx = temp[:, ii], psal[:, ii]
latx, lonx, dayx = lat[ii], lon[ii], day[ii]

Calculate the spatial depth of vector v relative to all columns of x.

In [None]:
def sdepth(v, x):
    """
    Calculate the spatial depth of the vector v relative to the columns of x.

    Parameters
    ----------
    v : ndarray of length p
        The point whose depth is computed.
    x : ndarray of shape (p, n)
        Reference sample; each column is one observation.

    Returns
    -------
    float
        One minus the Euclidean norm of the average unit vector pointing
        from v to the columns of x.
    """
    v = np.asarray(v)
    x = np.asarray(x)

    # Differences between every column of x and v, with their lengths.
    z = x - v[:, None]
    zn = np.sqrt((z**2).sum(0))

    # A column that coincides with v has no direction.  Dividing it by
    # infinity makes its contribution to the average exactly zero.
    zn[zn < 1e-12] = np.inf

    # Out-of-place division so that integer-valued inputs also work
    # (an in-place divide cannot store floats in an integer array).
    u = (z / zn).mean(1)
    return 1 - np.sqrt(np.sum(u**2))

Calculate the L2 depth of vector v relative to all columns of x.

In [None]:
def l2depth(v, x, scale=1e6):
    """
    Calculate the L2 depth of the vector v relative to the columns of x.

    Parameters
    ----------
    v : ndarray of length p
        The point whose depth is computed.
    x : ndarray of shape (p, n)
        Reference sample; each column is one observation.
    scale : float
        Constant multiplying the depth.  It rescales all depths by the
        same positive factor, so it does not change their ordering.

    Returns
    -------
    float
        scale / (1 + d), where d is the mean Euclidean distance from v
        to the columns of x.
    """
    z = x - v[:, None]
    d = np.sqrt((z**2).sum(0)).mean()
    return scale / (1 + d)

Estimate the band depth of vector v relative to all columns of x by averaging over m random draws (500 by default) of three columns each.

In [None]:
def bdepth(v, x, m=500):
    """
    Monte Carlo estimate of the band depth of v relative to the columns of x.

    Each of the m draws selects three distinct columns of x, forms the
    band between their coordinate-wise minimum and maximum, and records
    the fraction of coordinates of v lying inside that band.  The
    returned value is the average of these fractions over the m draws.
    """
    _, ncol = x.shape
    coverage = []
    for _ in range(m):
        cols = np.random.choice(ncol, 3, replace=False)
        band = x[:, cols]
        lower, upper = band.min(1), band.max(1)
        inside = np.logical_and(v >= lower, v <= upper)
        coverage.append(np.mean(inside))
    return sum(coverage, 0.0) / m

Calculate the depth of every column of 'x' relative to the other columns, using 'dfun' as the depth function.

In [None]:
def depths(x, dfun, progress=False):
    """
    Compute the depth of every column of x relative to all columns of x.

    dfun is called as dfun(column, x) once per column and must return a
    scalar.  When progress is true, the column position is printed every
    200 columns, followed by "done" at the end.  Returns a vector with
    one depth per column.
    """
    _, ncol = x.shape
    out = np.zeros(ncol)
    for j, column in enumerate(x.T):
        if progress and j % 200 == 0:
            print(f"{j}.", end="", flush=True)
        out[j] = dfun(column, x)
    if progress:
        print("done", flush=True)
    return out

The function 'depth_cut' partitions the depth values into bins based on quantiles of the depth statistics, and plots a small random selection of profiles from each bin.  The arguments to this function are as follows:

* 'dpt' is a vector of depths 
* 'profiles' is an array of profiles (each column is a profile)
* 'q' is the number of bins into which the profiles are partitioned 
* 'pressure' is the pressures at which the profiles are recorded
* 'ylab' is a label for the vertical axis of the plot

In [None]:
def depth_cut(dpt, profiles, q, pressure, ylab, nshow=10):
    """
    Plot a random selection of profiles from each depth quantile bin.

    Parameters
    ----------
    dpt : array-like
        Depth value of each profile.
    profiles : ndarray
        Profiles stored as columns, in the same order as `dpt`.
    q : int
        Number of quantile bins (passed to pandas.qcut).
    pressure : array-like
        Pressures at which the profiles are recorded (horizontal axis).
    ylab : str
        Vertical axis label.  "Temperature" selects the axis range
        (-5, 35); any other label selects (32, 37).
    nshow : int
        Maximum number of profiles drawn per bin.
    """
    dpq = pd.qcut(dpt, q)
    for (i, iv) in enumerate(dpq.categories):
        ii = np.flatnonzero(dpq == iv)

        # Sample without replacement so that no profile is drawn twice;
        # a bin with fewer than nshow profiles is shown in full.
        jj = np.random.choice(ii, min(nshow, len(ii)), replace=False)

        # plt.figure starts a fresh figure, so no plt.clf() is needed
        # first; calling it here would only create an extra empty figure.
        plt.figure(figsize=(6.4, 4.8))
        plt.grid(True)
        plt.title("Depth quantile %d %s" % (i + 1, str(iv)))
        for j in jj:
            plt.plot(pressure, profiles[:, j], "-", color="grey")
        if ylab == "Temperature":
            plt.ylim(-5, 35)
        else:
            plt.ylim(32, 37)
        plt.ylabel(ylab, size=15)
        plt.xlabel("Pressure", size=15)
        plt.show()

In [None]:
# Spatial depth of every subsampled profile, first for temperature and
# then for salinity, printing progress as the calculation runs.
dp_temp, dp_psal = [
    depths(profiles, sdepth, progress=True) for profiles in (tempx, psalx)
]

Visualize the profiles as arranged according to their depth values.

In [None]:
# Number of depth quantile bins used for the profile plots.
q = 10
depth_cut(dpt=dp_temp, profiles=tempx, q=q, pressure=pressure, ylab="Temperature")

In [None]:
# Same display for the salinity profiles, reusing the bin count q.
depth_cut(dpt=dp_psal, profiles=psalx, q=q, pressure=pressure, ylab="Salinity")

The function 'depth_correlates' aims to understand how the depth values are related to potential explanatory variables, of which we have three: latitude, longitude, and day of year.

The function plots the estimated conditional mean log depth relative to each explanatory variable.  The bands in these plots are +/- f times the mean absolute deviation from the conditional mean.

In [None]:
def depth_correlates(dp, lat, lon, day, title, f=2):
    """
    Plot the conditional mean of log depth against each explanatory variable.

    For latitude, longitude and day in turn, the log depth is smoothed
    against the variable with lowess, and the absolute residuals from
    that fit are smoothed in the same way.  The plot shows the fitted
    mean with a band of +/- f times the smoothed absolute residual.

    Parameters
    ----------
    dp : array-like
        Depth values (the logarithm is taken internally).
    lat, lon, day : array-like
        Explanatory variables, in the same order as `dp`.
    title : str
        Title placed on every plot.
    f : float
        Half-width of the band, in units of the smoothed absolute
        residual.
    """
    dpx = pd.DataFrame({"depth": np.log(dp), "lat": lat, "lon": lon, "day": day})
    vn = {"lat": "Latitude", "lon": "Longitude", "day": "Day"}
    for ev in ["lat", "lon", "day"]:

        # Keep only rows where both variables are finite so that the
        # fitted values stay aligned with the observations.
        xv = dpx[ev].to_numpy(dtype=float)
        yv = dpx["depth"].to_numpy(dtype=float)
        ok = np.isfinite(xv) & np.isfinite(yv)
        xv, yv = xv[ok], yv[ok]

        # A grid of 100 values covering the range of the
        # explanatory variable.
        xx = np.linspace(xv.min(), xv.max(), 100)

        # Use loess to regress depth on the explanatory variable.
        # return_sorted=False gives the fitted values in the order of
        # the input rows; the default output is sorted by the
        # explanatory variable and so cannot be subtracted from yv.
        dfit = lowess(yv, xv, return_sorted=False)

        # Use loess to regress the absolute depth residuals
        # on the explanatory variable
        aresid = np.abs(yv - dfit)
        rfit = lowess(aresid, xv, return_sorted=False)

        # Interpolate both fits onto the grid using one fitted value
        # per distinct x, which avoids zero-width intervals when the
        # explanatory variable has repeated values.
        xu, first = np.unique(xv, return_index=True)
        dh = np.interp(xx, xu, dfit[first])
        dq = np.interp(xx, xu, rfit[first])

        plt.clf()
        plt.title(title)
        plt.grid(True)
        plt.plot(xx, dh, "-")
        plt.fill_between(xx, dh-f*dq, dh+f*dq, color="grey", alpha=0.5)
        plt.xlabel(vn[ev], size=15)
        plt.ylabel("Depth", size=15)
        plt.show()

In [None]:
# Relate the temperature depths to latitude, longitude and day.
depth_correlates(dp=dp_temp, lat=latx, lon=lonx, day=dayx, title="Temperature")

In [None]:
# Relate the salinity depths to latitude, longitude and day.
depth_correlates(dp=dp_psal, lat=latx, lon=lonx, day=dayx, title="Salinity")

The following plots show the results only for the northern hemisphere.

In [None]:
# Positions, within the subsample, of profiles with positive latitude.
ii = np.flatnonzero(latx > 0)
depth_correlates(
    dp=dp_temp[ii],
    lat=latx[ii],
    lon=lonx[ii],
    day=dayx[ii],
    title="Northern hemisphere temperature",
)

In [None]:
# Uses the positive-latitude positions ii selected in the preceding cell.
depth_correlates(
    dp=dp_psal[ii],
    lat=latx[ii],
    lon=lonx[ii],
    day=dayx[ii],
    title="Northern hemisphere salinity",
)

The following plots show the results only for the southern hemisphere.

In [None]:
# Positions, within the subsample, of profiles with negative latitude.
ii = np.flatnonzero(latx < 0)
depth_correlates(
    dp=dp_temp[ii],
    lat=latx[ii],
    lon=lonx[ii],
    day=dayx[ii],
    title="Southern hemisphere temperature",
)

In [None]:
# Uses the negative-latitude positions ii selected in the preceding cell.
depth_correlates(
    dp=dp_psal[ii],
    lat=latx[ii],
    lon=lonx[ii],
    day=dayx[ii],
    title="Southern hemisphere salinity",
)

The function 'plot_depth_map' plots profile locations on a map, highlighting the profiles whose depth values are in the lowest 10% ("shallow") and those in the highest 10% ("deep").

In [None]:
def plot_depth_map(dp, title, lat=None, lon=None):
    """
    Map the locations of the shallowest and deepest profiles.

    Profiles whose depth rank is in the lowest 10% are drawn in blue
    ("shallow") and those in the highest 10% in red ("deep").

    Parameters
    ----------
    dp : array-like
        Depth value of each profile.
    title : str
        Title of the map.
    lat, lon : array-like, optional
        Coordinates of the profiles, in the same order as `dp`.  When
        omitted, the notebook-level arrays `latx` and `lonx` are used.
    """
    if lat is None:
        lat = latx
    if lon is None:
        lon = lonx

    # plt.figure starts a fresh figure, so no plt.clf() is needed first;
    # calling it here would only create an extra empty figure.
    plt.figure(figsize=(8, 7.25))
    ax = plt.axes([0.05, 0.05, 0.84, 0.88], projection=ccrs.PlateCarree(central_longitude=180))
    ax.coastlines()
    ax.set_extent([115, 290, -70, 60])

    # Rank of each depth value, scaled to [0, 1).
    qq = np.argsort(np.argsort(dp)) / len(dp)
    ii = np.flatnonzero(qq < 0.1)
    plt.scatter(lon[ii], lat[ii], s=8, color="blue", label="shallow",
                transform=ccrs.Geodetic())
    ii = np.flatnonzero(qq > 0.9)
    plt.scatter(lon[ii], lat[ii], s=8, color="red", label="deep",
                transform=ccrs.Geodetic())

    # Place a frameless legend at the right edge of the figure.
    ha, lb = plt.gca().get_legend_handles_labels()
    leg = plt.figlegend(ha, lb, loc="center right", handletextpad=0.01)
    leg.draw_frame(False)

    plt.title(title)
    plt.show()

In [None]:
# Map of the shallowest and deepest temperature profiles.
plot_depth_map(dp=dp_temp, title="Temperature")

In [None]:
# Map of the shallowest and deepest salinity profiles.
plot_depth_map(dp=dp_psal, title="Salinity")