# Dimension reduction analyses of the ARGO data

The Argo data consist of paired observations of temperature and salinity, measured simultaneously at the same location at a series of depths beneath the ocean surface.  Each observation is comprised of two 100-dimensional vectors, one of temperature and one of salinity, each viewed as a function of pressure.  These vectors may be referred to as "profiles".  Here we use dimension reduction methods including Principal Components Analysis (PCA), Canonical Correlation Analysis (CCA), and Sliced Inverse Regression (SIR) to better understand these profiles.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from scipy.interpolate import interp1d
from statsmodels.nonparametric.smoothers_lowess import lowess
import statsmodels.api as sm
from statsmodels.regression.dimred import SIR
from read import *

The observations (profiles) are in the columns and the pressure levels are in the rows:

In [None]:
# Show the dimensions of the temperature and salinity matrices, one per line.
print(temp.shape, psal.shape, sep="\n")

Construct a variable called 'day' that is the day within the year (counting from January 1st).

In [None]:
# Convert the raw dates to pandas datetimes so that calendar fields are available.
date = pd.to_datetime(date)
# Day within the year for each profile (January 1st is day 1), as a numpy array.
day = np.asarray(date.dayofyear)

Get the [circular mean](https://en.wikipedia.org/wiki/Circular_mean) of all longitude values.  This isn't related to dimension reduction but is useful to know for perspective.

In [None]:
# Place each longitude (in degrees) on the unit circle, average the sine and
# cosine coordinates, and convert the angle of the averaged point back to degrees.
lon_sin, lon_cos = np.sin(2*np.pi*lon/360), np.cos(2*np.pi*lon/360)
cmean_lon = np.arctan2(np.mean(lon_sin), np.mean(lon_cos)) * 360 / (2*np.pi)
cmean_lon

Translate the longitude values so that the Pacific ocean doesn't wrap around the origin.

In [None]:
# Shift longitudes by 60 degrees and wrap the result back into [0, 360).
lonx = np.mod(lon + 60, 360)

Create a matrix of observed variables that describe the location and time at which each profile was obtained.

In [None]:
# One row per profile; the columns are latitude, raw longitude,
# translated longitude and day of year, stored as floats.
n = len(lat)
Y = np.column_stack((lat, lon, lonx, day)).astype(np.float64)
Y.shape

Plot the profile locations in the actual latitude/longitude coordinates.  Longitude 0 is the prime meridian (passes through England).

In [None]:
# Scatter of profile locations in raw coordinates; the points are drawn
# semi-transparently so that dense regions stand out.
plt.clf()
ax = plt.gca()
ax.grid(True)
ax.plot(lon, lat, "o", rasterized=True, alpha=0.1)
ax.set_xlabel("Longitude", size=15)
ax.set_ylabel("Latitude", size=15)
ax.set_title("Float positions")

Plot the profile locations using translated longitudes.

In [None]:
# Same scatter as for the raw coordinates, but using the translated longitudes.
plt.clf()
ax = plt.gca()
ax.grid(True)
ax.plot(lonx, lat, "o", rasterized=True, alpha=0.1)
ax.set_xlabel("Longitude (translated)", size=15)
ax.set_ylabel("Latitude", size=15)
ax.set_title("Float positions")

In [None]:
def get_pcs(x):
    """
    Get the principal components of the data in 'x', treating the rows as the
    variables and the columns as the observations.

    Parameters
    ----------
    x : ndarray, shape (variables, observations)
        The data matrix.  It is not modified.

    Returns
    -------
    xm : ndarray
        The mean of each variable (row) of x.
    pcw : ndarray
        Eigenvalues of the covariance matrix, in decreasing order.
    pcv : ndarray
        Eigenvectors (loadings) in the columns, ordered to match pcw and
        sign-flipped so that no column has more negative than non-negative
        entries.
    scores : ndarray, shape (observations, min(5, variables))
        Projections of the centered observations onto the first five
        loading vectors.
    """
    xm = x.mean(1)

    # Center out-of-place.  An in-place subtraction of the (floating point)
    # means fails with a casting error when x has an integer dtype.
    xc = x - xm[:, None]

    cc = np.cov(xc)
    pcw, pcv = np.linalg.eigh(cc)

    # eigh returns eigenvalues in ascending order; reorder the PC's so that
    # the dominant factors are first.
    order = np.argsort(pcw)[::-1]
    pcw = pcw[order]
    pcv = pcv[:, order]

    # For interpretability flip the PC's that are mostly negative.
    mostly_negative = (pcv < 0).sum(0) > (pcv >= 0).sum(0)
    pcv[:, mostly_negative] *= -1

    # Get the PC scores for the leading (at most five) components.
    scores = np.dot(xc.T, pcv[:, 0:5])

    return xm, pcw, pcv, scores

In [None]:
def pcplot(j, mean, pcv, scores, label):
    """
    Generate some plots that aid in interpreting the j^th PC factor.

    Three kinds of figures are shown: the loading vector against pressure,
    the mean profile together with the mean shifted by +/- one standard
    deviation of the scores along the loading, and smoothed curves of the
    score against latitude, longitude and day of year.

    Parameters
    ----------
    j : int
        Zero-based index of the component to display.
    mean : array
        Mean profile, aligned with the module-level 'pressure' grid.
    pcv : array
        Loadings, one component per column.
    scores : array
        PC scores, one row per profile and one column per component.
    label : str
        Name of the measured quantity, used in titles and axis labels.

    The module-level variables 'pressure' and 'Y' are read.
    """
    loading = pcv[:, j]
    score = scores[:, j]

    # Plot the PC loadings
    plt.clf()
    plt.grid(True)
    plt.plot(pressure, loading)
    plt.gca().set_xlabel("Pressure", size=15)
    plt.gca().set_ylabel("%s PC %d loading" % (label.title(), j + 1), size=15)
    if loading.min() > 0:
        plt.gca().set_ylim(ymin=0)
    plt.show()

    # Plot the mean +/- 1 SD of the loading pattern
    plt.clf()
    plt.title(label.title())
    plt.grid(True)
    s = score.std()
    for f, color in [(-1, "blue"), (0, "black"), (1, "red")]:
        plt.plot(pressure, mean + f*s*loading, color=color)
    plt.gca().set_xlabel("Pressure (Dbar)", size=15)
    plt.gca().set_ylabel("Mean %s +/- PC %d loading" % (label, j + 1), size=15)
    plt.show()

    # Plot the conditional mean PC score against an observed variable,
    # showing the conditional mean plus/minus two times the conditional
    # mean absolute deviation.
    fn = ["Latitude", "Longitude", "Longitude", "Day"]
    width = 2
    for k in [0, 1, 3]:
        x = Y[:, k]
        xx = np.linspace(x.min(), x.max(), 100)
        delta = 0.01*np.ptp(x)

        # return_sorted=False yields fitted values in the original row order,
        # so that they line up with the scores when forming residuals.  (The
        # default output is sorted by x, which would pair each score with the
        # fitted value of a different profile.)
        fit = lowess(score, x, delta=delta, return_sorted=False)
        resid = score - fit
        spread = lowess(np.abs(resid), x, delta=delta, return_sorted=False)

        # interp1d sorts its inputs by x internally.
        yy = interp1d(x, fit)(xx)
        yr = interp1d(x, spread)(xx)
        ymx = (yy + width*yr).max()
        ymn = (yy - width*yr).min()
        plt.clf()
        plt.grid(True)
        plt.plot(xx, yy, "-", color="red")
        plt.plot(xx, yy-width*yr, "-", color="grey")
        plt.plot(xx, yy+width*yr, "-", color="grey")
        plt.gca().set_ylim([ymn, ymx])
        plt.gca().set_xlabel(fn[k], size=15)
        plt.gca().set_ylabel("%s PC %d score" % (label.title(), j + 1), size=15)
        plt.show()

Calculate the principal components for temperature and salinity.

In [None]:
# Each call returns (mean profile, eigenvalues, loadings, scores); see get_pcs.
tempmean, tempw, tempv, tempscores = get_pcs(temp)
psalmean, psalw, psalv, psalscores = get_pcs(psal)

Plot the mean profiles for temperature and salinity.

In [None]:
# Draw the mean profile of each quantity against pressure, one figure each.
for ti, da in (("Temperature", tempmean), ("Salinity", psalmean)):
    plt.clf()
    ax = plt.gca()
    ax.grid(True)
    ax.plot(pressure, da)
    ax.set_xlabel("Pressure (dBar)", size=15)
    ax.set_ylabel("Mean %s" % ti, size=15)
    plt.show()

Generate a sequence of plots that help us understand the temperature profiles through PC analysis.

In [None]:
# Interpretive plots for the three leading temperature components.
for j in range(3):
    pcplot(j, tempmean, tempv, tempscores, "temperature")

Generate a sequence of plots that help us understand the salinity data through PC analysis.

In [None]:
# Interpretive plots for the three leading salinity components.
for j in range(3):
    pcplot(j, psalmean, psalv, psalscores, "salinity")

Another way to understand the meaning of the PCA results is to identify the geographic locations where profiles score at the high end or the low end of the distribution of scores for each PC.  These plots are shown below.

In [None]:
def plot_pc_map(j, tempscores, Y, title):
    """
    Map the profiles whose score on component j is in the top or bottom 10%.

    Parameters
    ----------
    j : int
        Zero-based index of the component.
    tempscores : array
        PC scores, one row per profile and one column per component.  Despite
        the name, scores for any quantity may be passed.
    Y : array
        Location matrix; column 0 is used as the latitude and column 1 as the
        longitude of each profile.
    title : str
        Text placed in the figure title ahead of the component number.
    """
    # plt.figure creates a fresh current figure.  No plt.clf() is issued
    # first, since that would only create (and later display) an extra
    # empty figure.
    plt.figure(figsize=(9, 7.25))
    ax = plt.axes([0.05, 0.05, 0.84, 0.88], projection=ccrs.PlateCarree(central_longitude=180))
    ax.coastlines()
    ax.set_extent([115, 290, -70, 60])

    sc = tempscores[:, j]
    groups = [
        ("Top 10%", "red", np.flatnonzero(sc >= np.quantile(sc, 0.9))),
        ("Bottom 10%", "blue", np.flatnonzero(sc <= np.quantile(sc, 0.1))),
    ]
    for lab, color, jj in groups:
        plt.scatter(Y[jj, 1], Y[jj, 0], s=8, label=lab, color=color, transform=ccrs.Geodetic(), rasterized=True)

    ha, lb = plt.gca().get_legend_handles_labels()
    leg = plt.figlegend(ha, lb, loc="center right")
    leg.draw_frame(False)

    plt.title("%s (component %d)" % (title, j + 1))
    plt.show()

In [None]:
# Maps of extreme scores for the three leading temperature components.
for j in (0, 1, 2):
    plot_pc_map(j, tempscores, Y, "Temperature")

In [None]:
# Maps of extreme scores for the three leading salinity components.
for j in (0, 1, 2):
    plot_pc_map(j, psalscores, Y, "Salinity")

## Canonical Correlation Analysis

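CCA looks specifically at how temperature and salinity are correlated.  It identifies linear projections $a^\prime Y$ and $b^\prime Z$ of the temperature data ($Y$) and salinity data ($Z$) that are maximally correlated.  (Here $Y$ and $Z$ are mathematical notation only; they are unrelated to the location matrix `Y` constructed in the code above.)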

In [None]:
def my_cca(X, Y):
    """
    Canonical correlation analysis, intended to agree with R.

    Both arguments have observations in their rows and variables in their
    columns.  Returns a tuple (u, v, s): the columns of u are the loadings
    for X, the columns of v are the loadings for Y, and s holds the singular
    values of the whitened cross-covariance matrix (the canonical
    correlations), in decreasing order.
    """
    nobs = X.shape[0]
    Xc = X - X.mean(0)
    Yc = Y - Y.mean(0)

    # Covariance and cross-covariance matrices (normalized by nobs).
    Sx = Xc.T @ Xc / nobs
    Sy = Yc.T @ Yc / nobs
    Sxy = Xc.T @ Yc / nobs

    # Lower-triangular Cholesky factors, Sx = Lx Lx' and Sy = Ly Ly'.
    Lx = np.linalg.cholesky(Sx)
    Ly = np.linalg.cholesky(Sy)

    # Whitened cross-covariance: Lx^{-1} Sxy Ly^{-T}.
    left = np.linalg.solve(Lx, Sxy)
    M = np.linalg.solve(Ly, left.T).T

    u, s, vt = np.linalg.svd(M)

    # Map the singular vectors back to the scale of the original variables.
    return np.linalg.solve(Lx.T, u), np.linalg.solve(Ly.T, vt.T), s

We can try using standard CCA on the full 100-dimensional data.  But due to the high dimensionality and consequent overfitting, the results make little sense.

In [None]:
# Put profiles in the rows and center every pressure level (column), working
# on copies so that temp and psal themselves are left untouched.
XX = temp.T.copy()
np.subtract(XX, XX.mean(axis=0), out=XX)
YY = psal.T.copy()
np.subtract(YY, YY.mean(axis=0), out=YY)

# CCA on the full-dimensional data, followed by a plot of the canonical correlations.
xc, yc, r = my_cca(XX, YY)
plt.clf()
ax = plt.gca()
ax.grid(True)
ax.plot(r)
ax.set_ylabel("Canonical correlations")

In [None]:
# First temperature loading vector from the full-dimensional CCA.
ax = plt.gca()
ax.grid(True)
ax.plot(pressure, xc[:, 0], "-")
ax.set_xlabel("Pressure")
ax.set_ylabel("Temperature loading")

In [None]:
# First salinity loading vector from the full-dimensional CCA.
ax = plt.gca()
ax.grid(True)
ax.plot(pressure, yc[:, 0], "-")
ax.set_xlabel("Pressure")
ax.set_ylabel("Salinity loading")

As in any factor analysis, basis directions are only identified up to sign.  To aid in interpretation, the function below flips the loadings so that the majority of the loadings are positive.

In [None]:
# Sign convention for paired CCA loadings, applied for interpretability.
def flip(xc, yc):
    """
    Negate, in place, each pair of columns (xc[:, j], yc[:, j]) whose
    fractions of strictly positive entries sum to less than one.  Both
    members of a pair are always negated together.  Returns (xc, yc).
    """
    for j in range(xc.shape[1]):
        xcol = xc[:, j]
        ycol = yc[:, j]
        positive_share = np.mean(xcol > 0) + np.mean(ycol > 0)
        if positive_share >= 1:
            continue
        # xcol and ycol are views, so this updates xc and yc themselves.
        xcol *= -1
        ycol *= -1
    return xc, yc

To control the variance of working in high dimensions, we combine PCA and CCA.  To achieve this, we first reduce the temperature and salinity data by projecting to PCs, and then fit CCA to the projected data.  Finally, we map the loadings back to the original coordinates.  This is very similar to principal components regression (PCR), but applied to CCA rather than to linear regression.

In [None]:
# Thin SVDs of the centered data.  The columns of ux and uy are the PC scores
# of the temperature and salinity data, each scaled to unit length; sx and sy
# hold the singular values and the rows of vtx and vty hold the PC loadings.
ux, sx, vtx = np.linalg.svd(XX, full_matrices=False)
uy, sy, vty = np.linalg.svd(YY, full_matrices=False)

In [None]:
for q in [1, 2, 5, 10, 20, 50]:

    # Fit CCA to the leading q singular vectors of each data set.
    xc, yc, r = my_cca(ux[:, :q], uy[:, :q])

    # Express the canonical directions in the original (pressure)
    # coordinates, then apply the sign convention.
    xl1 = vtx.T[:, :q] @ np.linalg.solve(np.diag(sx[:q]), xc)
    yl1 = vty.T[:, :q] @ np.linalg.solve(np.diag(sy[:q]), yc)
    xl1, yl1 = flip(xl1, yl1)

    # Report the canonical correlations, and as a check the correlation of
    # the first pair of canonical variates computed from the full data.
    print("Canonical correlations (q={} PCs): {}".format(q, r))
    print(np.corrcoef(XX @ xl1[:, 0], YY @ yl1[:, 0])[0, 1])

    # First temperature loading vector
    plt.clf()
    ax = plt.axes([0.15, 0.1, 0.8, 0.8])
    ax.grid(True)
    ax.set_title("CCA/PCA using %d principal components, r=%.2f" % (q, r[0]))
    ax.plot(pressure, xl1[:, 0])
    if xl1[:, 0].min() > 0:
        ax.set_ylim(ymin=0)
    ax.set_xlabel("Pressure (dBar)", size=15)
    ax.set_ylabel("Temperature loading", size=15)
    plt.show()

    # First salinity loading vector
    plt.clf()
    ax = plt.axes([0.15, 0.1, 0.8, 0.8])
    ax.set_title("CCA/PCA using %d principal components, r=%.2f" % (q, r[0]))
    ax.grid(True)
    ax.plot(pressure, yl1[:, 0])
    if yl1[:, 0].min() >= 0:
        ax.set_ylim(ymin=0)
    ax.set_xlabel("Pressure (dBar)", size=15)
    ax.set_ylabel("Salinity loading", size=15)
    plt.show()

Similar to what we did with PCA, we can plot the locations where profiles score at the high or low end of the range of scores.  These plots reveal that in a region of the south Pacific from Australia to Fiji, the temperatures are uniformly warmer and salinity is higher, whereas in the far northern/southern parts of the Pacific ocean, the temperatures are lower and salinity is lower.

In [None]:
def plot_cc_map(q=2):
    """
    Display the leading canonical component obtained from CCA on the top q
    principal components of temperature and salinity.

    Three figures are produced: a map of the profiles that are in the top
    10% (or bottom 10%) of both the temperature and the salinity canonical
    scores, the mean temperature profile +/- the scaled temperature loading,
    and the mean salinity profile +/- the scaled salinity loading.  All three
    use the same (flipped) sign convention, so the red points on the map
    correspond to the red curves in the profile plots.

    Parameters
    ----------
    q : int
        Number of principal components retained before fitting CCA.

    The module-level variables ux, uy, sx, sy, vtx, vty, XX, YY, Y, pressure,
    tempmean and psalmean are read.
    """
    # Do CCA after projecting the profiles to the top q PC's.
    xc, yc, _ = my_cca(ux[:, 0:q], uy[:, 0:q])

    # Map the loadings back to the original coordinates and fix their signs.
    xl1 = np.dot(vtx.T[:, 0:q], np.linalg.solve(np.diag(sx[0:q]), xc))
    yl1 = np.dot(vty.T[:, 0:q], np.linalg.solve(np.diag(sy[0:q]), yc))
    xl1, yl1 = flip(xl1, yl1)

    # Canonical scores.  Since XX = ux * diag(sx) * vtx, these agree with
    # ux[:, 0:q] * xc (resp. uy[:, 0:q] * yc) up to the sign chosen by flip,
    # which keeps the map consistent with the loading plots below.
    xx = np.dot(XX, xl1)
    yy = np.dot(YY, yl1)

    # plt.figure creates a fresh current figure; calling plt.clf() first
    # would only leave behind an extra empty figure.
    plt.figure(figsize=(9, 7.25))
    ax = plt.axes([0.05, 0.05, 0.84, 0.88], projection=ccrs.PlateCarree(central_longitude=180))
    ax.coastlines()
    ax.set_extent([115, 290, -70, 60])

    # Profiles that are extreme on both the temperature and salinity variates
    j1 = xx[:, 0] >= np.quantile(xx[:, 0], 0.9)
    j2 = yy[:, 0] >= np.quantile(yy[:, 0], 0.9)
    jj = np.flatnonzero(j1 & j2)

    k1 = xx[:, 0] <= np.quantile(xx[:, 0], 0.1)
    k2 = yy[:, 0] <= np.quantile(yy[:, 0], 0.1)
    kk = np.flatnonzero(k1 & k2)

    plt.scatter(Y[jj, 1], Y[jj, 0], s=8, label="Top 10%", color="red", transform=ccrs.Geodetic(), rasterized=True)
    plt.scatter(Y[kk, 1], Y[kk, 0], s=8, label="Bottom 10%", color="blue", transform=ccrs.Geodetic(), rasterized=True)

    ha, lb = plt.gca().get_legend_handles_labels()
    leg = plt.figlegend(ha, lb, loc="center right")
    leg.draw_frame(False)
    plt.show()

    # Plot mean temperature profile +/- 1SD CC factor loading
    plt.clf()
    plt.grid(True)
    plt.plot(pressure, tempmean, "-", color="black")
    xs = xx / (xl1**2).sum(0) # Regression estimate of the scores
    f = np.std(xs[:, 0])
    plt.plot(pressure, tempmean + f*xl1[:, 0], color="red")
    plt.plot(pressure, tempmean - f*xl1[:, 0], color="blue")
    plt.xlabel("Pressure (Dbar)")
    plt.ylabel("Temperature")
    plt.show()

    # Plot mean salinity profile +/- 1SD CC factor loading.  The salinity
    # loading yl1 (not the temperature loading) is used here.
    plt.clf()
    plt.grid(True)
    plt.plot(pressure, psalmean, "-", color="black")
    ys = yy / (yl1**2).sum(0) # Regression estimate of the scores
    f = np.std(ys[:, 0])
    plt.plot(pressure, psalmean + f*yl1[:, 0], color="red")
    plt.plot(pressure, psalmean - f*yl1[:, 0], color="blue")
    plt.xlabel("Pressure (Dbar)")
    plt.ylabel("Salinity")
    plt.show()

# Draw the CCA map and profile plots using the default of q=2 principal components.
plot_cc_map()

## Dimension reduction regression

Below we use Sliced Inverse Regression to predict latitude from the first q principal components of the temperature data.  This shows us how temperature profiles differ with latitude.

In [None]:
# Number of principal components that the temperature profiles are projected to.
q = 3

# Fit SIR with latitude as the response and the leading q singular vectors
# of the temperature data as the covariates.
m = SIR(lat, ux[:, :q])
r = m.fit()

# Map the estimated directions back to the original (pressure) coordinates.
cf = vtx.T[:, :q] @ np.linalg.solve(np.diag(sx[:q]), r.params)

Plot the SIR loadings.

In [None]:
plt.clf()
plt.grid(True)
# Show at most the three leading directions.  Bounding by the number of
# columns of cf avoids an IndexError if fewer than three are available
# (e.g. if q is lowered).
for j in range(min(3, cf.shape[1])):
    plt.plot(pressure, cf[:, j], "-", label="%d" % (j + 1))
ha, lb = plt.gca().get_legend_handles_labels()
leg = plt.figlegend(ha, lb, loc="center right")
leg.draw_frame(False)
plt.xlabel("Pressure", size=15)
plt.ylabel("SIR loading", size=15)

Plot the SIR scores against latitude, and smooth to estimate their conditional means.

In [None]:
# SIR scores of every profile, one column per direction.
scores = np.dot(XX, cf)

# Show at most the three leading directions; fewer if cf has fewer columns.
for j in range(min(3, scores.shape[1])):
    plt.clf()
    plt.grid(True)
    plt.plot(lat, scores[:, j], "o", color="grey", alpha=0.3, rasterized=True)

    # Use lowess to estimate the conditional mean of the scores given latitude.
    # Lowess is slow and doesn't need all the data to give an accurate estimate,
    # so fit it to a random subsample of up to 2000 profiles.  The size is
    # capped at the number of profiles because sampling without replacement
    # cannot draw more than that.
    ii = np.random.choice(scores.shape[0], min(2000, scores.shape[0]), replace=False)
    m = lowess(scores[ii, j], lat[ii], frac=0.2)
    plt.plot(m[:, 0], m[:, 1], "-", color="orange")

    plt.xlabel("Latitude", size=15)
    plt.ylabel("Component %d SIR score" % (j + 1), size=15)
    plt.show()