# EOF decomposition of the altimetry data
- preparation for the Gaussian Mixture Model (GMM), which will be used to find regions of coherent sea level variability
- separating the sea level data set into its spatial and temporal components using empirical orthogonal function (EOF) decomposition, also known as principal component analysis (PCA)
- **Input:**
    - output from s01_Processing: the monthly mean altimetry data set for the selected time span, with small gaps interpolated and the seasonal cycle removed; sea level, longitude, latitude, and time are stored as separate .npy files
- **Output:**
    - **Data:** EOF maps (spatial component), principal component time series (temporal component), explained variance of each EOF/PC, longitude, latitude, and time, saved as separate .npy files and together in a single .mat file; explained variance also saved as a .csv file
    - **Figures:** the first 12 EOF maps and PCs (3 figures with 4 EOF/PCs each)
    - saved to s02_PCA

- **Steps:**
    - load the data
    - PCA/EOF
    - plot the PCs and EOF maps
    - save the PCs, EOF maps, and explained variance
    - calculate the total explained variance up to the n-th PC and save it to a .csv file
    
Author: Lea Poropat <br>
Last edited: 2023-12-07

### <font color = "red">Parameters</font>

In [1]:
# name of the analysed region (used in every input and output file name)
reg = 'NWeuropeSeas'

# first and last year of the analysed time span
y1, y2 = 1995, 2021

# input file prefix; the endings for the data, Lon, Lat, and time files are appended when loading
filein = f'../Data/s01_Processed/altimetry_M_{y1}_{y2}_{reg}_'

# tag shared by all outputs of this step: <region>_<first year>-<last year>_
run_tag = f'{reg}_{y1}-{y2}_'

# output prefixes for the data and figures, and the full name of the explained variance table
fileout = f'../Data/s02_PCA/{run_tag}'
figs = f'../Figures/s02_PCA/{run_tag}'
expvarfile = f'../Data/s02_PCA/{run_tag}explained_variance.csv'

### Import the libraries

In [2]:
import numpy as np
import pandas as pd
import netCDF4 as nc
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.io import savemat
import eofs.standard as es
from random import randint

### Loading the data

In [3]:
# read the deseasoned sea level field, its coordinates, and the time axis;
# every file shares the prefix `filein` and differs only in its ending
x, Lon, Lat, t = (
    np.load(filein + ending)
    for ending in ('interpolated_deseasoned.npy', 'Lon.npy', 'Lat.npy', 'time.npy')
)

# x is three-dimensional; its axes are unpacked as (time, latitude, longitude)
nt, nlat, nlon = x.shape

### PCA

In [4]:
# EOF/PCA solver built on the sea level field
solver = es.Eof(x)

# temporal part: the ordered PCs are the columns; nt-1 of them are requested on purpose,
# so that the two dimensions of PCs differ in length and rows/columns are easy to tell apart
n_pcs = nt - 1
PCs = solver.pcs(npcs = n_pcs)

# spatial part (EOF maps) and the fraction of variance explained by each mode
eof_maps = solver.eofs()
exp_var = solver.varianceFraction()

### Plotting

In [5]:
# round every time stamp down to a whole number (the calendar year, assuming t is in decimal years - TODO confirm);
# currently only referenced by the commented-out set_xticks call in the plotting cell
year = np.floor(t)

In [6]:
# plotting the first nfigs * nrow (= 12) PCs and EOF maps, nrow of them per figure;
# each row shows the PC time series (left, wide) next to its EOF map (right, narrow)
nfigs = 3
nrow = 4
plt.rcParams['figure.figsize'] = [18, 12]

# running total of the explained variance, computed once instead of re-summing for every panel
cum_var = np.cumsum(exp_var)

for i in range(nfigs):
    fig, ax = plt.subplots(nrow, 2, sharex = 'none', gridspec_kw={'width_ratios': [3, 1]})
    for j in range(nrow):
        # zero-based index of the PC/EOF shown in this row
        pc = i * nrow + j
        ax_pc, ax_map = ax[j]

        # principal component, annotated with its own (top right) and the
        # cumulative (bottom right) explained variance in percent
        ax_pc.plot(t, PCs[:, pc])
        ax_pc.set_xlim(t[0], t[-1])
        #ax_pc.set_xticks(ticks = year)
        ax_pc.text(0.98, 0.98, str(round(exp_var[pc]*100, 2))+' %' , ha='right', va='top', transform=ax_pc.transAxes, fontsize = 20)
        ax_pc.text(0.98, 0.02, 'Tot: '+ str(round(cum_var[pc]*100, 2))+' %', ha = 'right', va = 'bottom', transform=ax_pc.transAxes, fontsize = 20)
        # boolean instead of the string 'on', which only worked because any non-empty string is truthy
        ax_pc.grid(True)

        # EOF map, labelled with the one-based PC number (top left)
        im1 = ax_map.pcolormesh(Lon, Lat, eof_maps[pc, :, :], shading = 'nearest')
        ax_map.text(0.02, 0.98, str(pc+1), ha='left', va='top', transform=ax_map.transAxes, fontsize = 20)
        plt.colorbar(im1, ax = ax_map)

    fig.savefig(figs + 'PCs' + str(i) + '.png')
    # close exactly the figure that was just saved, so figures do not accumulate in memory
    plt.close(fig)

### Saving the new dataset

In [7]:
# everything that is written as a separate .npy file, keyed by file name ending:
# first the PCA results (EOF maps in grid format), then longitude, latitude and time
npy_outputs = {
    'exp_var.npy': exp_var,
    'PCs.npy': PCs,
    'eof_maps.npy': eof_maps,
    'Lon.npy': Lon,
    'Lat.npy': Lat,
    'time.npy': t,
}
for ending, values in npy_outputs.items():
    np.save(fileout + ending, values, allow_pickle = False)

# the same PCA results and coordinates bundled into one file for matlab
mat_contents = {
    'Lon': Lon,
    'Lat': Lat,
    't': t,
    'PCs': PCs,
    'eof_maps': eof_maps,
    'exp_var': exp_var,
}
savemat(fileout + 'PCs.mat', mat_contents)

### Percentage of variance per PC

In [8]:
# cumulative variance explained by the first n PCs (running sum of the per-PC fractions);
# accumulated as float64 in a single pass instead of re-summing the leading entries for every n
vartot = np.cumsum(exp_var, dtype = float)

# one-based PC numbers; the count is taken from exp_var itself so that all three columns
# always have the same length, even if the number of modes differs from nt
pc = np.arange(1, len(exp_var) + 1)

# convert that into a dataframe (values in percent) and save into csv
explained_variance = pd.DataFrame({'PC': pc, 'Exp_var': exp_var*100, 'Exp_var_tot': vartot*100})
explained_variance.to_csv(expvarfile, index = False, float_format = '%5.2f')

# print the info for the first 15 PCs
explained_variance.head(15)

Unnamed: 0,PC,Exp_var,Exp_var_tot
0,1,52.009716,52.009716
1,2,18.731792,70.741509
2,3,4.973238,75.714747
3,4,3.04797,78.762716
4,5,1.290898,80.053614
5,6,1.080096,81.13371
6,7,1.044656,82.178366
7,8,0.878597,83.056963
8,9,0.592732,83.649695
9,10,0.561415,84.21111
