In [None]:
# JM: 23 Dec 2021
# notebook to go through "basic" python and notebook things

# mantra of the course: If you have a code problem, try Google first. A large part of programming is
#                       experience, and you gain experience more efficiently by trying to fix code yourself.

import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [None]:
# time-series data
# "time-series data" simply means some data that depends on time
# the example used here is the El Nino 3.4 data given in 02_reading_data_basic_manipulations

# the file is read "by hand" below to end up with the data as a 1d array
# Q. try to do this in pandas instead

with open("elnino34_sst.data", "r") as f:
    elnino34_txt = f.readlines()

# drop the first 3 and last 4 lines (not needed here), then remove the newline characters
elnino34_txt = [line.strip("\n") for line in elnino34_txt[3:-4]]

# each line is a single string; .split() breaks it up into its whitespace-separated components
elnino34_txt[0].split()

# go through every line, split it, skip the first entry (we only want the SST entries),
# and convert the remaining entries from strings to floats
elnino34_sst = np.array(
    [float(entry) for line in elnino34_txt for entry in line.split()[1:]]
)

plt.plot(elnino34_sst)
plt.grid()

In [None]:
# so far there is no time axis to plot the data against
# reading the raw data file itself shows this is monthly SST data from 1950 to 2019,
# so a time array can be created to plot the data against; you could either
#   1) build an artificial raw array of numbers (you have to deal with units yourself)
#   2) use the datetime64 object functionality
# code for both options is below

# 1) ad hoc: 1 Jan 1950 to 31 Dec 2019
#            use linspace with as many points as the data array, but leave out the end point
#            so "1950.00" corresponds to 1 Jan 1950
#               "2019.9x" corresponds to the last entry (Dec 2019)
#
# the result is an ordinary array that you can manipulate as usual

time_vec_raw = np.linspace(1950, 2019 + 1, len(elnino34_sst), endpoint=False)

# 2) datetime: easier to read, but slightly less trivial to manipulate
#              (which matters somewhat in 08_time_series)
#              in this case you don't specify the DATE and smaller units (syntax reasons)
#
# the end result is an array holding "datetime64" entries instead of plain numbers
# np.arange(start, end, spacing) does NOT include "end", so "end" has to be one step past the last month
time_vec = np.arange(
    np.datetime64('1950-01'),
    np.datetime64('2020-01'),
    np.timedelta64(1, 'M'),
)

# top panel: raw numeric time axis, bottom panel: datetime64 time axis
fig, (ax_raw, ax) = plt.subplots(2, 1, figsize=(10, 7))

ax_raw.plot(time_vec_raw, elnino34_sst)
ax_raw.set_ylabel(r"SST (${}^\circ\mathrm{C}$)")
ax_raw.grid()

ax.plot(time_vec, elnino34_sst)
ax.set_xlabel(r"$t$ (years)")
ax.set_ylabel(r"SST (${}^\circ\mathrm{C}$)")
ax.grid()

# if you wanted to create things in smaller units (e.g. days), you might do
# time_vec = np.arange(np.datetime64('1950-01-01'), np.datetime64('1950-12-31'), np.timedelta64(1, 'd'))
# look up the relevant syntax online

# we will come back to the El-Nino data later in the exercises and in 08_time_series

In [None]:
# some time-series manipulations

# create an artificial time-series to work with for now, to make a point

time_vec = np.arange(np.datetime64('2020-01-01'), np.datetime64('2021-12-31'), np.timedelta64(6, 'h'))

nt = len(time_vec)
t_vec = np.linspace(0, 2.0 * np.pi, nt)
lin_trend = 0.05 * np.linspace(0, 2.0 * np.pi, nt)

# seeded generator: the noise (and every figure/number derived from it) is then
# the same on every "Restart & Run All"
rng = np.random.default_rng(0)
noise = 0.2 * rng.random(nt)          # uniform noise in [0, 0.2)

# constant + slow/medium/fast oscillations + linear trend + noise
f_vec = (  2.7
         + 0.1 * np.sin(t_vec)
         + 0.05 * np.sin(4.0 * t_vec)
         + 0.02 * np.sin(60.0 * t_vec)
         + lin_trend
         + noise
        )

fig = plt.figure(figsize=(10, 3))
ax = plt.axes()
ax.plot(time_vec, f_vec, 'C0-')
ax.set_xlabel(r"$t$")
ax.set_ylabel(r"data")
ax.set_ylim([2.7, 3.2])
ax.grid()

In [None]:
# there is too much data here: it looks like there is a trend, but how to pick it out?
# one way is to brute-force down-sample

fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(time_vec[::20], f_vec[::20], 'C0-')  # Q: what does this do?
ax.set(xlabel=r"$t$", ylabel=r"data", ylim=[2.7, 3.2])
ax.grid()

In [None]:
# but the above is losing way too much information, and also not really filtering out signals
# consider averaging data over a window (uniform weighting)
#    (need to chop off some entries at the edges to have the same array length to plot)

window = 60                                          # specify a window (number of entries)
half_window = window // 2
f_vec_uni_avg = np.zeros(len(f_vec[window:-window]))  # final array with edges chopped off

for i in range(len(f_vec_uni_avg)):
    # entry i is plotted at time_vec[window + i], so centre the averaging window on that
    # index (otherwise the smoothed curve is shifted by about half a window in time)
    start = window + i - half_window
    f_vec_uni_avg[i] = np.mean(f_vec[start:start + window])  # uniform average over a window

fig = plt.figure(figsize=(10, 3))
ax = plt.axes()
ax.plot(time_vec[window:-window], f_vec_uni_avg, 'C0-')  # Q: what does this do?
ax.set_xlabel(r"$t$")
ax.set_ylabel(r"data")
ax.set_ylim([2.7, 3.2])
ax.grid()

# notice this averages out the fast fluctuations without down-sampling as such
# but also the values have decreased in magnitude (because of the averaging)

In [None]:
# consider averaging data over a window (tent-like weighting, purely artificial)
#    (need to chop off some entries at the edges to have the same array length to plot)

window = 60                                                      # specify a window (number of entries)
half_window = window // 2
f_vec_tent_avg = np.zeros(len(f_vec[window:-window]))            # final array with edges chopped off

tent_kernel  = -np.abs(np.arange(window) + 0.5 - window / 2.0)   # upside-down v, peak in the middle of the window
tent_kernel -= np.min(tent_kernel)                               # shift up so the edges of the tent sit at zero
# normalise so the weights AVERAGE to 1 (i.e. sum to "window"); np.mean(kernel * data) is then
# a proper weighted average, for odd as well as even window sizes
tent_kernel /= np.mean(tent_kernel)

for i in range(len(f_vec_tent_avg)):
    # entry i is plotted at time_vec[window + i], so centre the window on that index
    start = window + i - half_window
    f_vec_tent_avg[i] = np.mean(tent_kernel * f_vec[start:start + window])  # weighted average over a window

fig = plt.figure(figsize=(14, 3))
ax = plt.subplot2grid((1, 3), (0, 0), colspan=2)
ax.plot(time_vec[window:-window], f_vec_tent_avg, 'C0-')         # Q: what does this do?
ax.set_xlabel(r"$t$")
ax.set_ylabel(r"data")
ax.set_ylim([2.7, 3.2])
ax.grid()

ax = plt.subplot2grid((1, 3), (0, 2), colspan=1)
ax.plot(tent_kernel, 'C1-')  # Q: what does this do?
ax.set_xlabel(r"index")
ax.set_ylabel(r"kernel shape")
ax.set_ylim([0.0, 2.0])
ax.grid()

# fairly minor differences in this case

In [None]:
# consider averaging data over a window (convolution with a Gaussian kernel)
#    could write this out raw but just going to call a package here...

# gaussian_filter1d lives directly in scipy.ndimage; the old "scipy.ndimage.filters"
# sub-namespace is deprecated, so alias the package itself under the name "filters"
# (later cells in this notebook call filters.gaussian_filter1d)
from scipy import ndimage as filters

# here the value to specify is sigma (what it means depends somewhat on the kernel shape)

fig, axes = plt.subplots(2, 1, figsize=(10, 7))

# one panel per choice of sigma; after the loop "sigma" and "f_vec_gauss_avg"
# hold the values for the last (largest) sigma
for ax, sigma in zip(axes, (2.0, 10.0)):
    f_vec_gauss_avg = filters.gaussian_filter1d(f_vec, sigma)
    ax.plot(time_vec, f_vec_gauss_avg, 'C0-', label=r"$\sigma = %.1f$" % sigma)
    ax.set_xlabel(r"$t$")
    ax.set_ylabel(r"data")
    ax.set_ylim([2.7, 3.2])
    ax.grid()
    ax.legend()

# Q: why is the latter more smooth?
# Q: why does this not need time_vec to be reduced in size?

In [None]:
# trends
# most of the techniques we have played with in the previous workshops would work here too
# we could for example do linear regression to pick out a linear trend

# creating the artificial time-series again
time_vec = np.arange(np.datetime64('2020-01-01'), np.datetime64('2021-12-31'), np.timedelta64(6, 'h'))
nt = len(time_vec)
t_vec = np.linspace(0, 2.0 * np.pi, nt)
lin_trend = 0.05 * np.linspace(0, 2.0 * np.pi, nt)  # this is related to the answer we are looking for
# seeded generator so the regression coefficients quoted below are reproducible
rng = np.random.default_rng(0)
noise = 0.2 * rng.random(nt)
f_vec = (  2.7                                      # this is related to the answer we are looking for
         + 0.1 * np.sin(t_vec)
         + 0.05 * np.sin(4.0 * t_vec)
         + 0.02 * np.sin(60.0 * t_vec)
         + lin_trend
         + noise
        )

# could use the filtered one too, in this case probably doesn't matter
f_vec_gauss_avg = filters.gaussian_filter1d(f_vec, 10)

# degree-1 polynomial fits: p[0] is the slope, p[1] the intercept (with respect to t_vec)
p_orig  = np.polyfit(t_vec, f_vec, 1)
p_gauss = np.polyfit(t_vec, f_vec_gauss_avg, 1)

# top panel: raw series, bottom panel: Gaussian-filtered series, each with its regressed trend
# (this is the artificial series, so the axes are labelled "data" / "t" as in the cells above)
fig, axes = plt.subplots(2, 1, figsize=(10, 7))
for ax, series, coeffs in zip(axes, (f_vec, f_vec_gauss_avg), (p_orig, p_gauss)):
    ax.plot(time_vec, series)
    ax.plot(time_vec, coeffs[0] * t_vec + coeffs[1], 'k--',  # regressed linear trend
            label=f"${{{coeffs[0]:.3f}}} t + {{{coeffs[1]:.3f}}}$")
    ax.set_ylabel(r"data")
    ax.grid()
    ax.legend()
ax.set_xlabel(r"$t$")  # only the bottom panel carries the x label

# Q. the real second coefficient should be 2.7, so is the regression reproducing that?
# Q. in the real linear trend we added the amplitude we gave was 0.05, but regression
#    is returning a linear coefficient around 0.015
#    the answer is actually correct, but what is the reason for the (apparent) discrepancy?
#       hint: look at the t_vec window I used (not "time_vec", since this time-series depends solely on "t_vec")

In [None]:
# correlations

# you know how to calculate correlation coefficients from 03 and 04
# here you might have two time-series and want to see if they are correlated with each other,
#              or one time-series and want to see if it is correlated with itself in some way

# let's start with a very simple example where we should have a good feel for the answer before computing

t_vec   = np.linspace(0, 2.0 * np.pi, 31)
f       =   np.sin(t_vec)
f_pos   = 2*np.sin(t_vec)                    # same signal, doubled
f_neg   =  -np.sin(t_vec)                    # same signal, sign flipped
f_shift =   np.sin(t_vec - np.pi / 2.0)      # same signal, shifted by a quarter period

fig, ax = plt.subplots(figsize=(5, 3))
for signal, colour, name in (
    (f,       "C0", "f"),
    (f_pos,   "C1", "f pos"),
    (f_neg,   "C2", "f neg"),
    (f_shift, "C3", "f shift"),
):
    ax.plot(t_vec, signal, colour, label=name)
ax.set_xlabel(r"$t$")
ax.set_ylabel(r"$f$")
ax.grid()
ax.legend()

# Q. what correlations do you expect relative to f?

In [None]:
# compute the correlations
# it might be an idea to look at the correlations as scatter graphs first

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 3))

# one panel per signal, always against f on the horizontal axis
for panel, signal, colour, name in (
    (ax1, f_pos,   "C1", r"f pos"),
    (ax2, f_neg,   "C2", r"f neg"),
    (ax3, f_shift, "C3", r"f shift"),
):
    panel.scatter(f, signal, color=colour)
    panel.set_xlabel(r"f")
    panel.set_ylabel(name)
    panel.grid()

# so we should get 1, -1 and probably 0

In [None]:
# now let's see what the computation from a package tells us
# going to use scipy here, but scikit-learn would work (see 03)
# (you can write one yourself too, you only need the covariance and the standard deviations of the data)

# linregress returns several quantities; only the correlation coefficient ("rvalue") is wanted here
r_pos   = stats.linregress(f, f_pos).rvalue
r_neg   = stats.linregress(f, f_neg).rvalue
r_shift = stats.linregress(f, f_shift).rvalue

print(f"f and f_pos   has (linear/Pearson) correlation coefficient of {r_pos:.2f}")
print(f"f and f_neg   has (linear/Pearson) correlation coefficient of {r_neg:.2f}")
print(f"f and f_shift has (linear/Pearson) correlation coefficient of {r_shift:.2f}")

In [None]:
# cross/lag correlation
#
# let's go back to the shifted example but extend it a bit (two periods instead of one)

t_vec   = np.linspace(0, 4.0 * np.pi, 61)
f       = np.sin(t_vec)
f_shift = np.sin(t_vec - np.pi / 2.0)

# un-lagged correlation coefficient between the two signals
r_shift = stats.linregress(f, f_shift).rvalue

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(t_vec, f,       "C0", label="f")
ax.plot(t_vec, f_shift, "C3", label="f shift")
ax.set_xlabel(r"$t$")
ax.set_ylabel(r"$f$")
ax.grid()
ax.legend()
ax.set_title(f"(linear/Pearson) correlation coefficient of {r_shift:.2f}")

# the correlation coefficient is zero, but from the graph it is clear that there are correlations between
# the two signals (this is one of those cases where blindly applying statistics to data can give you a
# misleading conclusion)

In [None]:
# cross/lag correlation
#
# in this example I cooked up, the signal is just a shift, so what you might expect is that as I calculate
# the correlation of the SHIFTED signals relative to each other, you would start to see the correlation
# go up, eventually to 1

# so, for example

lag = 2  # lag by 2 indices

# chop off the first few entries of the shifted signal and the last few entries of the original signal
# (so that the two arrays are the same size and we can compute a correlation coefficient)
f_trimmed       = f[:-lag]
f_shift_trimmed = f_shift[lag:]
r_lag = stats.linregress(f_trimmed, f_shift_trimmed).rvalue

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(t_vec[:-lag], f_trimmed,       "C0", label="f")
ax.plot(t_vec[:-lag], f_shift_trimmed, "C3", label="f shift with lag")
ax.plot(t_vec, f_shift, "C3--", label="f shift orig", alpha=0.5)
ax.set_xlabel(r"$t$")
ax.set_ylabel(r"$f$")
ax.grid()
ax.legend()
ax.set_title(f"lag = {lag}, correlation coefficient of {r_lag:.2f}")

In [None]:
# cross/lag correlation
#
# I can wrap this up in a subroutine and do this as a function of lag

def custom_lag_corr(signal1, signal2, lag):
    """Pearson correlation coefficient between two equal-length signals at a given lag.

    The coefficient is computed over the overlapping part of the two signals,
    pairing ``signal1[i]`` with ``signal2[i + lag]``.

    Parameters
    ----------
    signal1, signal2 : sequence of numbers (same length)
    lag : int
        Shift in number of samples. ``lag > 0`` drops the last ``lag`` entries of
        ``signal1`` and the first ``lag`` entries of ``signal2``; ``lag < 0`` does
        the opposite; ``lag == 0`` uses both signals in full.

    Returns
    -------
    float
        The ``rvalue`` reported by ``scipy.stats.linregress`` for the paired samples.

    Raises
    ------
    ValueError
        If the signals differ in length, or if fewer than two samples overlap
        at the requested lag.
    """
    n_samples = len(signal1)
    if n_samples != len(signal2):
        raise ValueError("array size not equal, cannot continue")

    overlap = n_samples - abs(lag)
    if overlap < 2:
        raise ValueError(
            f"lag of {lag} leaves {max(overlap, 0)} overlapping sample(s), need at least 2"
        )

    if lag >= 0:
        # slicing with an explicit end index also covers lag == 0 (no special case needed)
        first, second = signal1[:overlap], signal2[lag:]
    else:
        first, second = signal1[-lag:], signal2[:overlap]

    return stats.linregress(first, second).rvalue

n = 30
# correlation coefficient for every lag from 0 up to n-1 (in index units)
r_lag = np.array([custom_lag_corr(f, f_shift, lag_index) for lag_index in range(n)])

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(np.arange(n), r_lag, "C0-x")
ax.set_xlabel(r"lag (in index)")
ax.set_ylabel(r"$r$")
ax.grid()

# if n is too big, the number of samples left to calculate the correlation gets low, so it increasingly
# becomes a statistically dodgy manoeuvre (hence why I extended the signal a little bit more above)

In [None]:
# cross/lag correlation

# from the graph above you see that the signal has peak correlation somewhere between lag index 7 and 8
# (and the corresponding anti-correlation further down the line)
# so in principle we could turn the lag index into a "time" (or whatever other measure you like
# depending on context), essentially by finding out what "lag = 1" corresponds to in "time"

# in this case I know t_vec is uniform in time, so I just work out the differences and pick the first one
# (if it isn't, you have to associate each index with its own time, which is doable but not touched on here)
dt = np.diff(t_vec)[0]

# again, I know the answer here: f_shift is shifted a quarter wavelength out (by pi/2) relative to f
#   so if I shift f_shift by a quarter wavelength    (  pi/2) then I should get maximum correlation
#   so if I shift f_shift by three quarters wavelength (3 pi/2) then I should get minimum correlation
theory_lags = [np.pi / 2, 3 * np.pi / 2]   # theoretical maximum, theoretical minimum

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(np.arange(n) * dt, r_lag, "C0-x")
for lag_time in theory_lags:
    ax.plot([lag_time, lag_time], [-2, 2], "k--", alpha=0.7)
ax.set_xlabel(r"lag (in time units)")
ax.set_ylabel(r"$r$")
ax.set_ylim([-1.1, 1.1])
ax.grid()

# add the tick labels in: keep the automatic ticks and append the two theoretical lags,
# labelled symbolically
auto_ticks = ax.get_xticks()
xt  = np.append(auto_ticks, theory_lags)
xtl = auto_ticks.tolist() + [r"$\pi/2$", r"$3\pi/2$"]
ax.set_xticks(xt)
ax.set_xticklabels(xtl)
ax.set_xlim([0, 6]);

In [None]:
# auto-correlation

# if you can do lag correlations for two signals you could also do it for the signal with respect to itself
#   (which, if you think about it, is what I actually did above...)
# this is called the auto-correlation (correlation of signal with respect to lagged versions of itself)
#
# so you do this if you are interested to see how the signal correlates with itself, and whether you could
#   use the signal's previous values to predict its future values

# trivial example

t_vec = np.linspace(0, 2.0 * np.pi, 31)
f     = np.sin(t_vec)
dt = np.diff(t_vec)[0]

fig, axes = plt.subplots(1, 2, figsize=(16, 3))

# left panel: lags 0..14, right panel: lags 0..29
# (after the loop "n" and "r_lag" hold the values of the right panel)
for ax, n in zip(axes, (15, 30)):
    r_lag = np.zeros(n)
    for lag in range(n):
        r_lag[lag] = custom_lag_corr(f, f, lag)

    # the horizontal axis is lag index * dt, i.e. the lag in time units
    ax.plot(np.arange(n) * dt, r_lag, "C0-x")
    ax.set_xlabel(r"lag (in time units)")
    ax.set_ylabel(r"$r$")
    ax.set_title(f"lag up to index {n}")
    ax.grid()

# Q. is the left panel what you expect for the appropriate shifts?
# Q. given this is a sine/cosine curve, the correlations should be somewhat symmetric (see graph in cell above)
#    but the right panel is not symmetric about the minimum point, why? (hint: what is the size of array?)

In [None]:
# auto-correlation

# and of course there is a package that someone coded up already we could have used
# (the "statsmodels" module might not exist on your computers; you can download it yourself)
#
# anaconda: conda install -c conda-forge statsmodels
# pip:      pip install statsmodels

t_vec = np.linspace(0, 2.0 * np.pi, 31)
f     = np.sin(t_vec)
dt = np.diff(t_vec)[0]

from statsmodels.graphics.tsaplots import plot_acf  # only gives the plotting
# (or if you want access to the actual data etc., it is in "statsmodels.tsa.stattools.acf")

fig = plt.figure(figsize=(10, 4))
ax = plt.axes()
# raw f-string: the LaTeX backslashes (\leftrightarrow, "\ ") are then taken literally
# rather than being parsed as (invalid) Python escape sequences
title_str = rf"Auto-correlation, 1 lag index $\leftrightarrow\ dt = {dt:.3f}$"
plot_acf(f, ax=ax, lags=30, adjusted=True, title=title_str)
ax.set_xlabel("lag (index)")
ax.set_ylabel("acf")
ax.grid()

# this package gives you more information
#    when "adjusted=True", the lines with dot ends are basically the same as above
#    when "adjusted=False", it accounts for loss of data by the lag, and weights the auto-correlation accordingly
#       -- so if you either leave out the argument or set it explicitly to False, the acf decays in time
#    the envelope is the 95% confidence interval at alpha=0.05
#       -- you can adjust alpha by providing e.g. "alpha=0.01" as a keyword
#    when the dots lie within the confidence interval it is saying there is no strong statistical evidence
#      to say values at the lag influence the current value (see 05 and 06)
#       -- for this example it's saying if you know maybe 3 or 4 values then you can probably do a
#          reasonable job predicting the next value
#       -- remember this just says there is no strong statistical evidence, it doesn't mean there is no relation
#          (in this artificial example there is in fact a strong relation but the statistics isn't picking it up)

In [None]:
# now let's try to do something with more realistic data (El-Nino 3.4)
# the reading code below does the same as the one at the top of the notebook

# NOTE: the file handle is deliberately NOT called "f", since "f" is the name of the
#       signal array used by the correlation cells above (re-running those cells after
#       this one would otherwise pick up a closed file instead of an array)
with open("elnino34_sst.data", "r") as sst_file:
    elnino34_txt = sst_file.readlines()

# drop the first 3 and last 4 lines (not needed here), then remove the newline characters
elnino34_txt = [line.strip("\n") for line in elnino34_txt[3:-4]]

# split every line into its entries, skip the first entry of each line,
# and turn the remaining (SST) entries from strings into floats
elnino34_sst = np.array(
    [float(entry) for line in elnino34_txt for entry in line.split()[1:]]
)

# I want to do sums on this so I am going to use the raw version
# (I personally find the numbers easier to manipulate)
t_vec = np.linspace(1950, 2019+1, len(elnino34_sst), endpoint=False)

In [None]:
# El-Nino 3.4 linear trend
# be careful here that time units are in YEARS
p = np.polyfit(t_vec, elnino34_sst, 1)   # p[0] = slope (per year), p[1] = intercept
lin_trend = p[0] * t_vec + p[1]

fig = plt.figure(figsize=(10, 3))
ax = plt.axes()
ax.plot(t_vec, elnino34_sst, 'C0')
ax.plot(t_vec, lin_trend, 'k--')
# raw f-string: keeps the LaTeX backslashes (\circ, "\ ", \mathrm) literal instead of
# having Python parse them as (invalid) escape sequences; the rendered text is unchanged
ax.text(1990, 24.5, rf"trend = ${p[0]:.3f}^{{\circ}}\ \mathrm{{C}}$ per year", color="k")
ax.set_xlabel(r"$t$ (years)")
ax.set_ylabel(r"SST (${}^{\circ}\mathrm{C}$)")
ax.set_ylim(24, 30)
ax.grid()

# Q. What does the trend mean here? Is this consistent with what is known? (you might need to look this up)

In [None]:
# El-Nino 3.4 filtering
# if we are interested in the longer term oscillations, we might want to get rid of the higher
# frequencies, so let's apply a filter to the signal

fig = plt.figure(figsize=(10, 3))
ax = plt.axes()

sigma_vec = [2.0, 5.0, 10.0]

# one filtered curve per sigma, all on the same axes
for sigma in sigma_vec:
    elnino34_gauss = filters.gaussian_filter1d(elnino34_sst, sigma)
    # raw f-string so "\sigma" reaches matplotlib as LaTeX rather than being parsed
    # as an (invalid) Python escape sequence
    ax.plot(t_vec, elnino34_gauss, label=rf"$\sigma = {sigma}$")
ax.set_xlabel(r"$t$ (years)")
ax.set_ylabel(r"SST (${}^{\circ}\mathrm{C}$)")
ax.set_ylim(24, 30)
ax.grid()
ax.legend()

# smaller sigma smooths out the signal a bit
# larger sigma even gets rid of the shorter oscillations but keeps the longer ones (up to a point)
#
# Q. low pass the signal for a specified window of 6 months, 2 years and 10 years, and describe the signal
#    do this for both weighted and not weighted (e.g. tent kernel) options

In [None]:
# El-Nino 3.4 auto-correlation (unadjusted here)
# when do the El-Nino 3.4 SST start decorrelating according to the auto-correlation analysis?

dt = np.diff(t_vec)[0]

from statsmodels.graphics.tsaplots import plot_acf  # only gives the plotting
# (or if you want access to the actual data etc., it is in "statsmodels.tsa.stattools.acf")

fig = plt.figure(figsize=(10, 4))
ax = plt.axes()
# raw string: the LaTeX backslashes are then literal instead of (invalid) Python escape sequences
title_str = r"Auto-correlation of El-Nino 3.4, 1 lag index $\leftrightarrow\ dt = 1$ month"
plot_acf(elnino34_sst, ax=ax, lags=24, title=title_str, alpha=0.05)
ax.set_xlabel("lag (index)")
ax.set_ylabel("acf")
ax.grid()

# Q. how do the acf's vary if you low-pass the signal (e.g. do you gain/lose "predictability")?
# Q. (more involved) the acf above (probably?) computes the average acf over all points,
#    so gives an average sense of how many previous points you need to reliably "predict" (?) the current point
#    1) suppose you don't do that, and just compute the autocorrelation (look up the formula or a package)
#       by only giving it truncated signal which has a date associated with it, do the results differ, and if so
#       by how much? (i.e. the acf package gives you the average, but look into the samples itself)
#    2) are some months more predictable?
# Q. (more involved) the years with particularly large SSTs (being a bit vague here) are regarded as El-Nino years
#    1) make up a way to pick these YEARS out from the time-series, and compare what the code returns with
#       known El-Nino years
#    2) are some El-Nino years "more predictable"? (i.e. do something like the previous Q.)

In [None]:
# El-Nino 3.4 SST correlation with some other time-series data from this region
# load some data diagnosed from the El-Nino 3.4 region (processed from data taken from copernicus.eu)
# in this case this is also monthly data, but for averaged chlorophyll (mg/m3) from Jan 1993 to Dec 2020
# (recall El-Nino 3.4 SST here is from Jan 1950 to Dec 2019)

# the file handle is not called "f", so the signal array "f" from the cells above is left alone
with open("elnino34_bgc.data", "r") as bgc_file:
    elnino34_txt = bgc_file.readlines()

# drop the first line (not data), then remove the newline characters
elnino34_txt = [line.strip("\n") for line in elnino34_txt[1:]]

# split out the entries per line, pick out the 3rd entry, strip out the floating comma, turn into float
elnino34_chl = np.asarray(
    [float(line.split()[2].strip(",")) for line in elnino34_txt]
)

# create the analogous t_vec for this data
t_vec_chl = np.linspace(1993, 2020+1, len(elnino34_chl), endpoint=False)

# plot the SST and chlorophyll concentration out over the same time axis for comparison
fig = plt.figure(figsize=(10, 7))
ax = plt.subplot(2, 1, 1)
ax.plot(t_vec, elnino34_sst)
ax.set_ylabel(r"SST (${}^\circ\mathrm{C}$)")
ax.set_xlim([1948, 2022])
ax.grid()

ax = plt.subplot(2, 1, 2)
ax.plot(t_vec_chl, elnino34_chl, "C2")
ax.set_xlabel(r"$t$ (years)")
ax.set_ylabel(r"chl-$a$ ($\mathrm{mg}\ \mathrm{m}^{-3}$)")  # mg per cubic metre, hence m^{-3}
ax.set_xlim([1948, 2022])
ax.grid()

In [None]:
# El-Nino 3.4 SST correlation with some other time-series data from this region
# pull out data from the same time window to compute cross-correlations

# the two records overlap from Jan 1993 to Dec 2019 (inclusive)
# the time stamps sit at the START of each month, so the window has to run up to (but not
# including) 2020 to keep all of 2019; the bounds are placed half a month before the first /
# after the last wanted stamp so floating point round-off in t_vec cannot flip the comparison
half_month = 0.5 / 12.0
window_start = 1993 - half_month
window_end   = 2020 - half_month

sst_window = (t_vec > window_start) & (t_vec < window_end)
chl_window = (t_vec_chl > window_start) & (t_vec_chl < window_end)

# the correlation below needs the two selections to line up sample-for-sample
assert sst_window.sum() == chl_window.sum(), "SST and chl-a windows select different numbers of months"

fig = plt.figure(figsize=(10, 7))
ax = plt.subplot(2, 1, 1)
ax.plot(t_vec[sst_window], elnino34_sst[sst_window])
ax.set_ylabel(r"SST (${}^\circ\mathrm{C}$)")
ax.set_xlim([1990, 2022])
ax.grid()

ax = plt.subplot(2, 1, 2)
ax.plot(t_vec_chl[chl_window], elnino34_chl[chl_window], "C2")
ax.set_xlabel(r"$t$ (years)")
ax.set_ylabel(r"chl-$a$ ($\mathrm{mg}\ \mathrm{m}^{-3}$)")  # mg per cubic metre, hence m^{-3}
ax.set_xlim([1990, 2022])
ax.grid()

# linear regression to get the correlation coefficient
_, _, r, _, _ = stats.linregress(elnino34_sst[sst_window], elnino34_chl[chl_window])

print(f"cross correlation (un-lagged) between SST and chl-a is {r:.6f}")

# Q. interpret the correlation
# Q. try a lag analysis on these signals accordingly and see if anything interesting comes out
# Q. try the above, but for low-passed signals (explore the choice of time window)
# Q. (harder) are the relevant lag correlations (if any) consistent with physical rationale for El-Nino,
#             particularly during El-Nino years? (you may have to look up which ones these are)
# Q. (involved) the dataset just loaded also includes phytoplankton concentration, try and do the analogous
#               analyses for the various pairs of data (SST, chl-a, phytoplankton)

In [None]:
# Q. (involved) the code here is dirty (no apologies for that actually) and relies a lot on native python commands,
#               particularly when dates are involved etc. Try to do the same things as above but using pandas
#               (which would be much cleaner for managing the data) and python packages.
#
#       Several things you might want to do:
#          1) put everything into one pandas dataframe so there aren't multiple arrays hanging around
#          2) there is a time-mismatch, so to have only one time dimension, either
#             i ) get rid of some data in SST
#             ii) fill out the shorter array with NaNs or missing values
#          3) there are some commands in pandas that might be useful 
#             e.g. .mean, .sum, .rolling, etc., look these up on Google