## 45. ガウス過程（GP : Gaussian Process）

In [None]:
!pip install GPy

!pip install gpflow

In [None]:
## サンプルコード置き場 : https://github.com/GPflow/docs/tree/develop/doc/source/notebooks
# （colaboから読み込み可能）

# ファイル -> ノートブックを開く -> 「GitHub」タブを選択
#   -> 「GPflow」で検索
#     -> レポジトリ ： GPflow/docs    ブランチ : develop    をそれぞれ選択

### <font color=blue>**1.** </font> ガウス過程回帰（GPR）

#### <font color=green>**1.1.** </font> Gaussian Processes regression: basic introductory example

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_noisy_targets.html

# Author: Vincent Dubourg <vincent.dubourg@gmail.com>
#         Jake Vanderplas <vanderplas@astro.washington.edu>
#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause

In [None]:
import numpy as np
from matplotlib import pyplot as plt

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

np.random.seed(1)

In [None]:
def f(x):
  """Target function x * sin(x) that the Gaussian process should recover.

  Works element-wise on NumPy arrays as well as on plain scalars.
  """
  return np.sin(x) * x

In [None]:
# ----------------------------------------------------------------------
#  Noise-free case: six exact evaluations of f, stored as a column vector
X1 = np.array([1., 3., 5., 6., 7., 8.]).reshape(-1, 1)

# Targets observed at the training inputs
y1 = f(X1).ravel()

# Dense grid over [0, 10] on which the true function, the GP prediction and
# its uncertainty are evaluated
x = np.linspace(0, 10, 1000).reshape(-1, 1)

In [None]:
# GP prior: a constant (amplitude) kernel multiplied by an RBF kernel
kernel = (C(constant_value=1.0, constant_value_bounds=(1e-3, 1e3))
          * RBF(length_scale=10, length_scale_bounds=(1e-2, 1e2)))
gp1 = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

# Kernel hyperparameters are estimated by maximum likelihood during fit
gp1.fit(X1, y1)

# Predictive mean and standard deviation on the evaluation grid
y_pred1, sigma1 = gp1.predict(x, return_std=True)

In [None]:
# True function, training points, GP prediction and the 95% confidence
# interval (prediction +/- 1.96 standard deviations)
band_lower = y_pred1 - 1.9600 * sigma1
band_upper = y_pred1 + 1.9600 * sigma1
# The interval is drawn as one closed polygon: along the lower edge from left
# to right, then back along the upper edge from right to left
polygon_x = np.concatenate([x, x[::-1]])
polygon_y = np.concatenate([band_lower, band_upper[::-1]])

plt.figure(figsize=(12, 8))
plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$')
plt.plot(X1, y1, 'r.', markersize=10, label='Observations')
plt.plot(x, y_pred1, 'b-', label='Prediction')
plt.fill(polygon_x, polygon_y,
         alpha=.5, fc='b', ec='None', label='95% confidence interval')
plt.xlabel('$x$')
plt.ylabel('$f(x)$')
plt.ylim(-10, 15)
plt.legend(loc='upper left')
plt.show()

In [None]:
# ----------------------------------------------------------------------
# Noisy case: 20 evenly spaced inputs, stored as a column vector
X2 = np.linspace(0.1, 9.9, 20).reshape(-1, 1)

# Noise-free targets first, then a per-point noise standard deviation drawn
# uniformly from [0.5, 1.5)
clean_targets = f(X2).ravel()
dy = 0.5 + np.random.random(clean_targets.shape)
# One zero-mean Gaussian draw per observation, scaled by its own dy
noise = np.random.normal(0, dy)
y2 = clean_targets + noise

In [None]:
# Reuse the kernel from the noise-free fit; the per-point noise variances are
# handed to the regressor through `alpha`
noise_variance = dy ** 2
gp2 = GaussianProcessRegressor(kernel=kernel,
                               alpha=noise_variance,
                               n_restarts_optimizer=10)

# Kernel hyperparameters are estimated by maximum likelihood during fit
gp2.fit(X2, y2)

# Predictive mean and standard deviation on the evaluation grid
y_pred2, sigma2 = gp2.predict(x, return_std=True)

In [None]:
# True function, noisy observations (error bars show dy), GP prediction and
# the 95% confidence interval (prediction +/- 1.96 standard deviations)
band_lower = y_pred2 - 1.9600 * sigma2
band_upper = y_pred2 + 1.9600 * sigma2
# Closed polygon: lower edge left-to-right, upper edge right-to-left
polygon_x = np.concatenate([x, x[::-1]])
polygon_y = np.concatenate([band_lower, band_upper[::-1]])

plt.figure(figsize=(12, 8))
plt.plot(x, f(x), 'r:', label=r'$f(x) = x\,\sin(x)$')
plt.errorbar(X2.ravel(), y2, dy, fmt='r.', markersize=10, label='Observations')
plt.plot(x, y_pred2, 'b-', label='Prediction')
plt.fill(polygon_x, polygon_y,
         alpha=.5, fc='b', ec='None', label='95% confidence interval')
plt.xlabel('$x$')
plt.ylabel('$f(x)$')
plt.ylim(-10, 15)
plt.legend(loc='upper left')

plt.show()

#### <font color=green>**1.2.** </font> Gaussian process regression (GPR) with noise-level estimation

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_noisy.html

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#
# License: BSD 3 clause

In [None]:
import numpy as np

from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

In [None]:
rng = np.random.RandomState(0)
# 20 inputs drawn uniformly from [0, 5), stored as a column vector
X = rng.uniform(0, 5, 20).reshape(-1, 1)
# Sinusoidal signal plus Gaussian noise with standard deviation 0.5
signal = 0.5 * np.sin(3 * X[:, 0])
y = signal + rng.normal(0, 0.5, X.shape[0])

In [None]:
# First run: initial length-scale of 100 and initial noise level of 1
kernel = (1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3))
          + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1)))
gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
gp.fit(X, y)

# Predictive mean and covariance on a regular grid; the band half-width is
# the square root of the covariance diagonal (one standard deviation)
X_ = np.linspace(0, 5, 100)
y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
y_std = np.sqrt(np.diag(y_cov))
lml = gp.log_marginal_likelihood(gp.kernel_.theta)

plt.figure(figsize=(12, 8))
plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
                 alpha=0.5, color='k')
# Noise-free generating function and the noisy training points
plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0))
plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
          % (kernel, gp.kernel_, lml))
plt.tight_layout()
plt.show()

In [None]:
# Second run: initial length-scale of 1 and a tiny initial noise level (1e-5)
kernel = (1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
          + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-10, 1e+1)))
gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
gp.fit(X, y)

# Predictive mean and covariance on a regular grid; the band half-width is
# the square root of the covariance diagonal (one standard deviation)
X_ = np.linspace(0, 5, 100)
y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
y_std = np.sqrt(np.diag(y_cov))
lml = gp.log_marginal_likelihood(gp.kernel_.theta)

plt.figure(figsize=(12, 8))
plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
                 alpha=0.5, color='k')
# Noise-free generating function and the noisy training points
plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0))
plt.title("Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s"
          % (kernel, gp.kernel_, lml))
plt.tight_layout()
plt.show()

In [None]:
# Negative log-marginal-likelihood of the last fitted `gp` on a grid of
# (length-scale, noise-level) pairs; the first kernel hyperparameter is held
# at 0.36 throughout
theta0 = np.logspace(-2, 3, 49)  # x-axis values (length-scale)
theta1 = np.logspace(-2, 0, 50)  # y-axis values (noise-level)
Theta0, Theta1 = np.meshgrid(theta0, theta1)
n_rows, n_cols = Theta0.shape
# log_marginal_likelihood expects log-transformed hyperparameters
LML = np.array([
    [gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]]))
     for j in range(n_cols)]
    for i in range(n_rows)])
neg_lml = -LML

# Contour levels are log-spaced from the smallest value on the grid up to a
# fixed cap of 50
vmin, vmax = neg_lml.min(), 50
level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), 50), decimals=1)

plt.figure(figsize=(12, 8))
plt.contour(Theta0, Theta1, neg_lml,
            levels=level, norm=LogNorm(vmin=vmin, vmax=vmax))
plt.colorbar()
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Length-scale")
plt.ylabel("Noise-level")
plt.title("Log-marginal-likelihood")
plt.tight_layout()

plt.show()

#### <font color=green>**1.3.** </font> Comparison of kernel ridge and Gaussian process regression

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_compare_gpr_krr.html

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause

In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared

In [None]:
rng = np.random.RandomState(0)

# Synthetic data: 100 inputs in [0, 15) as a column vector
X = 15 * rng.rand(100, 1)
# Sine target disturbed by uniform noise in (-1.5, 1.5]
target_noise = 3 * (0.5 - rng.rand(X.shape[0]))
y = np.sin(X).ravel() + target_noise

In [None]:
# Kernel ridge regression: the regularisation strength and the kernel
# hyperparameters are selected by cross-validated grid search
length_scales = np.logspace(-2, 2, 10)
periodicities = np.logspace(0, 2, 10)
candidate_kernels = [ExpSineSquared(length_scale, periodicity)
                     for length_scale in length_scales
                     for periodicity in periodicities]
param_grid = {"alpha": [1e0, 1e-1, 1e-2, 1e-3],
              "kernel": candidate_kernels}
kr = GridSearchCV(KernelRidge(), param_grid=param_grid)

# Wall-clock time of the whole grid search
stime = time.time()
kr.fit(X, y)
krr_fit_seconds = time.time() - stime
print("Time for KRR fitting: %.3f" % krr_fit_seconds)

In [None]:
# Periodic kernel plus a white-noise term; hyperparameters are tuned while
# fitting, so no grid search is needed here
gp_kernel = (ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1))
             + WhiteKernel(1e-1))
gpr = GaussianProcessRegressor(kernel=gp_kernel)

# Wall-clock time of the fit
stime = time.time()
gpr.fit(X, y)
gpr_fit_seconds = time.time() - stime
print("Time for GPR fitting: %.3f" % gpr_fit_seconds)

In [None]:
# Kernel ridge prediction on a dense grid of 10000 points over [0, 20]
X_plot = np.linspace(0, 20, 10000).reshape(-1, 1)
stime = time.time()
y_kr = kr.predict(X_plot)
krr_predict_seconds = time.time() - stime
print("Time for KRR prediction: %.3f" % krr_predict_seconds)

In [None]:
# Gaussian process prediction of the mean only (no uncertainty requested)
stime = time.time()
y_gpr = gpr.predict(X_plot, return_std=False)
gpr_predict_seconds = time.time() - stime
print("Time for GPR prediction: %.3f" % gpr_predict_seconds)

In [None]:
# Gaussian process prediction again, this time with the standard deviation
stime = time.time()
y_gpr, y_std = gpr.predict(X_plot, return_std=True)
gpr_predict_std_seconds = time.time() - stime
print("Time for GPR prediction with standard-deviation: %.3f"
      % gpr_predict_std_seconds)

In [None]:
# Compare both fitted models against the noise-free sine curve
lw = 2
krr_label = 'KRR (%s)' % kr.best_params_
gpr_label = 'GPR (%s)' % gpr.kernel_

plt.figure(figsize=(12, 8))
plt.scatter(X, y, c='k', label='data')
plt.plot(X_plot, np.sin(X_plot), color='navy', lw=lw, label='True')
plt.plot(X_plot, y_kr, color='turquoise', lw=lw, label=krr_label)
plt.plot(X_plot, y_gpr, color='darkorange', lw=lw, label=gpr_label)
# Shaded band: GP prediction +/- one standard deviation
plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std,
                 color='darkorange', alpha=0.2)
plt.xlabel('data')
plt.ylabel('target')
plt.xlim(0, 20)
plt.ylim(-4, 4)
plt.title('GPR versus Kernel Ridge')
plt.legend(loc="best",  scatterpoints=1, prop={'size': 8})
plt.show()

#### <font color=green>**1.4.** </font> Basic (Gaussian likelihood) GP regression model


We focus here on the implementation of the models in GPflow; for more intuition on these models, see [A Practical Guide to Gaussian Processes](https://drafts.distill.pub/gp/) and [A Visual Exploration of Gaussian Processes](https://distill.pub/2019/visual-exploration-gaussian-processes/).

In [None]:
import gpflow
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from gpflow.utilities import print_summary

# The lines below are specific to the notebook format
%matplotlib inline
plt.rcParams["figure.figsize"] = (12, 6)

`X` and `Y` denote the input and output values. **NOTE:** `X` and `Y` must be two-dimensional NumPy arrays of shape $N \times 1$ or $N \times D$, where $N$ is the number of rows (one for each data point) and $D$ is the number of input dimensions/features:

In [None]:
# 1-D regression example data hosted in the GPflow documentation repository
DATA_URL = "https://raw.githubusercontent.com/GPflow/docs/develop/doc/source/notebooks/basics/data/regression_1D.csv"
data = np.genfromtxt(DATA_URL, delimiter=",")
# Column 0 holds the inputs, column 1 the outputs; both become (N, 1) arrays
X, Y = (data[:, column].reshape(-1, 1) for column in (0, 1))

_ = plt.plot(X, Y, "kx", mew=2)

We will consider the following probabilistic model:
\begin{equation}
Y_i = f(X_i) + \varepsilon_i\,,
\end{equation}
where $f \sim \mathcal{GP}(\mu(\cdot), k(\cdot, \cdot'))$, and $\varepsilon \sim \mathcal{N}(0, \tau^2 I)$.

##### Choose a kernel

In [None]:
'''Several kernels (covariance functions) are implemented in GPflow. 
You can easily combine them to create new ones (see Manipulating kernels). 
You can also implement new covariance functions, as shown in the Kernel design notebook. 
Here, we will use a simple one:
'''

# Matern 5/2 covariance function, created with GPflow's default parameters
k = gpflow.kernels.Matern52() ###

# Available kernels : https://gpflow.readthedocs.io/en/develop/gpflow/kernels/index.html

In [None]:
'''For more advanced kernels see the advanced kernel notebook
(including kernels defined on subspaces).
A summary of the kernel can be obtained by
'''

In [None]:
# Print a summary of the kernel `k`
print_summary(k)

In [None]:
# Raw string: the text contains the LaTeX command \mu, and in a normal string
# literal "\m" is an invalid escape sequence that triggers a
# DeprecationWarning / SyntaxWarning.  The string value itself is unchanged.
r'''The Matern 5/2 kernel has two parameters: 
`lengthscales`, which encodes the "wiggliness" of the GP, and `variance`, which tunes the amplitude. 
They are both set to 1.0 as the default value. 
For more details on the meaning of the other columns, see Manipulating kernels.

## Choose a mean function (optional)
It is common to choose $\mu = 0$, which is the GPflow default.
However, if there is a clear pattern (such as a mean value of `Y` that is 
far away from 0, or a linear trend in the data), mean functions can  be beneficial. 
Some simple ones are provided in the `gpflow.mean_functions` module.
Here's how to define a linear mean function:
`meanf = gpflow.mean_functions.Linear()`
'''

##### Construct a model

In [None]:
'''A GPflow model is created by instantiating one of the GPflow model classes, in this case GPR. 
We'll make a kernel `k` and instantiate a GPR object using the generated data and the kernel. 
We'll also set the variance of the likelihood to a sensible initial guess.
'''

# GP regression model on the (X, Y) data with kernel `k`; no explicit mean
# function is supplied (mean_function=None)
m = gpflow.models.GPR(data=(X, Y), kernel=k, mean_function=None)

In [None]:
# A summary of the model can be obtained by
print_summary(m)

In [None]:
# In the summary, the first two lines correspond to the kernel parameters,
# and the third one gives the likelihood parameter (the noise variance $\tau^2$ in our model).

# These values can be read and overwritten to provide sensible starting
# points for the optimisation.
# For example: set the noise variance to 0.01 and the lengthscale to 0.3
m.likelihood.variance.assign(0.01)
m.kernel.lengthscales.assign(0.3)

##### Optimize the model parameters

To obtain meaningful predictions, you need to tune the model parameters (that is, the parameters of the kernel, the likelihood, and the mean function if applicable) to the data at hand.

There are several optimizers available in GPflow. Here we use the `Scipy` optimizer, which by default implements the L-BFGS-B algorithm. \
(You can select other algorithms by using the `method=` keyword argument to its `minimize` method; see [the SciPy documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html) for details of available options.)

In [None]:
# GPflow wrapper around the SciPy optimizers
opt = gpflow.optimizers.Scipy()

In order to train the model, we need to maximize the log marginal likelihood.\
GPflow models define a `training_loss` that can be passed to the `minimize` method of an optimizer; in this case it is simply the negative log marginal likelihood.\
We also need to specify the variables to train with
`m.trainable_variables`, and the number of iterations.

In [None]:
# Minimise the model's training loss over its trainable variables, with the
# optimizer limited to 100 iterations, then show the updated parameter values
opt_logs = opt.minimize(m.training_loss, m.trainable_variables, options=dict(maxiter=100))
print_summary(m)

In [None]:
'''Notice how the value column has changed.

The local optimum found by Maximum Likelihood might not be the one you want 
(for example, it might be overfitting or oversmooth). 

This depends on the initial values of the hyperparameters, and is specific to each dataset. 
As an alternative to Maximum Likelihood, Markov Chain Monte Carlo (MCMC) is also available.
'''

##### Make predictions

We can now use the model to make some predictions at the new points `Xnew`.\
You might be interested in predicting two different quantities: the latent function values `f(Xnew)` (the denoised signal), or the values of new observations `y(Xnew)` (signal + noise).\
Because we are dealing with Gaussian probabilistic models, the predictions typically produce a mean and variance as output.\
Alternatively, you can obtain samples of `f(Xnew)` or the log density of the new data points `(Xnew, Ynew)`.

GPflow models have several prediction methods:

 - `m.predict_f` returns the mean and marginal variance of $f$ at the points `Xnew`.

 - `m.predict_f` with argument `full_cov=True` returns the mean and the full covariance matrix of $f$ at the points `Xnew`.

 - `m.predict_f_samples` returns samples of the latent function.

 - `m.predict_y` returns the mean and variance of a new data point (that is, it includes the noise variance).

 - `m.predict_log_density` returns the log density of the observations `Ynew` at `Xnew`.

We use `predict_f` and `predict_f_samples` to plot 95% confidence intervals and samples from the posterior distribution.

In [None]:
## test inputs for prediction; they must have shape (N, D)
xx = np.linspace(-0.1, 1.1, 100)[:, None]

## mean and marginal variance of the latent GP at the test inputs
mean, var = m.predict_f(xx)
std = np.sqrt(var[:, 0])

## 10 posterior function samples, shape (10, 100, 1)
tf.random.set_seed(1)  # for reproducibility
samples = m.predict_f_samples(xx, 10)

## plot: data, posterior mean, 95% band (mean +/- 1.96 std) and the samples
band_lower = mean[:, 0] - 1.96 * std
band_upper = mean[:, 0] + 1.96 * std

plt.figure(figsize=(12, 6))
plt.plot(X, Y, "kx", mew=2)
plt.plot(xx, mean, "C0", lw=2)
plt.fill_between(xx[:, 0], band_lower, band_upper, color="C0", alpha=0.2)

# One thin line per sample: transpose to (100, 10) so each column is a curve
sample_curves = samples[:, :, 0].numpy().T
plt.plot(xx, sample_curves, "C0", linewidth=0.5)
_ = plt.xlim(-0.1, 1.1)

In [None]:
'''## GP regression in higher dimensions

Very little changes when the input space has more than one dimension. 
By default, the `lengthscales` is an isotropic (scalar) parameter. 
It is generally recommended that you allow to tune a different lengthscale 
for each dimension (Automatic Relevance Determination, ARD): 
simply initialize `lengthscales` with an array of length $D$ corresponding to the input dimension of `X`.  
See Manipulating kernels for further information.
'''

#### <font color=green>**1.5.** </font> kernel ごとの比較

In [None]:
## 参考 : Illustration of prior and posterior Gaussian process for different kernels
##       https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_prior_posterior.html

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic,
                                              ExpSineSquared, DotProduct,
                                              ConstantKernel)

In [None]:
# The four covariance functions to compare, each starting at length-scale 1
rbf_kernel = RBF(length_scale=1.0, length_scale_bounds=(0.1, 10.0))
rational_quadratic_kernel = RationalQuadratic(length_scale=1.0, alpha=0.1)
periodic_kernel = ExpSineSquared(length_scale=1.0,
                                 periodicity=1.0,
                                 length_scale_bounds=(0.1, 10.0),
                                 periodicity_bounds=(0.1, 10.0))
matern_kernel = Matern(length_scale=1.0,
                       length_scale_bounds=(1e-1, 10.0),
                       nu=1.0)

kernels = [rbf_kernel,
           rational_quadratic_kernel,
           periodic_kernel,
           matern_kernel]

In [None]:
# Regular grid on [0, 5] used for every prediction and plot
X_line = np.linspace(0, 5, 100)
rn = np.random.RandomState(4)

# 80 training inputs (increased from 60), uniform on [0, 5), as a column vector
X_train = rn.uniform(0, 5, 80).reshape(-1, 1)
# Target: tan of the squared distance from 2.5
offsets = X_train[:, 0] - 2.5
Y_train = np.tan(offsets ** 2)

In [None]:
def draw_graph(gauss, L):
  """Draw the regressor `gauss` and the first `L` training points on the current axes.

  Shows the predictive mean, a band of one standard deviation around it,
  five functions sampled from the model, the first `L` training points and
  the true curve tan((x - 2.5)^2).  Reads the notebook-level `X_line`,
  `X_train` and `Y_train`.
  """
  grid = X_line[:, np.newaxis]
  Y_mean, Y_std = gauss.predict(grid, return_std=True)
  Y_samples = gauss.sample_y(grid, 5)
  band_lower, band_upper = Y_mean - Y_std, Y_mean + Y_std

  # Model: mean, uncertainty band and sampled functions
  plt.plot(X_line, Y_mean, 'b', lw=2, zorder=1)
  plt.fill_between(X_line, band_lower, band_upper, alpha=0.2, color='k')
  plt.plot(X_line, Y_samples, lw=0.5)
  plt.xlim(0, 5)
  plt.ylim(-3, 3)

  # Data used so far and the function that generated it
  seen_x, seen_y = X_train[:L, 0], Y_train[:L]
  plt.scatter(seen_x, seen_y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0))
  plt.plot(X_line, np.tan((X_line- 2.5)**2), 'r', lw=2, zorder=1)

In [None]:
# One figure per kernel with five panels: the unfitted model, then the fit
# after 20, 40, 60 and 80 training points
for kernel in kernels:
  gp = GaussianProcessRegressor(kernel=kernel)
  plt.figure(figsize=(50, 5))

  # Panel 1: model before any data has been seen
  plt.subplot(1, 5, 1)
  draw_graph(gp, 0)

  # Panels 2-5: refit on a growing prefix of the training set
  for panel, n_points in enumerate(range(20, 81, 20), start=2):
    gp.fit(X_train[:n_points], Y_train[:n_points])
    plt.subplot(1, 5, panel)
    draw_graph(gp, n_points)

plt.show()

#### <font color=green>**1.6.** </font> scikit-learn ver. GPR on Mauna Loa CO2 data.

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpr_co2.html

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#
# License: BSD 3 clause

In [None]:
import numpy as np

from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels \
    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared

In [None]:
def load_mauna_loa_atmospheric_co2():
  """Download OpenML dataset 41187 and average its CO2 readings per month.

  The first two data columns are combined into a fractional date
  (column 0 + (column 1 - 1) / 12; presumably year and month).  Consecutive
  rows sharing the same date are averaged.

  Returns:
    months: array of shape (n_months, 1) with the fractional dates.
    avg_ppmvs: array of shape (n_months,) with the mean target per date.
  """
  ml_data = fetch_openml(data_id=41187, as_frame=False)
  years, month_numbers = ml_data.data[:, 0], ml_data.data[:, 1]
  fractional_dates = years + (month_numbers - 1) / 12

  months, ppmv_sums, counts = [], [], []
  for date, ppmv in zip(fractional_dates, ml_data.target):
    if months and date == months[-1]:
      # Same month as the previous row: extend the running sum and count
      ppmv_sums[-1] += ppmv
      counts[-1] += 1
      continue
    # A new month starts here
    months.append(date)
    ppmv_sums.append(ppmv)
    counts.append(1)

  month_column = np.asarray(months).reshape(-1, 1)
  monthly_means = np.asarray(ppmv_sums) / counts
  return month_column, monthly_means

In [None]:
# X: fractional dates as an (n, 1) column; y: monthly average CO2 values
X, y = load_mauna_loa_atmospheric_co2()

In [None]:
# Kernel with the hyperparameter values given in the GPML book, built as a
# sum of four interpretable components

# long term smooth rising trend
k1 = 66.0**2 * RBF(length_scale=67.0)

# seasonal component
k2 = (2.4**2 * RBF(length_scale=90.0)
      * ExpSineSquared(length_scale=1.3, periodicity=1.0))

# medium term irregularity
k3 = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78)

# noise terms
k4 = (0.18**2 * RBF(length_scale=0.134)
      + WhiteKernel(noise_level=0.19**2))

kernel_gpml = k1 + k2 + k3 + k4

In [None]:
# optimizer=None keeps the kernel hyperparameters fixed at the values set
# above; only the data are conditioned on.  Targets are normalised internally.
gp = GaussianProcessRegressor(kernel=kernel_gpml, alpha=0,
                              optimizer=None, normalize_y=True)
gp.fit(X, y)

In [None]:
# Report the kernel in use and the log-marginal-likelihood it attains
lml_gpml = gp.log_marginal_likelihood(gp.kernel_.theta)
print(f"GPML kernel: {gp.kernel_}")
print(f"Log-marginal-likelihood: {lml_gpml:.3f}")

In [None]:
# Prediction grid: from the first observation to 30 units (years on the plot
# axis) beyond the last one
grid_start, grid_end = X.min(), X.max() + 30
X_ = np.linspace(grid_start, grid_end, 1000).reshape(-1, 1)
y_pred, y_std = gp.predict(X_, return_std=True)

In [None]:
# Observations, prediction and a band of +/- one standard deviation
band_lower, band_upper = y_pred - y_std, y_pred + y_std

plt.figure(figsize=(18, 12))
plt.scatter(X, y, c='k', s=10)
plt.plot(X_, y_pred, linewidth=1)
plt.fill_between(X_[:, 0], band_lower, band_upper, alpha=0.5, color='pink')
plt.xlim(X_.min(), X_.max())
plt.xlabel("Year")
plt.ylabel(r"CO$_2$ in ppm")
plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa")
plt.tight_layout()
plt.show()

In [None]:
# Kernel whose hyperparameters are optimised during fitting; the values
# below are only the starting point

# long term smooth rising trend
k1_2 = 50.0**2 * RBF(length_scale=50.0)

# seasonal component (the periodicity is held fixed)
k2_2 = (2.0**2 * RBF(length_scale=100.0)
        * ExpSineSquared(length_scale=1.0, periodicity=1.0,
                         periodicity_bounds="fixed"))

# medium term irregularities
k3_2 = 0.5**2 * RationalQuadratic(length_scale=1.0, alpha=1.0)

# noise terms
k4_2 = (0.1**2 * RBF(length_scale=0.1)
        + WhiteKernel(noise_level=0.1**2,
                      noise_level_bounds=(1e-5, np.inf)))

kernel_2 = k1_2 + k2_2 + k3_2 + k4_2

In [None]:
# Fit with the default optimizer so the kernel hyperparameters are learned
gp2 = GaussianProcessRegressor(kernel=kernel_2, alpha=0, normalize_y=True)
gp2.fit(X, y)

# Report the learned kernel and the log-marginal-likelihood it attains
lml_learned = gp2.log_marginal_likelihood(gp2.kernel_.theta)
print(f"\nLearned kernel: {gp2.kernel_}")
print(f"Log-marginal-likelihood: {lml_learned:.3f}")

In [None]:
# Prediction grid: from the first observation to 30 units (years on the plot
# axis) beyond the last one
grid_start, grid_end = X.min(), X.max() + 30
X_ = np.linspace(grid_start, grid_end, 1000).reshape(-1, 1)
y_pred2, y_std2 = gp2.predict(X_, return_std=True)

In [None]:
# Observations, prediction and a band of +/- one standard deviation
band_lower, band_upper = y_pred2 - y_std2, y_pred2 + y_std2

plt.figure(figsize=(12, 8))
plt.scatter(X, y, c='k')
plt.plot(X_, y_pred2)
plt.fill_between(X_[:, 0], band_lower, band_upper, alpha=0.5, color='k')
plt.xlim(X_.min(), X_.max())
plt.xlabel("Year")
plt.ylabel(r"CO$_2$ in ppm")
plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa")
plt.tight_layout()
plt.show()

In [None]:
# Both fits side by side: (prediction, standard deviation, panel title)
panels = [
    (y_pred, y_std, r"Kernel with parameters given in GPML book"),
    (y_pred2, y_std2, r"Kernel with optimized parameters"),
]

plt.figure(figsize=(18, 6))

for position, (prediction, deviation, panel_title) in enumerate(panels, start=1):
  plt.subplot(1, 2, position)
  plt.scatter(X, y, c='k', s=5)
  plt.plot(X_, prediction, linewidth=1)
  # Band of +/- one standard deviation around the prediction
  plt.fill_between(X_[:, 0], prediction - deviation, prediction + deviation,
                   alpha=0.5, color='pink')
  plt.xlim(X_.min(), X_.max())
  plt.xlabel("Year")
  plt.ylabel(r"CO$_2$ in ppm")
  plt.title(panel_title)
  plt.tight_layout()

plt.show()

In [None]:
# Overlay of both fits in one figure (observations left out on purpose)
plt.figure(figsize=(12, 8))
#plt.scatter(X, y, c='k', s=5)

# Predictions: GPML-book kernel in red, learned kernel in blue
for prediction, line_color in ((y_pred, 'r'), (y_pred2, 'b')):
  plt.plot(X_, prediction, color=line_color)

# Bands of +/- one standard deviation, in the same order as the lines
bands = (
    (y_pred, y_std, 0.3, 'green'),
    (y_pred2, y_std2, 0.5, 'orange'),
)
for prediction, deviation, band_alpha, band_color in bands:
  plt.fill_between(X_[:, 0], prediction - deviation, prediction + deviation,
                   alpha=band_alpha, color=band_color)

plt.xlim(X_.min(), X_.max())
plt.xlabel("Year")
plt.ylabel(r"CO$_2$ in ppm")
plt.title(r"Atmospheric CO$_2$ concentration at Mauna Loa")
plt.tight_layout()
plt.show()

#### <font color=green>**1.7.** </font> GPflow ver. Fitting a Gaussian process kernel (Mauna Loa CO2 data)

In [None]:
## 出典 : https://peterroelants.github.io/posts/gaussian-process-kernel-fitting/

In [None]:
# Imports
import os
import logging
import sys
import warnings
from itertools import islice

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp

from tqdm import tqdm
import bokeh
import bokeh.io
import bokeh.plotting
import bokeh.models
from IPython.display import display, HTML

In [None]:
# Suppress all Python warnings and TensorFlow log records below ERROR.
# NOTE(review): this also hides deprecation warnings that may be useful.
warnings.simplefilter("ignore")
logger = tf.get_logger()
logger.setLevel(logging.ERROR)
# Send bokeh output to the notebook, without the loading banner
bokeh.io.output_notebook(hide_banner=True)


# Short aliases for the TensorFlow Probability sub-modules used below
tfb = tfp.bijectors
tfd = tfp.distributions
tfk = tfp.math.psd_kernels

# Seed NumPy and TensorFlow for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Monthly in-situ CO2 record from the Scripps CO2 program website.
# Reading, dropping rows with missing values and stripping whitespace from
# the column names are chained into a single expression.
co2_df = (
    pd.read_csv(
        'https://scrippsco2.ucsd.edu/assets/data/atmospheric/stations/in_situ_co2/monthly/monthly_in_situ_co2_mlo.csv',
        #'./monthly_in_situ_co2_mlo.csv',
        header=54,            # Data starts here
        skiprows=[55, 56],    # Headers consist of multiple rows
        usecols=[3, 4],       # Only keep the 'Date' and 'CO2' columns
        names=['Date', 'CO2'],
        na_values='-99.99',   # NaNs are denoted as '-99.99'
        dtype=np.float64,
    )
    .dropna()
    .rename(columns=str.strip)
)
#

In [None]:
# Plot the full CO2 record as an interactive bokeh line chart
fig = bokeh.plotting.figure(
    width=600, height=300, 
    x_range=(1958, 2020), y_range=(310, 420))  # fixed axis limits
fig.xaxis.axis_label = 'Date'
fig.yaxis.axis_label = 'CO₂ (ppm)'
# Two title rows are stacked above the plot: the italic one is added first,
# the 14pt main title second
fig.add_layout(bokeh.models.Title(
    text='In situ air measurements at Mauna Loa, Observatory, Hawaii',
    text_font_style="italic"), 'above')
fig.add_layout(bokeh.models.Title(
    text='Atmospheric CO₂ concentrations', 
    text_font_size="14pt"), 'above')
fig.line(co2_df.Date, co2_df.CO2,
         legend_label='All data',
         line_width=2, line_color='midnightblue')
fig.legend.location = 'top_left'
# Hide the toolbar automatically
fig.toolbar.autohide = True
bokeh.plotting.show(fig)
#

In [None]:
# Measurements dated before 2008 form the observed (training) set; everything
# from 2008 onwards is held out for prediction
date_split_predict = 2008
before_split = co2_df.Date < date_split_predict
from_split_on = co2_df.Date >= date_split_predict

df_observed = co2_df[before_split]
print(f'{len(df_observed)} measurements in the observed set')
df_predict = co2_df[from_split_on]
print(f'{len(df_predict)} measurements in the test set')
#

In [None]:
# Constant GP mean: the average of the observed CO2 values, kept as a
# one-element float64 tensor
observed_co2_mean = np.mean(df_observed.CO2.values)
observations_mean = tf.constant([observed_co2_mean], dtype=tf.float64)


def mean_fn(_):
  """Return the constant observation mean, ignoring the index points."""
  return observations_mean
#

In [None]:
# Define the kernel with trainable parameters. 
# Note we transform some of the trainable variables to ensure
#  they stay positive.

# Use float64 because this means that the kernel matrix will have 
#  less numerical issues when computing the Cholesky decomposition

# Constrain to make sure certain parameters are strictly positive:
# an Exp bijector followed by a shift of the smallest positive normal float64
constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())

In [None]:
# Smooth kernel hyperparameters: amplitude and length scale both start at 10
# and are kept positive through `constrain_positive`
smooth_amplitude = tfp.util.TransformedVariable(
    initial_value=10., bijector=constrain_positive, dtype=np.float64,
    name='smooth_amplitude')
smooth_length_scale = tfp.util.TransformedVariable(
    initial_value=10., bijector=constrain_positive, dtype=np.float64,
    name='smooth_length_scale')

# Smooth kernel (exponentiated quadratic)
smooth_kernel = tfk.ExponentiatedQuadratic(
    amplitude=smooth_amplitude, 
    length_scale=smooth_length_scale)

In [None]:
# Local periodic kernel hyperparameters: the amplitude starts at 5, the two
# length scales and the period at 1; all are kept positive
periodic_amplitude = tfp.util.TransformedVariable(
    initial_value=5.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_amplitude')
periodic_length_scale = tfp.util.TransformedVariable(
    initial_value=1.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_length_scale')
periodic_period = tfp.util.TransformedVariable(
    initial_value=1.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_period')
periodic_local_length_scale = tfp.util.TransformedVariable(
    initial_value=1.0, bijector=constrain_positive, dtype=np.float64,
    name='periodic_local_length_scale')

# Local periodic kernel: a periodic kernel multiplied by an exponentiated
# quadratic kernel with its own length scale
local_periodic_kernel = (
    tfk.ExpSinSquared(
        amplitude=periodic_amplitude, 
        length_scale=periodic_length_scale,
        period=periodic_period) * 
    tfk.ExponentiatedQuadratic(
        length_scale=periodic_local_length_scale))

In [None]:
# Short-medium term irregularities kernel hyperparameters: all three start
# at 1 and are kept positive
irregular_amplitude = tfp.util.TransformedVariable(
    initial_value=1., bijector=constrain_positive, dtype=np.float64,
    name='irregular_amplitude')
irregular_length_scale = tfp.util.TransformedVariable(
    initial_value=1., bijector=constrain_positive, dtype=np.float64,
    name='irregular_length_scale')
irregular_scale_mixture = tfp.util.TransformedVariable(
    initial_value=1., bijector=constrain_positive, dtype=np.float64,
    name='irregular_scale_mixture')

# Short-medium term irregularities kernel (rational quadratic)
irregular_kernel = tfk.RationalQuadratic(
    amplitude=irregular_amplitude,
    length_scale=irregular_length_scale,
    scale_mixture_rate=irregular_scale_mixture)

In [None]:
# Noise variance of observations
# Start out with a medium-to high noise (initial value 1, kept positive)
observation_noise_variance = tfp.util.TransformedVariable(
    initial_value=1, bijector=constrain_positive, dtype=np.float64,
    name='observation_noise_variance')

In [None]:
# Every hyperparameter that the optimizer is allowed to adjust
hyperparameters = (
    smooth_amplitude,
    smooth_length_scale,
    periodic_amplitude,
    periodic_length_scale,
    periodic_period,
    periodic_local_length_scale,
    irregular_amplitude,
    irregular_length_scale,
    irregular_scale_mixture,
    observation_noise_variance,
)
# Gradients are taken with respect to the first underlying variable of each
# TransformedVariable
trainable_variables = [
    hyperparameter.variables[0] for hyperparameter in hyperparameters
]

#

In [None]:
# Sum all kernels to single kernel containing all characteristics
# (smooth trend + local periodicity + irregularities)
kernel = (smooth_kernel + local_periodic_kernel + irregular_kernel)

In [None]:
# Endless stream of shuffled mini-batches drawn from the observed data
batch_size = 128

observed_dates = df_observed.Date.values.reshape(-1, 1)  # shape (N, 1)
observed_co2 = df_observed.CO2.values                    # shape (N,)

dataset = tf.data.Dataset.from_tensor_slices((observed_dates, observed_co2))
# Shuffle over the whole set, repeat indefinitely, then cut into batches
dataset = dataset.shuffle(buffer_size=len(df_observed))
dataset = dataset.repeat(count=None)
batched_dataset = dataset.batch(batch_size)
#

In [None]:
# NOTE(review): `experimental_compile` is a deprecated alias of `jit_compile`
# in newer TensorFlow releases - confirm against the installed version.
@tf.function(autograph=False, experimental_compile=False)  # tf.function gives more efficient function evaluation
def gp_loss_fn(index_points, observations):
  """Return the negative log-likelihood of `observations` under the GP.

  The Gaussian process is built at `index_points` from the notebook-level
  `mean_fn`, `kernel` and `observation_noise_variance`.
  """
  process = tfd.GaussianProcess(
      mean_fn=mean_fn,
      kernel=kernel,
      index_points=index_points,
      observation_noise_variance=observation_noise_variance)
  return -process.log_prob(observations)

In [None]:
# Fit hyperparameters
# Adam optimizer (learning rate 0.001) applied to `trainable_variables`
# in the training loop.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [None]:
# Training loop
batch_nlls = []  # (iteration, NLL) on each mini-batch, for plotting
full_ll = []  # (iteration, NLL) on all observed data, for plotting
nb_iterations = 10001
# enumerate()/islice() have no len(), so tqdm is told the total explicitly;
# otherwise the progress bar is a bare counter with no percentage or ETA.
for i, (index_points_batch, observations_batch) in tqdm(
    enumerate(islice(batched_dataset, nb_iterations)),
    total=nb_iterations, file=sys.stdout):
  # Run optimization for single batch
  with tf.GradientTape() as tape:
    loss = gp_loss_fn(index_points_batch, observations_batch)
  grads = tape.gradient(loss, trainable_variables)
  optimizer.apply_gradients(zip(grads, trainable_variables))
  batch_nlls.append((i, loss.numpy()))
  # Every 100th iteration, also evaluate the loss on all observed data
  if i % 100 == 0:
    ll = gp_loss_fn(
        index_points=df_observed.Date.values.reshape(-1, 1),
        observations=df_observed.CO2.values)
    full_ll.append((i, ll.numpy()))

# 2分半くらいかかる

In [None]:
# Plot NLL over iterations
# The per-batch NLL uses the default (left) y axis; the all-data NLL uses a
# second y range that is shown on the right.
fig = bokeh.plotting.figure(width=600, height=400, 
                            x_range=(0, nb_iterations), y_range=(50, 200))
fig.add_layout(bokeh.models.Title(
    text='Negative Log-Likelihood (NLL) during training', 
    text_font_size="14pt"), 'above')
fig.xaxis.axis_label = 'iteration'
fig.yaxis.axis_label = 'NLL batch'

# First plot: batch_nlls holds (iteration, loss) pairs, unzipped into x and y
fig.line(*zip(*batch_nlls), legend_label='Batch data',
         line_width=2, line_color='midnightblue')

# Second plot
# Setting the second y axis range name and range
fig.extra_y_ranges = {'fig1ax2': bokeh.models.Range1d(start=130, end=250)}
fig.line(*zip(*full_ll), legend_label='All observed data',
         line_width=2, line_color='red', y_range_name='fig1ax2')
# Adding the second axis to the plot.
fig.add_layout(bokeh.models.LinearAxis(
    y_range_name='fig1ax2', axis_label='NLL all'), 'right')

fig.legend.location = 'top_right'
fig.toolbar.autohide = True
bokeh.plotting.show(fig)
#

In [None]:
# Tabulate the fitted hyperparameter values and render them as HTML
variables = [
    smooth_amplitude,
    smooth_length_scale,
    periodic_amplitude,
    periodic_length_scale,
    periodic_period,
    periodic_local_length_scale,
    irregular_amplitude,
    irregular_length_scale,
    irregular_scale_mixture,
    observation_noise_variance
]

# name[:-2] drops the last two characters of the underlying variable's name
# (presumably the ':0' suffix TensorFlow appends — confirm).
data = [
    (hyperparameter.variables[0].name[:-2], hyperparameter.numpy())
    for hyperparameter in variables
]
df_variables = pd.DataFrame(data, columns=['Hyperparameters', 'Value'])
variables_table_html = df_variables.to_html(
    index=False, float_format=lambda x: f'{x:.4f}')
display(HTML(variables_table_html))
#

In [None]:
# GP regression model conditioned on the observed data, evaluated at the
# prediction dates, using the kernel and noise variance defined above
gp_posterior_predict = tfd.GaussianProcessRegressionModel(
    kernel=kernel,
    mean_fn=mean_fn,
    observation_index_points=df_observed.Date.values.reshape(-1, 1),
    observations=df_observed.CO2.values,
    observation_noise_variance=observation_noise_variance,
    index_points=df_predict.Date.values.reshape(-1, 1))

# Mean and standard deviation of the posterior at the prediction dates
posterior_mean_predict = gp_posterior_predict.mean()
posterior_std_predict = gp_posterior_predict.stddev()

In [None]:
# Plot posterior predictions

# Get posterior predictions as NumPy arrays
μ = posterior_mean_predict.numpy()
σ = posterior_std_predict.numpy()

# Plot
fig = bokeh.plotting.figure(
    width=600, height=400,
    x_range=(2008, 2021), y_range=(380, 415))
fig.xaxis.axis_label = 'Date'
fig.yaxis.axis_label = 'CO₂ (ppm)'
fig.add_layout(bokeh.models.Title(
    text='Posterior predictions conditioned on observations before 2008.',
    text_font_style="italic"), 'above')
fig.add_layout(bokeh.models.Title(
    text='Atmospheric CO₂ concentrations', 
    text_font_size="14pt"), 'above')
# All data points from co2_df
fig.circle(
    co2_df.Date, co2_df.CO2, legend_label='True data',
    size=2, line_color='midnightblue')
# Posterior mean over the prediction dates
fig.line(
    df_predict.Date.values, μ, legend_label='μ (predictions)',
    line_width=2, line_color='firebrick')
# Prediction interval: a closed polygon that runs forward along the upper
# bound (μ + 2σ) and back along the lower bound (μ - 2σ)
band_x = np.append(
    df_predict.Date.values, df_predict.Date.values[::-1])
band_y = np.append(
    (μ + 2*σ), (μ - 2*σ)[::-1])
fig.patch(
    band_x, band_y, color='firebrick', alpha=0.4, 
    line_color='firebrick', legend_label='2σ')

fig.legend.location = 'top_left'
fig.toolbar.autohide = True
bokeh.plotting.show(fig)
#

In [None]:
# Version info
# The string below lists a fixed set of versions (presumably the ones the
# example was written against); the print calls report the versions of the
# environment the notebook is currently running in.
'''Python: 3.8.5
Numpy: 1.18.5
Pandas: 1.1.3
TensorFlow: 2.3.1
TensorFlow Probability: 0.11.1
Bokeh: 2.2.2
'''

print('Python: {}.{}.{}'.format(*sys.version_info[:3]))
print('Numpy: {}'.format(np.__version__))
print('Pandas: {}'.format(pd.__version__))
print('TensorFlow: {}'.format(tf.__version__))
print('TensorFlow Probability: {}'.format(tfp.__version__))
print('Bokeh: {}'.format(bokeh.__version__))
#

### <font color=blue>**2.** </font> ガウス過程　関連度自動決定（GPARD）

In [None]:
from sklearn import preprocessing
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

import numpy as np

In [None]:
# Load the diabetes dataset and standardise its features
diabetes = load_diabetes()

sc = preprocessing.StandardScaler()
diabetes_data = sc.fit_transform(diabetes.data)  # fit, then transform the same data
diabetes_target = diabetes.target

In [None]:
# Split into train/test sets. A fixed random_state makes the split (and so
# the fitted length scales below) reproducible across Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    diabetes_data, diabetes_target, random_state=0)

# NOTE(review): X is not read by the following GPARD cells — presumably a
# leftover; kept so that the notebook namespace is unchanged.
X = list(range(10))
# GPy expects the targets as a 2-D (N, 1) array
Y_train = Y_train[:, np.newaxis]
n_features = X_train.shape[1]

In [None]:
import GPy

# Define the kernel: an RBF kernel over all n_features inputs with ARD=True.
kernel = GPy.kern.RBF(input_dim=n_features, variance=0.01, ARD=True)
# kernel = GPy.kern.Matern52(n_features, ARD=True)

# Build the Gaussian-process regression model on the training split.
model = GPy.models.GPRegression(X_train, Y_train, kernel)

# Optimise the hyperparameters (the original note calls this MAP estimation).
# NOTE(review): no priors are set in this cell, so this is presumably plain
# marginal-likelihood maximisation — confirm.
model.optimize(messages=True, max_iters=1e5)

In [None]:
import matplotlib.pyplot as plt

# Length scales of the kernel after optimisation, one per input dimension
ls = list(model.kern.lengthscale)

# The reciprocal of each length scale is used as the relevance of the
# corresponding explanatory variable
weights = [1 / length_scale for length_scale in ls]
xs = list(range(len(weights)))

plt.figure(figsize=(10, 6))
plt.bar(xs, weights)
plt.xlabel("$m$")
plt.ylabel("$1/l_m$")
plt.show()

### <font color=blue>**3.** </font> ガウス過程分類（GPC）

#### <font color=green>**3.1.** </font> Gaussian process classification (GPC) on iris dataset

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpc_iris.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [None]:
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
# Integer class labels; also used below to index the colour array
y = np.array(iris.target, dtype=int)

In [None]:
# Isotropic RBF: a single length scale
kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
# Anisotropic RBF: one length scale per feature (two features are used)
kernel = 1.0 * RBF([1.0, 1.0])
gpc_rbf_anisotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)

In [None]:
h = .02  # spacing between neighbouring mesh points

# Mesh covering the range of both features, padded by 1 on every side
first_feature, second_feature = X[:, 0], X[:, 1]
x_min = first_feature.min() - 1
x_max = first_feature.max() + 1
y_min = second_feature.min() - 1
y_max = second_feature.max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h))

In [None]:
# One subplot per classifier: predicted class probabilities as an RGB image,
# with the training points drawn on top.
titles = ["Isotropic RBF", "Anisotropic RBF"]
plt.figure(figsize=(10, 5))
for i, clf in enumerate((gpc_rbf_isotropic, gpc_rbf_anisotropic)):
  # Plot the predicted probabilities. For that, we will assign a color to
  # each point in the mesh [x_min, x_max]x[y_min, y_max].
  plt.subplot(1, 2, i + 1)
  Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])

  # Put the result into a color plot: the three class probabilities of each
  # mesh point become the three colour channels
  Z = Z.reshape((xx.shape[0], xx.shape[1], 3))
  plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower")

  # Plot also the training points
  plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y],
              edgecolors=(1, 1, 1)  # changed from (0, 0, 0) to (1, 1, 1)
              )
  plt.xlabel('Sepal length')
  plt.ylabel('Sepal width')
  plt.xlim(xx.min(), xx.max())
  plt.ylim(yy.min(), yy.max())
  plt.xticks(())
  plt.yticks(())
  plt.title("%s, LML: %.3f" %
              (titles[i], clf.log_marginal_likelihood(clf.kernel_.theta)))

plt.tight_layout()
plt.show()

#### <font color=green>**3.2.** </font> Iso-probability lines for Gaussian Processes classification (GPC)

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpc_isoprobability.html

# Author: Vincent Dubourg <vincent.dubourg@gmail.com>
# Adapted to GaussianProcessClassifier:
#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD 3 clause

In [None]:
import numpy as np

from matplotlib import pyplot as plt
from matplotlib import cm

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import DotProduct, ConstantKernel as C

In [None]:
def g(x):
  """Evaluate the function whose sign defines the two classes.

  `x` is indexed as a 2-D array: column 0 is the first coordinate and
  column 1 the second. Returns 5 - x[:, 1] - 0.5 * x[:, 0]**2 per row.
  """
  first, second = x[:, 0], x[:, 1]
  return 5. - second - .5 * first ** 2.

In [None]:
# A few constants
# Half-width of the square [-lim, lim] x [-lim, lim] used for the evaluation
# grid and the image extent below
lim = 8

In [None]:
# Design of experiments
# Eight fixed 2-D training points, one (x_1, x_2) pair per row
X = np.array([[-4.61611719, -6.00099547],
              [4.10469096, 5.32782448],
              [0.00000000, -0.50000000],
              [-6.17289014, -4.6984743],
              [1.3109306, -6.93271427],
              [-5.03823144, 3.10584743],
              [-2.87600388, 6.74310541],
              [5.21301203, 4.26386883]])

In [None]:
# Class labels: 1 where g is positive at the design point, 0 otherwise
y = (g(X) > 0).astype(int)

In [None]:
# Build the classifier with a scaled, squared dot-product kernel and fit it
# to the design points (fit() returns the estimator itself)
kernel = C(0.1, (1e-5, np.inf)) * DotProduct(sigma_0=0.1) ** 2
gp = GaussianProcessClassifier(kernel=kernel).fit(X, y)
print("Learned kernel: %s " % gp.kernel_)

In [None]:
# Evaluate the true function and the predicted class-1 probability on a
# res x res grid over [-lim, lim]^2
res = 50
axis_values = np.linspace(- lim, lim, res)
x1, x2 = np.meshgrid(axis_values, axis_values)
xx = np.vstack([x1.ravel(), x2.ravel()]).T  # one (x1, x2) pair per row

y_true = g(xx).reshape((res, res))
y_prob = gp.predict_proba(xx)[:, 1].reshape((res, res))

In [None]:
# Plot the probabilistic classification iso-values
fig = plt.figure(1, figsize=(10,10))  ###
ax = fig.gca()
ax.axes.set_aspect('equal')
plt.xticks([])
plt.yticks([])
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.xlabel('$x_1$')
plt.ylabel('$x_2$')

# Rows of y_prob follow increasing x_2 (meshgrid order), so row 0 has to be
# drawn at the bottom. With the default origin ('upper') the image would be
# vertically flipped relative to the contour lines drawn below.
cax = plt.imshow(y_prob, cmap=cm.gray_r, alpha=0.8,
                 extent=(-lim, lim, -lim, lim), origin='lower')
# No `norm` is passed: colorbar takes its normalisation from the mappable,
# and the colour limits are set by plt.clim below.
cb = plt.colorbar(cax, ticks=[0., 0.2, 0.4, 0.6, 0.8, 1.])
# y_prob is the probability of class 1, i.e. of g(x) > 0
cb.set_label(r'${\rm \mathbb{P}}\left[\widehat{G}(\mathbf{x}) > 0\right]$')
plt.clim(0, 1)

# Training points: red for class 0, blue for class 1
plt.plot(X[y <= 0, 0], X[y <= 0, 1], 'r.', markersize=12)
plt.plot(X[y > 0, 0], X[y > 0, 1], 'b.', markersize=12)

# True decision boundary g(x) = 0
plt.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot')

# Iso-probability lines of the fitted classifier
cs = plt.contour(x1, x2, y_prob, [0.666], colors='b', linestyles='solid')
plt.clabel(cs, fontsize=11)

cs = plt.contour(x1, x2, y_prob, [0.5], colors='k', linestyles='dashed')
plt.clabel(cs, fontsize=11)

cs = plt.contour(x1, x2, y_prob, [0.334], colors='r', linestyles='solid')
plt.clabel(cs, fontsize=11)

plt.show()

#### <font color=green>**3.3.** </font> Probabilistic predictions with Gaussian process classification (GPC)

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpc.html

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#
# License: BSD 3 clause

In [None]:
import numpy as np

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [None]:
# 100 points drawn uniformly from [0, 5); class 1 where the feature > 2.5.
# The first `train_size` points are used for fitting below.
train_size = 50
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 100).reshape(-1, 1)
y = (X[:, 0] > 2.5).astype(int)

In [None]:
# Fit two classifiers on the first `train_size` points: one keeps the initial
# kernel hyperparameters (optimizer=None), the other optimises them.
X_fit, y_fit = X[:train_size], y[:train_size]

gp_fix = GaussianProcessClassifier(
    kernel=1.0 * RBF(length_scale=1.0), optimizer=None)
gp_fix.fit(X_fit, y_fit)

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X_fit, y_fit)

lml_initial = gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)
print("Log Marginal Likelihood (initial): %.3f" % lml_initial)
lml_optimized = gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)
print("Log Marginal Likelihood (optimized): %.3f" % lml_optimized)

# Accuracy and log-loss are both evaluated on the training points
accuracy_initial = accuracy_score(y_fit, gp_fix.predict(X_fit))
accuracy_optimized = accuracy_score(y_fit, gp_opt.predict(X_fit))
print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_initial, accuracy_optimized))

log_loss_initial = log_loss(y_fit, gp_fix.predict_proba(X_fit)[:, 1])
log_loss_optimized = log_loss(y_fit, gp_opt.predict_proba(X_fit)[:, 1])
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss_initial, log_loss_optimized))

In [None]:
# Plot posteriors
# Scatter the train/test labels and overlay the class-1 probability predicted
# by the fixed-kernel (red) and optimised-kernel (blue) classifiers.
plt.figure(figsize=(12,8))  ###
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
            edgecolors=(0, 0, 0))
# Evenly spaced inputs at which the probability curves are evaluated
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)
plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b',
         label="Optimized kernel: %s" % gp_opt.kernel_)
plt.xlabel("Feature")
plt.ylabel("Class 1 probability")
plt.xlim(0, 5)
plt.ylim(-0.25, 1.5)
plt.legend(loc="best")
plt.show()

In [None]:
# Log-marginal-likelihood landscape over (magnitude, length scale)
plt.figure(figsize=(12,8))  ###
theta0 = np.logspace(0, 8, 30)
theta1 = np.logspace(-1, 1, 29)
Theta0, Theta1 = np.meshgrid(theta0, theta1)

# LML[row, col] is the log-marginal likelihood at
# (Theta0[row, col], Theta1[row, col]); the hyperparameters are passed as logs
n_rows, n_cols = Theta0.shape
LML = np.array([
    [gp_opt.log_marginal_likelihood(
        np.log([Theta0[row, col], Theta1[row, col]]))
     for col in range(n_cols)]
    for row in range(n_rows)])

# Mark the hyperparameters of the fixed and of the optimised classifier
for fitted in (gp_fix, gp_opt):
    hyperparameters = np.exp(fitted.kernel_.theta)
    plt.plot(hyperparameters[0], hyperparameters[1], 'ko', zorder=10)

plt.pcolor(Theta0, Theta1, LML)
plt.xscale("log")
plt.yscale("log")
plt.colorbar()
plt.xlabel("Magnitude")
plt.ylabel("Length-scale")
plt.title("Log-marginal-likelihood")

plt.show()

#### <font color=green>**3.4.** </font> Illustration of Gaussian process classification (GPC) on the XOR dataset

In [None]:
## 出典 : https://scikit-learn.org/stable/auto_examples/gaussian_process/plot_gpc_xor.html

# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#
# License: BSD 3 clause

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct

In [None]:
# 50 x 50 evaluation grid over [-3, 3]^2
grid_axis = np.linspace(-3, 3, 50)
xx, yy = np.meshgrid(grid_axis, grid_axis)

# 200 standard-normal 2-D points, labelled by the XOR of the coordinate signs
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

In [None]:
'''
排他的論理和 xor （exclusive or）とは、
1 xor 1 = 0 
1 xor 0 = 1
0 xor 1 = 1
0 xor 0 = 0
と定められるものである。
'''

In [None]:
# The two kernels compared on the XOR data: an RBF kernel and a squared
# dot-product kernel, each scaled by a constant factor of 1.0
kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]

In [None]:
# fit the model
# One subplot per kernel: class-1 probability over the grid, the 0.5 contour
# and the training points.
plt.figure(figsize=(10, 5))
for i, kernel in enumerate(kernels):
  clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)

  # plot the decision function for each datapoint on the grid
  # (column 1 of predict_proba, reshaped back onto the grid)
  Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
  Z = Z.reshape(xx.shape)

  plt.subplot(1, 2, i + 1)
  image = plt.imshow(Z, interpolation='nearest',
                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                       aspect='auto', origin='lower', cmap=plt.cm.PuOr_r)
  # Decision boundary: the 0.5 probability level
  contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2,
                           colors=['k'])
  plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired,
                edgecolors=(0, 0, 0))
  plt.xticks(())
  plt.yticks(())
  plt.axis([-3, 3, -3, 3])
  plt.colorbar(image)
  plt.title("%s\n Log-Marginal-Likelihood:%.3f"
          % (clf.kernel_, clf.log_marginal_likelihood(clf.kernel_.theta)),
            fontsize=12)

plt.tight_layout()
plt.show()

#### <font color=green>**3.5.** </font> Basic (binary) GP classification model

In [None]:
## 出典 : https://gpflow.readthedocs.io/en/develop/notebooks/basics/classification.html

'''This notebook shows how to build a GP classification model using variational inference.
Here we consider binary (two-class, 0 vs. 1) classification only (there is a separate notebook on multiclass classification).
We first look at a one-dimensional example, and then show how you can adapt this when the input space is two-dimensional.
'''

In [None]:
import numpy as np
import gpflow
import tensorflow as tf

import matplotlib.pyplot as plt

%matplotlib inline

# Default figure size for the plots below; individual cells may override it
# with an explicit plt.figure(figsize=...)
plt.rcParams["figure.figsize"] = (8, 4)

##### One-dimensional example

In [None]:
## First of all, let's have a look at the data. `X` and `Y` denote the input and output values.

**NOTE:** `X` and `Y` must be two-dimensional NumPy arrays, $N \times 1$ or $N \times D$, where $D$ is the number of input dimensions/features, with the same number of rows as $N$ (one for each data point):

In [None]:
# Download the 1-D inputs and labels and reshape each into a column (N x 1)
X = np.genfromtxt("https://raw.githubusercontent.com/GPflow/docs/develop/doc/source/notebooks/basics/data/classif_1D_X.csv").reshape(-1, 1)
Y = np.genfromtxt("https://raw.githubusercontent.com/GPflow/docs/develop/doc/source/notebooks/basics/data/classif_1D_Y.csv").reshape(-1, 1)

plt.figure(figsize=(10, 6))
# Labels against inputs; assigning to `_` suppresses the cell's repr output
_ = plt.plot(X, Y, "C3x", ms=8, mew=2)

In [None]:
## Reminders on GP classification

# For a binary classification model using GPs, we can simply use a `Bernoulli` likelihood. 
# The details of the generative model are as follows:

__1. Define the latent GP:__ we start from a Gaussian process $f \sim \mathcal{GP}(0, k(\cdot, \cdot'))$:

In [None]:
# Matern-5/2 kernel (variance 20) and its covariance matrix on a 200-point grid
k = gpflow.kernels.Matern52(variance=20.0)
x_grid = np.linspace(0, 6, 200).reshape(-1, 1)
K = k(x_grid)

# Draw five samples from the zero-mean multivariate normal with covariance K:
# multiply the Cholesky factor of K by standard-normal draws
rng = np.random.RandomState(6)

L = np.linalg.cholesky(K)
standard_normal_draws = rng.randn(200, 5)
f_grid = L @ standard_normal_draws

# All samples as thin lines, with sample 1 emphasised by a thicker line
plt.plot(x_grid, f_grid, "C0", linewidth=1)
_ = plt.plot(x_grid, f_grid[:, 1], "C0", linewidth=2)

__2. Squash them to $[0, 1]$:__ the samples of the GP are mapped to $[0, 1]$.

By default, GPflow uses the standard normal cumulative distribution function (inverse probit function): $p(x) = \Phi(f(x)) = \frac{1}{2} (1 + \operatorname{erf}(f(x) / \sqrt{2}))$.\
This choice has the advantage that predictive mean, variance and density can be computed analytically, but any choice of invlink is possible, e.g. the logit $p(x) = \frac{\exp(f(x))}{1 + \exp(f(x))}$.\
Simply pass another function as the `invlink` argument to the `Bernoulli` likelihood class.

In [None]:
def invlink(f):
    """Apply the inverse link of a default `Bernoulli` likelihood to `f`.

    The result of `Bernoulli().invlink(f)` is converted with `.numpy()`
    before it is returned.
    """
    bernoulli = gpflow.likelihoods.Bernoulli()
    squashed = bernoulli.invlink(f)
    return squashed.numpy()


# Squash every prior sample through the inverse link and plot the results;
# sample 1 is emphasised with a thicker line, as in the previous figure
p_grid = invlink(f_grid)
plt.plot(x_grid, p_grid, "C1", linewidth=1)
_ = plt.plot(x_grid, p_grid[:, 1], "C1", linewidth=2)

__3. Sample from a Bernoulli:__ for each observation point $X_i$, the class label $Y_i \in \{0, 1\}$ is generated by sampling from a Bernoulli distribution $Y_i \sim \mathcal{B}(p(X_i))$.

In [None]:
# Select some input locations
# (30 random indices into the 200-point grid)
ind = rng.randint(0, 200, (30,))
X_gen = x_grid[ind]

# evaluate probability and get Bernoulli draws
# Only sample 1 of p_grid is used; the slice 1:2 keeps the result 2-D
p = p_grid[ind, 1:2]
Y_gen = rng.binomial(1, p)

# plot
# The probability curve, its values at the selected inputs, and the sampled labels
plt.plot(x_grid, p_grid[:, 1], "C1", linewidth=2)
plt.plot(X_gen, p, "C1o", ms=6)
_ = plt.plot(X_gen, Y_gen, "C3x", ms=8, mew=2)

In [None]:
#### Implementation with GPflow

For the model described above, the posterior $f(x)|Y$ (say $p$) is not Gaussian any more and does not have a closed-form expression.\
A common approach is then to look for the best approximation of this posterior by a tractable distribution (say $q$) such as a Gaussian distribution.\
In variational inference, the quality of an approximation is measured by the Kullback-Leibler divergence $\mathrm{KL}[q \| p]$.
For more details on this model, see Nickisch and Rasmussen (2008).

The inference problem is thus turned into an optimization problem: finding the best parameters for $q$.\
In our case, we introduce $U \sim \mathcal{N}(q_\mu, q_\Sigma)$, and we choose $q$ to have the same distribution as $f | f(X) = U$.\
The parameters $q_\mu$ and $q_\Sigma$ can be seen as parameters of $q$, which can be optimized in order to minimise  $\mathrm{KL}[q \| p]$.

This variational inference model is called `VGP` in GPflow:

In [None]:
# Variational GP classifier on the 1-D data: Bernoulli likelihood,
# Matern-5/2 kernel
m = gpflow.models.VGP(
    (X, Y), likelihood=gpflow.likelihoods.Bernoulli(), kernel=gpflow.kernels.Matern52()
)

# Minimise the model's training loss over its trainable variables
opt = gpflow.optimizers.Scipy()
opt.minimize(m.training_loss, variables=m.trainable_variables)

In [None]:
# We can now inspect the result of the optimization with `gpflow.utilities.print_summary(m)`:
# (fmt="notebook" selects the notebook output format)
gpflow.utilities.print_summary(m, fmt="notebook")

In [None]:
# In this table, the first two lines are associated with the kernel parameters, 
# and the last two correspond to the variational parameters.

**NOTE:** In practice, $q_\Sigma$ is actually parameterized by its lower-triangular square root $q_\Sigma = q_\text{sqrt} q_\text{sqrt}^T$ in order to ensure its positive-definiteness.

For more details on how to handle models in GPflow (getting and setting parameters, fixing some of them during optimization, using priors, and so on), see Manipulating GPflow models.

In [None]:
#### Predictions

Finally, we will see how to use model predictions to plot the resulting model.\
We will replicate the figures of the generative model above, but using the approximate posterior distribution given by the model.

In [None]:
plt.figure(figsize=(12, 8))

# bubble fill the predictions
# (band of mean ± 2 standard deviations of the latent function)
mu, var = m.predict_f(x_grid)

plt.fill_between(
    x_grid.flatten(),
    np.ravel(mu + 2 * np.sqrt(var)),
    np.ravel(mu - 2 * np.sqrt(var)),
    alpha=0.3,
    color="C0",
)

# plot samples
# Ten samples of the latent function on the grid; the seed fixes the draws.
# squeeze().T arranges them so that each column is one sample.
tf.random.set_seed(6)
samples = m.predict_f_samples(x_grid, 10).numpy().squeeze().T

plt.plot(x_grid, samples, "C0", lw=1)

# plot p-samples
# (the same samples mapped through the inverse link)
p = invlink(samples)
plt.plot(x_grid, p, "C1", lw=1)

# plot data
plt.plot(X, Y, "C3x", ms=8, mew=2)
plt.ylim((-3, 3))

##### Two-dimensional example

In [None]:
# In this section we will use the following data:
# comma-separated inputs and labels downloaded from the GPflow docs repository
X = np.loadtxt("https://raw.githubusercontent.com/GPflow/docs/develop/doc/source/notebooks/basics/data/banana_X_train", delimiter=",")
Y = np.loadtxt("https://raw.githubusercontent.com/GPflow/docs/develop/doc/source/notebooks/basics/data/banana_Y_train", delimiter=",").reshape(-1, 1)
# Boolean mask selecting the points labelled 1
mask = Y[:, 0] == 1

# Scatter the two classes in different colours
plt.figure(figsize=(6, 6))
plt.plot(X[mask, 0], X[mask, 1], "oC0", mew=0, alpha=0.5)
_ = plt.plot(X[np.logical_not(mask), 0], X[np.logical_not(mask), 1], "oC1", mew=0, alpha=0.5)

In [None]:
# The model definition is the same as above, apart from the kernel, which is
# a SquaredExponential here.
# NOTE(review): the original comment said the kernel is told it operates over
# a two-dimensional input space, but no such argument appears in this call —
# presumably a leftover from an earlier API; confirm.
m = gpflow.models.VGP(
    (X, Y), kernel=gpflow.kernels.SquaredExponential(), likelihood=gpflow.likelihoods.Bernoulli()
)

# The optimisation is capped at 25 iterations of L-BFGS-B
opt = gpflow.optimizers.Scipy()
opt.minimize(
    m.training_loss, variables=m.trainable_variables, options=dict(maxiter=25), method="L-BFGS-B"
)
# in practice, the optimization needs around 250 iterations to converge
# in practice, the optimization needs around 250 iterations to converge

We can now plot the predicted decision boundary between the two classes.\
To do so, we can equivalently plot the contour lines $E[f(x)|Y]=0$, or $E[g(f(x))|Y]=0.5$.\
We will do the latter, because it allows us to introduce the `predict_y` function, which returns the mean and variance at test points:

In [None]:
# 40 x 40 evaluation grid over [-3, 3]^2, flattened to one (x, y) pair per row
x_grid = np.linspace(-3, 3, 40)
xx, yy = np.meshgrid(x_grid, x_grid)
Xplot = np.vstack((xx.flatten(), yy.flatten())).T

p, _ = m.predict_y(Xplot)  # here we only care about the mean
plt.figure(figsize=(7, 7))
# Training points, coloured by class
plt.plot(X[mask, 0], X[mask, 1], "oC0", mew=0, alpha=0.5)
plt.plot(X[np.logical_not(mask), 0], X[np.logical_not(mask), 1], "oC1", mew=0, alpha=0.5)

# Decision boundary: contour of the predictive mean at 0.5
_ = plt.contour(
    xx,
    yy,
    p.numpy().reshape(*xx.shape),
    [0.5],  # plot the p=0.5 contour line only
    colors="k",
    linewidths=1.8,
    zorder=100,
)

In [None]:
## References
# Hannes Nickisch and Carl Edward Rasmussen. 'Approximations for binary Gaussian process classification'. 
# Journal of Machine Learning Research 9(Oct):2035--2078, 2008.

### <font color=blue>**4.** </font> ガウス過程　潜在変数モデル（SVGP）

#### <font color=green>**4.1.** </font> Stochastic Variational Inference for scalability with SVGP

One of the main criticisms of Gaussian processes is their scalability to large datasets.\
In this notebook, we illustrate how to use the state-of-the-art Stochastic Variational Gaussian Process (SVGP) (*Hensman et al., 2013*) to overcome this problem.

In [None]:
%matplotlib inline
import itertools
import numpy as np
import time
import gpflow
import tensorflow as tf
import matplotlib.pyplot as plt
from gpflow.ci_utils import ci_niter

plt.style.use("ggplot")

# for reproducibility of this notebook:
# `rng` is the NumPy generator used for the data below; the second line sets
# TensorFlow's global seed
rng = np.random.RandomState(123)
tf.random.set_seed(42)

In [None]:
## Generating data

For this notebook example, we generate 10,000 noisy observations from a test function:
\begin{equation}
f(x) = \sin(3\pi x) + 0.3\cos(9\pi x) + \frac{\sin(7 \pi x)}{2}
\end{equation}

In [None]:
def func(x):
    """Noiseless test function sin(3*pi*x) + 0.3*cos(9*pi*x) + 0.5*sin(7*pi*x).

    Uses `np.pi` rather than the literal 3.14 so that the code matches the
    documented formula; accepts a scalar or a NumPy array and evaluates
    element-wise.
    """
    return (np.sin(x * 3 * np.pi)
            + 0.3 * np.cos(x * 9 * np.pi)
            + 0.5 * np.sin(x * 7 * np.pi))


N = 10000  # Number of training observations

# Inputs are uniform on [-1, 1); targets add Gaussian noise with standard
# deviation 0.2 to the test function
X = rng.rand(N, 1) * 2 - 1  # X values
Y = func(X) + 0.2 * rng.randn(N, 1)  # Noisy Y values
data = (X, Y)

In [None]:
# We plot the data along with the noiseless generating function:
plt.plot(X, Y, "x", alpha=0.2)
# 1000 evenly spaced inputs on [-1.1, 1.1] as a column, for the noiseless curve
Xt = np.linspace(-1.1, 1.1, 1000)[:, None]
Yt = func(Xt)
_ = plt.plot(Xt, Yt, c="k")

In [None]:
## Building the model
'''The main idea behind SVGP is to approximate the true GP posterior 
with a GP conditioned on a small set of "inducing" values. 
This smaller set can be thought of as summarizing the larger dataset. 
For this example, we will select a set of 50 inducing locations that are initialized from the training dataset:
'''

In [None]:
M = 50  # Number of inducing locations

kernel = gpflow.kernels.SquaredExponential()
Z = X[:M, :].copy()  # Initialize inducing locations to the first M inputs in the dataset

# SVGP model with a Gaussian likelihood; num_data is set to the full dataset
# size N
m = gpflow.models.SVGP(kernel, gpflow.likelihoods.Gaussian(), Z, num_data=N)

In [None]:
## Likelihood computation: batch vs. minibatch
# First we showcase the model's performance using the whole dataset to compute the ELBO.

In [None]:
# Wrap the model's ELBO method in a tf.function
elbo = tf.function(m.elbo)

In [None]:
# A `tf.function`-wrapped method is re-traced and compiled at every call when
# it is given numpy arrays instead of tf.Tensors, so the data is converted to
# tensors once up front.
tensor_data = tuple(tf.convert_to_tensor(array) for array in data)
elbo(tensor_data)  # run it once to trace & compile

In [None]:
%%timeit
elbo(tensor_data)

In [None]:
# We can speed up this calculation by using minibatches of the data.
# For this example, we use minibatches of size 100.

minibatch_size = 100

# Endlessly repeated dataset, shuffled with a buffer of N elements
train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat().shuffle(N)

# Iterator over minibatches of size `minibatch_size`
train_iter = iter(train_dataset.batch(minibatch_size))

# ELBO computed on the full dataset, kept as the reference value
ground_truth = elbo(tensor_data).numpy()

In [None]:
%%timeit
elbo(next(train_iter))

In [None]:
## Stochastical estimation of ELBO
'''The minibatch estimate should be an unbiased estimator of the `ground_truth`. 
Here we show a histogram of the value from different evaluations, 
together with its mean and the ground truth. 
The small difference between the mean of the minibatch estimations and 
the ground truth shows that the minibatch estimator is working as expected.
'''

In [None]:
# ELBO estimates from the next 100 minibatches of the iterator
evals = [elbo(minibatch).numpy() for minibatch in itertools.islice(train_iter, 100)]

In [None]:
# Histogram of the minibatch estimates, with the full-data ELBO (solid black)
# and the mean of the estimates (dashed green) marked as vertical lines
plt.hist(evals, label="Minibatch estimations")
plt.axvline(ground_truth, c="k", label="Ground truth")
plt.axvline(np.mean(evals), c="g", ls="--", label="Minibatch mean")
plt.legend()
plt.title("Histogram of ELBO evaluations using minibatches")
print("Discrepancy between ground truth and minibatch estimate:", ground_truth - np.mean(evals))

In [None]:
## Minibatches speed up computation
'''The reason for using minibatches is that it decreases the time needed to make an optimization step, 
because estimating the objective is computationally cheaper with fewer data points. 
Here we plot the change in time required with the size of the minibatch. 
We see that smaller minibatches result in a cheaper estimate of the objective.
'''

In [None]:
# Evaluate objective for different minibatch sizes
# For ten batch proportions between 1% and 100% of the data, time 20 ELBO
# evaluations and keep the estimates.
minibatch_proportions = np.logspace(-2, 0, 10)
times = []
objs = []
for mbp in minibatch_proportions:
    batchsize = int(N * mbp)
    train_iter = iter(train_dataset.batch(batchsize))
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # is not affected by wall-clock adjustments (unlike time.time()).
    start_time = time.perf_counter()
    objs.append([elbo(minibatch) for minibatch in itertools.islice(train_iter, 20)])
    times.append(time.perf_counter() - start_time)

In [None]:
# The figure is bound to `fig`, not `f`: the name `f` is the target function
# f(x) defined in the GPR section at the top of this notebook, and rebinding
# it to a Figure would break those cells if they are run again afterwards.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left: time taken for the 20 evaluations at each minibatch proportion
ax1.plot(minibatch_proportions, times, "x-")
ax1.set_xlabel("Minibatch proportion")
ax1.set_ylabel("Time taken")

# Right: the individual ELBO estimates at each minibatch proportion
ax2.plot(minibatch_proportions, np.array(objs), "kx")
ax2.set_xlabel("Minibatch proportion")
ax2.set_ylabel("ELBO estimates")

In [None]:
## Running stochastic optimization
# First we create a utility function that plots the model's predictions:

In [None]:
def plot(title=""):
    """Plot the training data and the current predictions of model `m`.

    Draws the training points, the predictive mean on 100 test inputs in
    [-1, 1] with a band of two standard deviations around it, and the
    inducing locations as tick marks at y = 0.

    :param title: title of the figure
    """
    test_inputs = np.linspace(-1, 1, 100)[:, None]
    pred_mean, pred_var = m.predict_y(test_inputs)
    pred_std = pred_var ** 0.5

    plt.figure(figsize=(12, 4))
    plt.title(title)
    plt.plot(X, Y, "x", label="Training points", alpha=0.2)
    (mean_line,) = plt.plot(
        test_inputs, pred_mean, lw=1.5, label="Mean of predictive posterior")
    # The band reuses the colour of the mean line
    plt.fill_between(
        test_inputs[:, 0],
        (pred_mean - 2 * pred_std)[:, 0],
        (pred_mean + 2 * pred_std)[:, 0],
        color=mean_line.get_color(),
        alpha=0.6,
        lw=1.5,
    )
    inducing_locations = m.inducing_variable.Z.numpy()
    plt.plot(inducing_locations, np.zeros_like(inducing_locations), "k|",
             mew=2, label="Inducing locations")
    plt.legend(loc="lower right")


plot(title="Predictions before training")

In [None]:
'''Now we can train our model. 
For optimizing the ELBO, we use the Adam Optimizer (Kingma and Ba 2015) 
which is designed for stochastic objective functions. 
We create a `run_adam` utility function  to perform the optimization.
'''

In [None]:
# Minibatch size read by `run_adam` below
minibatch_size = 100

# We turn off training for inducing point locations
gpflow.set_trainable(m.inducing_variable, False)


def run_adam(model, iterations):
    """
    Optimise `model` with Adam on minibatches and record the ELBO.

    Minibatches of size `minibatch_size` are drawn from the notebook-level
    `train_dataset`.

    :param model: GPflow model
    :param iterations: number of iterations
    :return: list with the ELBO (negative training loss) evaluated at every
        10th iteration, starting with iteration 0
    """
    batches = iter(train_dataset.batch(minibatch_size))
    training_loss = model.training_loss_closure(batches, compile=True)
    optimizer = tf.optimizers.Adam()

    @tf.function
    def optimization_step():
        optimizer.minimize(training_loss, model.trainable_variables)

    elbo_history = []
    for step in range(iterations):
        optimization_step()
        if step % 10:
            continue
        elbo_history.append(-training_loss().numpy())
    return elbo_history

In [None]:
# Run the optimisation loop; ci_niter(20000) supplies the iteration count.

maxiter = ci_niter(20000)

logf = run_adam(m, maxiter)
# run_adam records the ELBO every 10th iteration, hence the step of 10
plt.plot(np.arange(0, maxiter, 10), logf)
plt.xlabel("iteration")
_ = plt.ylabel("ELBO")

In [None]:
# Finally, we plot the model's predictions.
# (same helper as used for the "before training" figure)
plot("Predictions after training")

In [None]:
## References:
# Hensman, James, Nicolo Fusi, and Neil D. Lawrence. "Gaussian processes for big data." Uncertainty in Artificial Intelligence (2013).
# Kingma, Diederik P., and Jimmy Ba. "Adam: A method for stochastic optimization." arXiv preprint arXiv:1412.6980 (2014).

#### <font color=green>**4.2.** </font> Oil Flow Data

In [None]:
## datasetの説明 : https://inverseprobability.com/3PhaseData.html

In [None]:
# NOTE(review): the wildcard import pulls every pylab name into the notebook
# namespace (it rebinds `plot`, the helper defined in the SVGP section).
# It is kept because later cells may rely on it — confirm and remove.
from pylab import *
import matplotlib.mlab 
import numpy

# Download the oil-flow training inputs and their label rows. loadtxt is
# called through the explicitly imported numpy module rather than through the
# wildcard import.
X = numpy.loadtxt('https://raw.githubusercontent.com/jiai-tus/FirstTerm/main/20210427/OilFlowData/DataTrn.txt')
Y = numpy.loadtxt('https://raw.githubusercontent.com/jiai-tus/FirstTerm/main/20210427/OilFlowData/DataTrnLbls.txt')

In [None]:
X

In [None]:
# Collapse each three-column label row into one number: the columns are
# weighted by 0, 1 and 2 and summed.
# NOTE: this rebinds Y from a 2-D array to a list, so the cell cannot be run
# twice without reloading Y.
Y = [row[0]*0 + row[1]*1 + row[2]*2 for row in Y]

In [None]:
from sklearn import datasets, preprocessing

# Standardise the oil-flow inputs in place (X is rebound to the scaled array)
sc = preprocessing.StandardScaler()
X = sc.fit_transform(X)

In [None]:
import GPy

# Dimension of the latent space
input_dim =2
# Sum of Linear, RBF (ARD), Matern-5/2 (ARD), Bias and White kernels over the
# latent space
kernel = GPy.kern.Linear(input_dim)+ GPy.kern.RBF(input_dim, ARD=True)+ GPy.kern.Matern52(input_dim, ARD=True) + GPy.kern.Bias(input_dim)+ GPy.kern.White(input_dim)
# GPLVM on the standardised inputs, optimised for at most 1e3 iterations
model = GPy.models.GPLVM(X, input_dim=input_dim, kernel=kernel)
model.optimize(messages=True, max_iters=1e3)

# 7分くらいかかる

In [None]:
# Plot the learned latent space, labelled with Y
model.plot_latent(labels=Y,figsize=(10,10))

In [None]:
# The individual kernels to compare, each defined over the latent space
kernels = [GPy.kern.Linear(input_dim),
           GPy.kern.RBF(input_dim, ARD=True),
           GPy.kern.Matern52(input_dim, ARD=True)]

In [None]:
# Fit one GPLVM per kernel and plot the latent space of each
for latent_kernel in kernels:
  model = GPy.models.GPLVM(X, input_dim=input_dim, kernel=latent_kernel)
  model.optimize(messages=True, max_iters=1e3)
  model.plot_latent(labels=Y,figsize=(10,10))

# total 14分くらいかかる