## 42. 変分ベイズ（Variational Bayes）　続き

### <font color = blue>**4.** </font> ベイズ・ロジスティック回帰

In [None]:
## 出典： https://github.com/msamunetogetoge

In [None]:
import numpy as np
import pandas as pd

from scipy.special import expit as sigmoid

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import confusion_matrix

In [None]:
def likelifood(w, w_0, S, t, x):
  """Return the unnormalised log-posterior of Bayesian logistic regression.

  The value is the Gaussian prior term ``-1/2 (w - w_0)^T S^{-1} (w - w_0)``
  plus the Bernoulli log-likelihood ``sum_n t_n log y_n + (1 - t_n) log(1 - y_n)``
  with ``y_n = sigmoid(w^T x_n)``.

  Args:
    w: Weight column vector, shape (m, 1).
    w_0: Prior mean, same shape as ``w``.
    S: Prior covariance matrix, shape (m, m).
    t: Binary targets (0/1), N elements; any shape such as (N,) or (N, 1).
    x: Inputs with one sample per column, shape (m, N).

  Returns:
    The log-posterior value (a (1, 1) array when ``w`` is a column vector).
  """
  w_1 = w - w_0
  # Flatten activations and targets so the products below are element-wise.
  # A (N, 1) target array would otherwise broadcast against the (N,)
  # activations into an (N, N) matrix and inflate the sum.
  a = np.asarray(np.dot(w.T, x)).reshape(-1,)
  t = np.asarray(t, dtype=float).reshape(-1,)
  log_prior = -1/2 * np.dot(np.dot(w_1.T, np.linalg.inv(S)), w_1)
  # log(sigmoid(a)) = -logaddexp(0, -a) and log(1 - sigmoid(a)) = -logaddexp(0, a);
  # this form stays finite when the sigmoid saturates at 0 or 1.
  log_lik = -np.sum(t*np.logaddexp(0, -a) + (1-t)*np.logaddexp(0, a))
  lf = log_prior + log_lik
  return lf

In [None]:
class SGD():
  """Fixed-step gradient iteration on the weights of a Bayesian logistic regression.

  ``__init__`` stores: ``r`` (step size), ``w_0`` (prior mean, also the
  starting point), ``S_0`` (prior covariance), ``x`` (inputs, one sample per
  column), ``t`` (binary targets) and ``th`` (threshold on the relative change
  of the objective used as the stopping criterion).
  """

  def __init__(self, w_0, S_0, x, t, rate, th):
    self.r = rate
    self.w_0 = w_0
    self.S_0 = S_0
    self.x = x
    self.t = t
    self.th = th

  def diff(self, w):
    """Return the gradient of the log-posterior at ``w`` as a column vector."""
    # Prior term: -(w - w_0)^T S_0^{-1}.
    Nd = -np.dot((w-self.w_0).T, np.linalg.inv(self.S_0.T))
    # Likelihood term: sum_n (t_n - y_n) x_n^T. Uses the targets stored on
    # the instance rather than a module-level variable named `t`.
    Br = np.dot(self.t.T - sigmoid(np.dot(w.T, self.x)), self.x.T)
    d = Nd+Br
    d = d.reshape(-1,1)
    return d

  def learn(self, N):
    """Run at most ``N`` update steps from ``w_0`` and return the final weights.

    Stops early once the relative change of ``likelifood`` between two
    consecutive iterates falls below ``th``. With ``N == 0`` the starting
    point ``w_0`` is returned.
    """
    self.w = self.w_0
    self.w_new = self.w_0
    # Progress is printed about ten times; clamp the period to 1 so that
    # N < 10 does not cause a modulo by zero.
    report_every = max(1, int(N/10))
    lf_old = likelifood(self.w, self.w_0, self.S_0, self.t, self.x)
    for i in range(N):
      self.d = self.diff(self.w)
      # NOTE(review): diff() is the gradient of the log-posterior, so this
      # step moves *down* that objective. The evaluation cells in this file
      # pick class 0 when sigmoid(w^T x) >= 0.5, which matches this sign, so
      # it is kept unchanged here -- confirm the intended convention.
      self.w_new = self.w - self.r * self.d
      lf_new = likelifood(self.w_new, self.w_0, self.S_0, self.t, self.x)
      c = (lf_new - lf_old)/lf_old
      self.w = self.w_new
      self.lf = lf_new
      # The new value is the "old" value of the next iteration.
      lf_old = lf_new
      if i % report_every == 0:
        print("End iteration of {}".format(i+1))
      if np.abs(c)<self.th:
        print("Convergence!! itr={}".format(i+1))
        break
    print("End the learning")
    return self.w_new

In [None]:
## データの準備

iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target_names[iris.target]
df.head()

In [None]:
df.target.unique()

In [None]:
data = df.target=="setosa"
df_reg_index = df[data].index
df_reg = df.drop(index=df_reg_index)
df_reg.head()

In [None]:
df_reg.tail()

In [None]:
y = df_reg.target
x = df_reg.drop(columns="target")

stats = x.describe().T
def norm(x):
  return (x - stats['mean']) /stats['std']

m=len(x.columns)
w_0 = np.zeros((m,1))
S=np.eye(m)

x=norm(x).T
#x=x.T

In [None]:
oe = OrdinalEncoder()
encoded = oe.fit_transform(np.array(y).reshape(-1,1))
t = encoded

In [None]:
SG = SGD(w_0=w_0, S_0=S, x=x, t=t, rate=0.0001,th=0.00001)
w_new = SG.learn(10000)

In [None]:
SG.diff(w_0)

In [None]:
# Laplace approximation of the posterior covariance around w_new:
#   S_N^{-1} = S_0^{-1} + sum_n y_n (1 - y_n) x_n x_n^T
# The covariance handed to the sampler below is the inverse of that precision.
x_mat = np.asarray(x)  # one sample per column
y = sigmoid(np.dot(w_new.T, x_mat)).reshape(-1,)
S_N_inv = np.linalg.inv(S) + np.dot(x_mat * (y * (1 - y)), x_mat.T)
S = np.linalg.inv(S_N_inv)

In [None]:
# Draw five weight vectors from N(w_new, S) and print a confusion matrix for each.
w_s = np.random.multivariate_normal(w_new.reshape(-1,), S, 5)
for w_sample in w_s:
  y = sigmoid(np.dot(w_sample, x))
  # argmax over [y, 1 - y]: index 0 when y >= 0.5, index 1 otherwise.
  pred = [np.argmax([p, 1-p]) for p in y]
  cm = confusion_matrix(t, pred)
  print(cm)

In [None]:
# Confusion matrix for the point estimate w_new.
y = sigmoid(np.dot(w_new.T, x)).reshape(-1,)
# argmax over [y, 1 - y]: index 0 when y >= 0.5, index 1 otherwise.
t_pred = [np.argmax([p, 1-p]) for p in y]
cm = confusion_matrix(t, t_pred)
print(cm)
## Accuracy is about 80%

In [None]:
x = df_reg.drop(columns="target")
lr = LogisticRegression()
lr.fit(x, t.ravel())
t_pred = lr.predict(x)
cm = confusion_matrix(t, t_pred)
print(cm)

## sklearn のロジスティック回帰では95%くらい

In [None]:
print(w_new, "\n")
print(lr.coef_)

### <font color = blue>**5.** </font> ライブラリのサンプルコード


In [None]:
# TFP Probabilistic Layers: Regression
# https://www.tensorflow.org/probability/examples/Probabilistic_Layers_Regression


## Copyright 2019 The TensorFlow Probability Authors.
## Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
# In this example we show how to fit regression models using TFP's "probabilistic layers."

In [None]:
## Dependencies & Prerequisites

# Import

from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()

import tensorflow_probability as tfp

sns.reset_defaults()
#sns.set_style('whitegrid')
#sns.set_context('talk')
sns.set_context(context='talk',font_scale=0.7)

# %matplotlib inline

tfd = tfp.distributions

In [None]:
## Motivation
# Wouldn't it be great if we could use TFP to specify a probabilistic model then simply minimize the negative log-likelihood, i.e.,

def negloglik(y, rv_y):
  """Return the negated ``rv_y.log_prob(y)``; used as the Keras loss below."""
  return -rv_y.log_prob(y)

# Well not only is it possible, but this colab shows how! (In context of linear regression problems.)

In [None]:
# Synthetic 1-D regression data with input-dependent noise.

w0 = 0.125
b0 = 5.
x_range = [-20, 60]

def load_dataset(n=150, n_tst=150):
  """Generate a reproducible noisy training set and an evenly spaced test grid.

  Args:
    n: Number of training points drawn uniformly from ``x_range``.
    n_tst: Number of evenly spaced test inputs covering ``x_range``.

  Returns:
    ``(y, x, x_tst)`` with shapes ``(n,)``, ``(n, 1)`` and ``(n_tst, 1)``;
    ``x_tst`` is float32.
  """
  # Re-seeding NumPy's global generator makes every call return the same data.
  np.random.seed(43)
  lo, hi = x_range
  span = hi - lo

  def noise_scale(v):
    # Noise standard deviation grows quadratically from the left edge of the range.
    frac = (v - lo) / span
    return 3 * (0.25 + frac**2.)

  x = span * np.random.rand(n) + lo
  eps = np.random.randn(n) * noise_scale(x)
  y = (w0 * x * (1. + np.sin(x)) + b0) + eps
  x_tst = np.linspace(lo, hi, num=n_tst).astype(np.float32)
  return y, x[:, np.newaxis], x_tst[:, np.newaxis]

y, x, x_tst = load_dataset()

In [None]:
## Case 1: No Uncertainty

# Build model.
model = tf.keras.Sequential([
  tf.keras.layers.Dense(1),
  #tf.keras.layers.Dense(1),
  tfp.layers.DistributionLambda(lambda t: tfd.Normal(loc=t, scale=1)),
])

# Do inference.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)
model.fit(x, y, epochs=1000, verbose=False);

# Profit.
[print(np.squeeze(w.numpy())) for w in model.weights];
yhat = model(x_tst)
assert isinstance(yhat, tfd.Distribution)

In [None]:
## Figure 1: No uncertainty.

w = np.squeeze(model.layers[-2].kernel.numpy())
b = np.squeeze(model.layers[-2].bias.numpy())

#plt.figure(figsize=[6, 1.5])  # inches
plt.figure(figsize=[8, 5])  # inches
plt.plot(x, y, 'b.', label='observed');
plt.plot(x_tst, yhat.mean(),'r', label='mean', linewidth=4);
plt.ylim(-0.,17);
plt.yticks(np.linspace(0, 15, 4)[1:]);
plt.xticks(np.linspace(*x_range, num=9));

ax=plt.gca();
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('data', 0))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
#ax.spines['left'].set_smart_bounds(True)
#ax.spines['bottom'].set_smart_bounds(True)
plt.legend(loc='center left', fancybox=True, framealpha=0., bbox_to_anchor=(1.05, 0.5))

#plt.savefig('/tmp/fig1.png', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
## Case 2: Aleatoric Uncertainty

# Build model.
model = tf.keras.Sequential([
                             tf.keras.layers.Dense(1 + 1),
                             tfp.layers.DistributionLambda(
                                 lambda t: tfd.Normal(loc=t[..., :1],
                                                      scale=1e-3 + tf.math.softplus(0.05 * t[...,1:]))),
                             ])

# Do inference.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)
model.fit(x, y, epochs=1000, verbose=False);

# Profit.
[print(np.squeeze(w.numpy())) for w in model.weights];
yhat = model(x_tst)
assert isinstance(yhat, tfd.Distribution)

In [None]:
# Build model.
model = tf.keras.Sequential([
                             tf.keras.layers.Dense(1 + 1),
                             tfp.layers.DistributionLambda(
                                 lambda t: tfd.Normal(loc=t[..., :1],
                                                      scale=1e-3 + tf.math.softplus(0.05 * t[...,1:]))),
                             ])

# Do inference.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)
model.fit(x, y, epochs=1000, verbose=False);

# Profit.
[print(np.squeeze(w.numpy())) for w in model.weights];
yhat = model(x_tst)
assert isinstance(yhat, tfd.Distribution)

In [None]:
#Figure 2: Aleatoric Uncertainty

#plt.figure(figsize=[6, 1.5])  # inches
plt.figure(figsize=[8, 5])  # inches

plt.plot(x, y, 'b.', label='observed');

m = yhat.mean()
s = yhat.stddev()

plt.plot(x_tst, m, 'r', linewidth=4, label='mean');
plt.plot(x_tst, m + 2 * s, 'g', linewidth=2, label=r'mean + 2 stddev');
plt.plot(x_tst, m - 2 * s, 'g', linewidth=2, label=r'mean - 2 stddev');

plt.ylim(-0.,17);
plt.yticks(np.linspace(0, 15, 4)[1:]);
plt.xticks(np.linspace(*x_range, num=9));

ax=plt.gca();
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('data', 0))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
#ax.spines['left'].set_smart_bounds(True)
#ax.spines['bottom'].set_smart_bounds(True)
plt.legend(loc='center left', fancybox=True, framealpha=0., bbox_to_anchor=(1.05, 0.5))

#plt.savefig('/tmp/fig2.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
## Case 3: Epistemic Uncertainty

# Specify the surrogate posterior over `keras.layers.Dense` `kernel` and `bias`.
def posterior_mean_field(kernel_size, bias_size=0, dtype=None):
  """Build the surrogate posterior passed to ``tfp.layers.DenseVariational``.

  Returns a Keras ``Sequential`` holding ``2 * n`` trainable values
  (``n = kernel_size + bias_size``): the first ``n`` are the means and the
  last ``n`` are raw values mapped to positive scales of an independent Normal.
  """
  n = kernel_size + bias_size
  # log(e - 1) is the softplus pre-image of 1, so a raw value of 0 gives scale ~1.
  c = np.log(np.expm1(1.))

  def make_posterior(t):
    loc = t[..., :n]
    scale = 1e-5 + tf.nn.softplus(c + t[..., n:])
    return tfd.Independent(tfd.Normal(loc=loc, scale=scale),
                           reinterpreted_batch_ndims=1)

  return tf.keras.Sequential([
      tfp.layers.VariableLayer(2 * n, dtype=dtype),
      tfp.layers.DistributionLambda(make_posterior),
  ])

In [None]:
# Specify the prior over `keras.layers.Dense` `kernel` and `bias`.
def prior_trainable(kernel_size, bias_size=0, dtype=None):
  """Build the prior passed to ``tfp.layers.DenseVariational``.

  Returns a Keras ``Sequential`` holding ``kernel_size + bias_size`` trainable
  means of an independent Normal with fixed scale 1.
  """
  n = kernel_size + bias_size

  def make_prior(t):
    return tfd.Independent(tfd.Normal(loc=t, scale=1),
                           reinterpreted_batch_ndims=1)

  return tf.keras.Sequential([
      tfp.layers.VariableLayer(n, dtype=dtype),
      tfp.layers.DistributionLambda(make_prior),
  ])

In [None]:
# Build model.
model = tf.keras.Sequential([
  tfp.layers.DenseVariational(1, posterior_mean_field, prior_trainable, kl_weight=1/x.shape[0]),
  tfp.layers.DistributionLambda(lambda t: tfd.Normal(loc=t, scale=1)),
])

# Do inference.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)
model.fit(x, y, epochs=1000, verbose=False);

# Profit.
[print(np.squeeze(w.numpy())) for w in model.weights];
yhat = model(x_tst)
assert isinstance(yhat, tfd.Distribution)

In [None]:
# Figure 3: Epistemic Uncertainty

#plt.figure(figsize=[6, 1.5])  # inches
plt.figure(figsize=[8, 5])  # inches

plt.clf();
plt.plot(x, y, 'b.', label='observed');

yhats = [model(x_tst) for _ in range(100)]
avgm = np.zeros_like(x_tst[..., 0])
for i, yhat in enumerate(yhats):
  m = np.squeeze(yhat.mean())
  s = np.squeeze(yhat.stddev())
  if i < 25:
    plt.plot(x_tst, m, 'g', label='ensemble means' if i == 0 else None, linewidth=0.5)
  avgm += m
plt.plot(x_tst, avgm/len(yhats), 'r', label='overall mean', linewidth=4)

plt.ylim(-0.,17);
plt.yticks(np.linspace(0, 15, 4)[1:]);
plt.xticks(np.linspace(*x_range, num=9));

ax=plt.gca();
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('data', 0))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
#ax.spines['left'].set_smart_bounds(True)
#ax.spines['bottom'].set_smart_bounds(True)
plt.legend(loc='center left', fancybox=True, framealpha=0., bbox_to_anchor=(1.05, 0.5))

#plt.savefig('/tmp/fig3.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
## Case 4: Aleatoric & Epistemic Uncertainty

# Build model.
model = tf.keras.Sequential([
  tfp.layers.DenseVariational(1 + 1, posterior_mean_field, prior_trainable, kl_weight=1/x.shape[0]),
  tfp.layers.DistributionLambda(
      lambda t: tfd.Normal(loc=t[..., :1],
                           scale=1e-3 + tf.math.softplus(0.01 * t[...,1:]))),
])

# Do inference.
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.01), loss=negloglik)
model.fit(x, y, epochs=1000, verbose=False);

# Profit.
[print(np.squeeze(w.numpy())) for w in model.weights];
yhat = model(x_tst)
assert isinstance(yhat, tfd.Distribution)

In [None]:
# Figure 4: Both Aleatoric & Epistemic Uncertainty

#plt.figure(figsize=[6, 1.5])  # inches
plt.figure(figsize=[8, 5])  # inches

plt.plot(x, y, 'b.', label='observed');

yhats = [model(x_tst) for _ in range(100)]
avgm = np.zeros_like(x_tst[..., 0])
for i, yhat in enumerate(yhats):
  m = np.squeeze(yhat.mean())
  s = np.squeeze(yhat.stddev())
  if i < 15:
    plt.plot(x_tst, m, 'r', label='ensemble means' if i == 0 else None, linewidth=1.)
    plt.plot(x_tst, m + 2 * s, 'g', linewidth=0.5, label='ensemble means + 2 ensemble stdev' if i == 0 else None);
    plt.plot(x_tst, m - 2 * s, 'g', linewidth=0.5, label='ensemble means - 2 ensemble stdev' if i == 0 else None);
  avgm += m
plt.plot(x_tst, avgm/len(yhats), 'r', label='overall mean', linewidth=4)

plt.ylim(-0.,17);
plt.yticks(np.linspace(0, 15, 4)[1:]);
plt.xticks(np.linspace(*x_range, num=9));

ax=plt.gca();
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('data', 0))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
#ax.spines['left'].set_smart_bounds(True)
#ax.spines['bottom'].set_smart_bounds(True)
plt.legend(loc='center left', fancybox=True, framealpha=0., bbox_to_anchor=(1.05, 0.5))

#plt.savefig('/tmp/fig4.png', bbox_inches='tight', dpi=300)
plt.show()

### <font color = blue>**6.** </font> ADVI（Automatic Differentiation Variational Inference）

In [None]:
## Variational Inference: Bayesian Neural Networks in PyMC3
## https://docs.pymc.io/notebooks/bayesian_neural_network_advi.html

In [None]:
!pip install Theano==1.0.5
!pip install arviz
!pip install --upgrade pymc3==3.11.2

In [None]:
## Generating data

from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import seaborn as sns
import sklearn
import theano
import theano.tensor as T

from sklearn import datasets
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

print(f"Running on PyMC3 v{pm.__version__}")

In [None]:
%config InlineBackend.figure_format = 'retina'
floatX = theano.config.floatX
filterwarnings("ignore")
#sns.set_style("white")

In [None]:
X, Y = make_moons(noise=0.2, random_state=0, n_samples=1000)
X = scale(X)
X = X.astype(floatX)
Y = Y.astype(floatX)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5)

In [None]:
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X[Y == 0, 0], X[Y == 0, 1], label="Class 0")
ax.scatter(X[Y == 1, 0], X[Y == 1, 1], color="r", label="Class 1")
sns.despine()
ax.legend()
ax.set(xlabel="X", ylabel="Y", title="Toy binary classification data set");

In [None]:
## Model specification

def construct_nn(ann_input, ann_output):
    """Build a two-hidden-layer Bayesian neural network as a PyMC3 model.

    The network has two tanh hidden layers of 5 units and a sigmoid output,
    Normal(0, 1) priors on all weights and a Bernoulli likelihood.

    Args:
        ann_input: Input features. A NumPy array is wrapped in
            ``pm.Data("ann_input", ...)`` so it can be swapped later with
            ``pm.set_data``; anything else (e.g. a ``pm.Minibatch`` tensor)
            is used as given.
        ann_output: Binary targets, handled the same way under the name
            ``"ann_output"``.

    Returns:
        The constructed ``pm.Model``.
    """
    n_hidden = 5

    # Initialize random weights between each layer
    init_1 = np.random.randn(X.shape[1], n_hidden).astype(floatX)
    init_2 = np.random.randn(n_hidden, n_hidden).astype(floatX)
    init_out = np.random.randn(n_hidden).astype(floatX)

    with pm.Model() as neural_network:
        # Trick: Turn inputs and outputs into shared variables using the data container pm.Data
        # It's still the same thing, but we can later change the values of the shared variable
        # (to switch in the test-data later) and pymc3 will just use the new data.
        # Kind-of like a pointer we can redirect.
        # For more info, see: http://deeplearning.net/software/theano/library/compile/shared.html
        #
        # Only plain arrays are wrapped. The arguments used to be overwritten
        # with X_train / Y_train unconditionally, which made a model built from
        # mini-batch tensors train on the full data set instead.
        if isinstance(ann_input, np.ndarray):
            ann_input = pm.Data("ann_input", ann_input)
        if isinstance(ann_output, np.ndarray):
            ann_output = pm.Data("ann_output", ann_output)

        # Weights from input to hidden layer
        weights_in_1 = pm.Normal("w_in_1", 0, sigma=1, shape=(X.shape[1], n_hidden), testval=init_1)

        # Weights from 1st to 2nd layer
        weights_1_2 = pm.Normal("w_1_2", 0, sigma=1, shape=(n_hidden, n_hidden), testval=init_2)

        # Weights from hidden layer to output
        weights_2_out = pm.Normal("w_2_out", 0, sigma=1, shape=(n_hidden,), testval=init_out)

        # Build neural-network using tanh activation function
        act_1 = pm.math.tanh(pm.math.dot(ann_input, weights_in_1))
        act_2 = pm.math.tanh(pm.math.dot(act_1, weights_1_2))
        act_out = pm.math.sigmoid(pm.math.dot(act_2, weights_2_out))

        # Binary classification -> Bernoulli likelihood
        out = pm.Bernoulli(
            "out",
            act_out,
            observed=ann_output,
            total_size=Y_train.shape[0],  # IMPORTANT for minibatches
        )
    return neural_network


neural_network = construct_nn(X_train, Y_train)

In [None]:
## Variational Inference: Scaling model complexity

#from pymc3.theanof import MRG_RandomStreams, set_tt_rng
from theano.sandbox.rng_mrg import MRG_RandomStream

#set_tt_rng(MRG_RandomStreams(42))
pm.set_tt_rng(MRG_RandomStream(42))

In [None]:
%%time

with neural_network:
    inference = pm.ADVI()
    approx = pm.fit(n=30000, method=inference)

In [None]:
plt.figure(figsize=(12,8))
plt.plot(-inference.hist, label="new ADVI", alpha=0.3)
plt.plot(approx.hist, label="old ADVI", alpha=0.3)
plt.legend()
plt.ylabel("ELBO")
plt.xlabel("iteration");

In [None]:
trace = approx.sample(draws=5000)

In [None]:
# We can get predicted probability from model
neural_network.out.distribution.p

In [None]:
# create symbolic input
x = T.matrix("X")
# symbolic number of samples is supported, we build vectorized posterior on the fly
n = T.iscalar("n")
# Do not forget test_values or set theano.config.compute_test_value = 'off'
x.tag.test_value = np.empty_like(X_train[:10])
n.tag.test_value = 100
_sample_proba = approx.sample_node(
    neural_network.out.distribution.p, size=n, more_replacements={neural_network["ann_input"]: x}
)
# It is time to compile the function
# No updates are needed for Approximation random generator
# Efficient vectorized form of sampling is used
sample_proba = theano.function([x, n], _sample_proba)

# Create benchmark functions
def production_step1():
    """Predict test labels via ``pm.sample_posterior_predictive``; the result is discarded."""
    # Points the model's "ann_input"/"ann_output" data containers at the test set.
    pm.set_data(new_data={"ann_input": X_test, "ann_output": Y_test}, model=neural_network)
    ppc = pm.sample_posterior_predictive(
        trace, samples=500, progressbar=False, model=neural_network
    )

    # Use probability of > 0.5 to assume prediction of class 1
    # (`pred` is a local that is not returned; this function is only timed.)
    pred = ppc["out"].mean(axis=0) > 0.5


def production_step2():
    """Predict test labels with the compiled ``sample_proba`` function; the result is discarded."""
    sample_proba(X_test, 500).mean(0) > 0.5

In [None]:
%timeit production_step1()

In [None]:
%timeit production_step2()

In [None]:
pred = sample_proba(X_test, 500).mean(0) > 0.5

In [None]:
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X_test[pred == 0, 0], X_test[pred == 0, 1])
ax.scatter(X_test[pred == 1, 0], X_test[pred == 1, 1], color="r")
sns.despine()
ax.set(title="Predicted labels in testing set", xlabel="X", ylabel="Y");

In [None]:
print("Accuracy = {}%".format((Y_test == pred).mean() * 100))

In [None]:
## Lets look at what the classifier has learned

grid = pm.floatX(np.mgrid[-3:3:100j, -3:3:100j])
grid_2d = grid.reshape(2, -1).T
dummy_out = np.ones(grid.shape[1], dtype=np.int8)

In [None]:
ppc = sample_proba(grid_2d, 500)

In [None]:
## Probability surface

cmap = sns.diverging_palette(250, 12, s=85, l=25, as_cmap=True)
fig, ax = plt.subplots(figsize=(16, 9))
contour = ax.contourf(grid[0], grid[1], ppc.mean(axis=0).reshape(100, 100), cmap=cmap)
ax.scatter(X_test[pred == 0, 0], X_test[pred == 0, 1])
ax.scatter(X_test[pred == 1, 0], X_test[pred == 1, 1], color="r")
cbar = plt.colorbar(contour, ax=ax)
_ = ax.set(xlim=(-3, 3), ylim=(-3, 3), xlabel="X", ylabel="Y")
cbar.ax.set_ylabel("Posterior predictive mean probability of class label = 0");

In [None]:
## Uncertainty in predicted value

cmap = sns.cubehelix_palette(light=1, as_cmap=True)
fig, ax = plt.subplots(figsize=(16, 9))
contour = ax.contourf(grid[0], grid[1], ppc.std(axis=0).reshape(100, 100), cmap=cmap)
ax.scatter(X_test[pred == 0, 0], X_test[pred == 0, 1])
ax.scatter(X_test[pred == 1, 0], X_test[pred == 1, 1], color="r")
cbar = plt.colorbar(contour, ax=ax)
_ = ax.set(xlim=(-3, 3), ylim=(-3, 3), xlabel="X", ylabel="Y")
cbar.ax.set_ylabel("Uncertainty (posterior predictive standard deviation)");

In [None]:
## Mini-batch ADVI

minibatch_x = pm.Minibatch(X_train, batch_size=50)
minibatch_y = pm.Minibatch(Y_train, batch_size=50)
neural_network_minibatch = construct_nn(minibatch_x, minibatch_y)
with neural_network_minibatch:
    approx = pm.fit(40000, method=pm.ADVI())

In [None]:
plt.figure(figsize=(12,8))
# Plot the optimisation history of the most recent fit (`approx`). The ADVI
# object of that fit was created inline, so `inference` still refers to the
# earlier full-batch run.
plt.plot(approx.hist)
plt.ylabel("ELBO")
plt.xlabel("iteration");

In [None]:
pm.traceplot(trace);
#arviz.plot_trace(trace)

## 43. VAE (Variational Autoencoder)

<font color=red size=7>GPUの使用を推奨</font>

### <font color=blue>**1.** </font> 実装例その１

In [None]:
## 出典：https://qiita.com/jun40vn/items/374763f478ee094c5041

#### <font color=green>**1.1.** </font> Autoencoderの実装

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.datasets import mnist
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# データセット読み込み
(x_train, _), (x_test, _) = mnist.load_data()

x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
x_train = x_train.reshape((len(x_train), np.prod(x_train.shape[1:])))
x_test = x_test.reshape((len(x_test), np.prod(x_test.shape[1:])))

In [None]:
# モデル構築
encoding_dim = 32
input_img = Input(shape=(784,))

x1 = Dense(256, activation='relu')(input_img)  
x2 = Dense(64, activation='relu')(x1)  

encoded = Dense(encoding_dim, activation='relu')(x2) 

x3 = Dense(64, activation='relu')(encoded)
x4 = Dense(256, activation='relu')(x3)  

decoded = Dense(784, activation='sigmoid')(x4) 

autoencoder = Model(inputs=input_img, outputs=decoded)
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

autoencoder.summary()

In [None]:
# 学習
autoencoder.fit(x_train, x_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(x_test, x_test))

In [None]:
# 学習モデルでテスト画像を変換
decoded_imgs = autoencoder.predict(x_test)

In [None]:
n = 10
plt.figure(figsize=(10, 2))

for i in range(n):
  # テスト画像を表示
  ax = plt.subplot(2, n, i+1)
  plt.imshow(x_test[i].reshape(28, 28))
  plt.gray()
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

  # 変換画像を表示
  ax = plt.subplot(2, n, i+1+n)
  plt.imshow(decoded_imgs[i].reshape(28, 28))
  plt.gray()
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

plt.show()

#### <font color=green>**1.2.** </font> VAE全体の実装

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from keras.layers import Lambda, Input, Dense
from keras.models import Model
from keras.datasets import mnist
from keras.losses import mse 
from keras import backend as K
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
image_size = x_train.shape[1] # = 28 (image side length; original_dim below is 28 * 28 = 784)
original_dim = image_size * image_size
x_train = np.reshape(x_train, [-1, original_dim])
x_test = np.reshape(x_test, [-1, original_dim])
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

input_shape = (original_dim, )
latent_dim = 2 ## dimensionality of the latent space

In [None]:
# Reparametrization Trick 
def sampling(args):
  """Sample z = z_mean + exp(0.5 * z_logvar) * eps, with eps drawn by ``K.random_normal``.

  ``args`` is the pair ``(z_mean, z_logvar)``.
  """
  z_mean, z_logvar = args
  n_rows = K.shape(z_mean)[0]
  n_cols = K.int_shape(z_mean)[1]
  noise = K.random_normal(shape=(n_rows, n_cols), seed = 5)
  return z_mean + K.exp(0.5 * z_logvar) * noise

In [None]:
# VAEモデル構築
inputs = Input(shape=input_shape)
x1 = Dense(256, activation='relu')(inputs)  
x2 = Dense(64, activation='relu')(x1) 

z_mean = Dense(latent_dim)(x2)
z_logvar = Dense(latent_dim)(x2)

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_logvar])

encoder = Model(inputs, [z_mean, z_logvar, z], name='encoder')
encoder.summary()

In [None]:
latent_inputs = Input(shape=(latent_dim,))
x3 = Dense(64, activation='relu')(latent_inputs)  
x4 = Dense(256, activation='relu')(x3)  

outputs = Dense(original_dim, activation='sigmoid')(x4)

decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

In [None]:
z_output = encoder(inputs)[2]
outputs = decoder(z_output)

vae = Model(inputs, outputs, name='variational_autoencoder')

In [None]:
## 損失関数

# Kullback-Leibler Loss
kl_loss = 1 + z_logvar - K.square(z_mean) - K.exp(z_logvar)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5

# Reconstruction Loss
reconstruction_loss = mse(inputs, outputs)
reconstruction_loss *= original_dim

In [None]:
vae_loss = K.mean(reconstruction_loss + kl_loss)

vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
vae.fit(x_train,
        epochs=50,
        batch_size=256,
        validation_data=(x_test, None))

In [None]:
# テスト画像を変換
decoded_imgs = vae.predict(x_test)

In [None]:
# テスト画像と変換画像の表示
n = 10
plt.figure(figsize=(10, 2))

for i in range(n):
  # テスト画像を表示
  ax = plt.subplot(2, n, i+1)
  plt.imshow(x_test[i].reshape(28, 28))
  plt.gray()
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

  # 変換された画像を表示
  ax = plt.subplot(2, n, i+1+n)
  plt.imshow(decoded_imgs[i].reshape(28, 28))
  plt.gray()
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

plt.show()

#### <font color=green>**1.3.** </font> 潜在空間zを平面で表してみる

In [None]:
import matplotlib.cm as cm

In [None]:
def plot_results(encoder,
                 decoder,
                 x_test,
                 y_test,
                 batch_size=128,
                 model_name="vae_mnist"):
  """Show the latent means of ``x_test`` and a 30x30 grid of decoded digits.

  Args:
    encoder: Model whose ``predict`` returns three outputs; the first is
      used as the 2-D latent mean.
    decoder: Model mapping 2-D latent points to flattened 28x28 images.
    x_test: Inputs to encode.
    y_test: Labels used to colour the scatter plot (assumed to be integer
      class ids usable as ``tab10`` colour indices).
    batch_size: Batch size for both ``predict`` calls.
    model_name: Unused; kept for interface compatibility.
  """
  z_mean, _, _ = encoder.predict(x_test,
                                 batch_size=batch_size)
  plt.figure(figsize=(12, 10))
  cmap=cm.tab10
  plt.scatter(z_mean[:, 0], z_mean[:, 1], c=cmap(y_test))
  m = cm.ScalarMappable(cmap=cmap)
  m.set_array(y_test)
  # The mappable is not attached to any Axes, so name the Axes explicitly.
  plt.colorbar(m, ax=plt.gca())
  plt.xlabel("z[0]")
  plt.ylabel("z[1]")
  plt.show()

  # Decode a 30x30 grid of latent points covering (-4, -4) to (4, 4)
  n = 30
  digit_size = 28
  grid_x = np.linspace(-4, 4, n)
  grid_y = np.linspace(-4, 4, n)[::-1]

  # Decode the whole grid in one predict call (row-major: y outer, x inner).
  z_grid = np.array([[xi, yi] for yi in grid_y for xi in grid_x])
  x_decoded = decoder.predict(z_grid, batch_size=batch_size)
  # (n*n, 784) -> (row, col, h, w) -> (row, h, col, w) -> one big image.
  figure = (x_decoded.reshape(n, n, digit_size, digit_size)
            .transpose(0, 2, 1, 3)
            .reshape(digit_size * n, digit_size * n))

  plt.figure(figsize=(10, 10))

  # One tick at the centre of each tile: exactly n positions for n labels.
  start_range = digit_size // 2
  end_range = (n - 1) * digit_size + start_range + 1
  pixel_range = np.arange(start_range, end_range, digit_size)

  sample_range_x = np.round(grid_x, 1)
  sample_range_y = np.round(grid_y, 1)

  plt.xticks(pixel_range, sample_range_x)
  plt.yticks(pixel_range, sample_range_y)

  plt.xlabel("z[0]")
  plt.ylabel("z[1]")
  plt.axis('off')

  plt.imshow(figure, cmap='Greys_r')
  #plt.savefig(filename)
  plt.show()

In [None]:
plot_results(encoder,
             decoder,
             x_test,
             y_test,
             batch_size=128,
             model_name="vae_mlp")

### <font color=blue>**2.** </font> 実装例その２

In [None]:
## 出典：　https://qiita.com/MuAuan/items/cdb8ae656da60b6d89ca

#### <font color=green>**2.1.** </font> MNISTのAutoencoder

In [None]:
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model
from keras import backend as K
import matplotlib.pyplot as plt
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Reshape, Embedding,InputLayer

In [None]:
def plot_fig(x_test, decoded_imgs, encoded_imgs,k):
  """Save two figures for the first 100 test samples.

  The first figure alternates rows of original and reconstructed digits;
  the second shows each encoded representation as a 16x8 image.

  Args:
    x_test: Test images, each reshapeable to 28x28.
    decoded_imgs: Reconstructions aligned with ``x_test``.
    encoded_imgs: Encoder outputs, each with 128 values.
    k: Value formatted into the output file names.
  """
  # NOTE(review): the two savefig calls use different directory spellings
  # ("./mnist1000/" and "/content/mnist1000/"); they coincide only when the
  # working directory is /content.
  n = 10
  plt.figure(figsize=(10, 16))
  for j in range(0,n):
    for i in range(1,n+1):
      # Subplot positions are 1-based, array indices are 0-based.
      idx = (i-1) + 10*j

      # display original
      ax1 = plt.subplot(20, n*1, i+10*2*j)
      ax1.imshow(x_test[idx].reshape(28, 28))
      plt.gray()
      ax1.get_xaxis().set_visible(False)
      ax1.get_yaxis().set_visible(False)

      # display reconstruction
      ax2 = plt.subplot(20, n*1, i + (2*j+1)*10)
      ax2.imshow(decoded_imgs[idx].reshape(28, 28))
      plt.gray()
      ax2.get_xaxis().set_visible(False)
      ax2.get_yaxis().set_visible(False)

  plt.savefig("./mnist1000/mnist_training_by_100_10_{}".format(k))    
  plt.pause(0.01)
  plt.close()

  n = 100
  plt.figure(figsize=(10, 16))
  for i in range(1,n+1):
    # The column count must be an int; n*0.1 would pass the float 10.0.
    ax = plt.subplot(10, n//10, i)
    plt.imshow(encoded_imgs[i-1].reshape(8, 2 * 8).T)
    plt.gray()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

  plt.savefig("/content/mnist1000/mnist_intermid_training_by_100_10_{}".format(k))  ##
  plt.pause(0.01)
  plt.close()

In [None]:
!mkdir /content/mnist1000/

In [None]:
input_img = Input(shape=(28, 28, 1))  # adapt this if using `channels_first` image data format

x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_img)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = MaxPooling2D((2, 2), padding='same',name='encoded')(x)
encoder=Model(input_img, encoded)
encoder.summary()

In [None]:
x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = UpSampling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(16, (3, 3), activation='relu')(x)
x = UpSampling2D((2, 2))(x)
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = Model(input_img, decoded)
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
autoencoder.summary()

In [None]:
from keras.datasets import mnist
import numpy as np

(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train[:1000].astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
x_train = np.reshape(x_train[:1000], (len(x_train[:1000]), 28, 28, 1))
x_test = np.reshape(x_test, (len(x_test), 28, 28, 1))
y_train=y_train[:1000]

In [None]:
for j in range(10):
  x_train1 = x_train
  x_test1 = x_test

  autoencoder.fit(x_train1, x_train1,
                  epochs=10,
                  batch_size=128,
                  shuffle=True,
                  validation_data=(x_test1, x_test1)
                  )

  decoded_imgs = autoencoder.predict(x_test)
  encoded_imgs = encoder.predict(x_test)
  
  plot_fig(x_test,decoded_imgs,encoded_imgs,j)

#### <font color=green>**2.2.** </font> MNISTのVAE

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras.layers import Lambda, Input, Dense
from keras.models import Model
from keras.datasets import mnist
from keras.losses import mse, binary_crossentropy
from keras.utils import plot_model
from keras import backend as K

import numpy as np
import matplotlib.pyplot as plt
import argparse
import os
import cv2

from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import Dropout, Activation, Flatten
from keras.layers import Reshape, Embedding,InputLayer

In [None]:
# reparameterization trick
# instead of sampling from Q(z|X), sample eps = N(0,I)
# z = z_mean + sqrt(var)*eps
def sampling(args):
  """Reparameterization trick: sample from an isotropic unit Gaussian.

  # Arguments
      args (tensor): mean and log of variance of Q(z|X)
  # Returns
      z (tensor): sampled latent vector
  """
  z_mean, z_log_var = args
  n_rows = K.shape(z_mean)[0]
  n_cols = K.int_shape(z_mean)[1]
  # random_normal defaults to mean=0 and std=1.0
  noise = K.random_normal(shape=(n_rows, n_cols))
  return z_mean + K.exp(0.5 * z_log_var) * noise

In [None]:
def plot_results(models,
                 data,
                 batch_size=128,
                 model_name="vae_mnist"):
  """Plots labels and MNIST digits as function of 2-dim latent vector
  # Arguments
      models (tuple): encoder and decoder models
      data (tuple): test data and label
      batch_size (int): prediction batch size
      model_name (string): which model is using this function
  """
  encoder, decoder = models
  x_test, y_test = data
  os.makedirs(model_name, exist_ok=True)

  filename1 = "/content/mnist1000/vae_mean_all.png"  ##
  # The figures go to a fixed directory; make sure it exists so that
  # savefig does not fail on a fresh runtime.
  os.makedirs(os.path.dirname(filename1), exist_ok=True)
  # display a 2D plot of the digit classes in the latent space
  z_mean, _, _ = encoder.predict(x_test, batch_size=batch_size)
  plt.figure(figsize=(12, 10))
  plt.scatter(z_mean[:, 0], z_mean[:, 1], c=y_test)
  plt.colorbar()
  plt.xlabel("z[0]")
  plt.ylabel("z[1]")
  plt.savefig(filename1)
  plt.show()

  filename2 = "/content/mnist1000/digits_over_latent_all.png"  ##
  # display a 30x30 2D manifold of digits
  n = 30
  digit_size = 28
  figure = np.zeros((digit_size * n, digit_size * n))
  # linearly spaced coordinates corresponding to the 2D plot
  # of digit classes in the latent space
  grid_x = np.linspace(-4, 4, n)
  grid_y = np.linspace(-4, 4, n)[::-1]

  # Decode the whole grid in one batched call instead of n*n single-sample
  # calls; row-major order means grid point (i, j) is at index i * n + j.
  z_grid = np.array([[xi, yi] for yi in grid_y for xi in grid_x])
  x_decoded = decoder.predict(z_grid, batch_size=batch_size)
  for i in range(n):
    for j in range(n):
      digit = x_decoded[i * n + j].reshape(digit_size, digit_size)
      figure[i * digit_size: (i + 1) * digit_size,
             j * digit_size: (j + 1) * digit_size] = digit

  plt.figure(figsize=(10, 10))
  start_range = digit_size // 2
  # One tick at the centre of each digit tile: exactly n positions, so the
  # tick count matches the n labels (n * digit_size here produced n + 1).
  end_range = (n - 1) * digit_size + start_range + 1
  pixel_range = np.arange(start_range, end_range, digit_size)
  sample_range_x = np.round(grid_x, 1)
  sample_range_y = np.round(grid_y, 1)
  plt.xticks(pixel_range, sample_range_x)
  plt.yticks(pixel_range, sample_range_y)
  plt.xlabel("z[0]")
  plt.ylabel("z[1]")
  plt.imshow(figure, cmap='Greys_r')
  plt.savefig(filename2)
  plt.show()

In [None]:
def plot_results2(models,
                  data,
                  batch_size=128,
                  model_name="vae_mnist"):
  """Decode 50 points on a straight line in the 2-D latent space.

  The line runs from z7 (t = 0) towards z0; every decoded image is shown
  and saved as /content/mnist1000/z_sample_t<t>.

  # Arguments
      models (tuple): encoder and decoder models (only the decoder is used)
      data (tuple): test data and label (unused)
      batch_size (int): unused
      model_name (string): unused
  """
  # Use the decoder that was passed in rather than a notebook-level global.
  _, decoder = models
  # End points of the interpolation.
  # NOTE(review): presumably latent locations of the digits 0 and 7 - confirm.
  z0=[-0.7,-3]
  z7=[-0.7,2]
  os.makedirs('/content/mnist1000', exist_ok=True)
  for t in range(50):
    s=t/50
    z_sample=np.array([[s*z0[0]+(1-s)*z7[0],s*z0[1]+(1-s)*z7[1]]])
    x_decoded = decoder.predict(z_sample)
    plt.imshow(x_decoded.reshape(28, 28))
    plt.title("z_sample="+str(z_sample))
    plt.savefig('/content/mnist1000/z_sample_t{}'.format(t)) ##
    plt.show()
    plt.close()

In [None]:
# MNIST dataset: load images and labels, scale pixels to [0, 1] and add a
# trailing channel axis so the arrays have shape (N, 28, 28, 1).
#(x_train, _), (x_test, _) = mnist.load_data()
(x_train, y_train), (x_test, y_test) = mnist.load_data()
image_size = x_train.shape[1]
original_dim = image_size * image_size
# Keep at most the first 60000 training images (and the matching labels).
x_train = x_train[:60000].astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
x_train = np.reshape(x_train[:60000], (len(x_train[:60000]), 28, 28, 1))
x_test = np.reshape(x_test, (len(x_test), 28, 28, 1)) 
y_train=y_train[:60000]

In [None]:
# network parameters
#input_shape = (original_dim, )
# (height, width, channels) of one input image
input_shape = (image_size, image_size, 1)
# NOTE(review): not referenced by the convolutional model cells below.
intermediate_dim = 512
batch_size = 64 ## 128 -> 64
# Two latent dimensions so the latent space can be drawn as a 2-D plot.
latent_dim = 2
epochs = 20  ### 100 -> 20

In [None]:
# VAE model = encoder + decoder
# build encoder model
# Three Conv2D + MaxPooling2D stages; each pooling halves height and width.
inputs = Input(shape=input_shape, name='encoder_input')
x = Conv2D(16, (3, 3), activation='relu', padding='same')(inputs)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same')(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2), padding='same',name='encoded')(x)
# Remember the feature-map shape; the decoder cell reuses it.
shape = K.int_shape(x)
print("shape[1], shape[2], shape[3]",shape[1], shape[2], shape[3])
x = Flatten()(x)

# Mean and log-variance of the approximate posterior Q(z|X)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# instantiate encoder model
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

In [None]:
# build decoder model
# decoder
# Maps a latent vector back to the encoder's final feature-map shape and
# then upsamples three times.
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(shape[1] * shape[2] * shape[3], activation='relu')(latent_inputs)
x = Reshape((shape[1], shape[2], shape[3]))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = UpSampling2D((2, 2))(x)
# No padding='same' here, so this 3x3 convolution shrinks the map by 2.
# NOTE(review): presumably deliberate (4 -> 8 -> 16 -> 14 -> 28 for a 4x4
# encoder output) - confirm against the printed encoder shape.
x = Conv2D(16, (3, 3), activation='relu')(x)
x = UpSampling2D((2, 2))(x)
outputs = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

# instantiate decoder model
decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

In [None]:
# instantiate VAE model
# encoder(inputs)[2] is the sampled latent vector z
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae_mlp')

# loss function
# Compute VAE loss
# Reconstruction term: binary cross-entropy over the flattened tensors,
# rescaled by the number of pixels per image.
reconstruction_loss = binary_crossentropy(K.flatten(inputs),
                                          K.flatten(outputs))
reconstruction_loss *= image_size * image_size
# KL term: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)) over latent dims
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean(reconstruction_loss + kl_loss)

# The loss is attached to the model, so compile() gets no loss argument.
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

In [None]:
x_train1 = x_train
x_test1 = x_test

#vae.load_weights('vae_mnist_weights_100.h5')
#encoder.load_weights('encoder_mnist_weights_100.h5')
#decoder.load_weights('decoder_mnist_weights_100.h5')

In [None]:
# Train the VAE. No targets are passed because the loss was attached with
# add_loss().
vae.fit(x_train1,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(x_test1, None))

models = (encoder, decoder)
data = (x_test, y_test)

# Latent-space scatter plot and the decoded digit manifold
plot_results(models,
             data,
             batch_size=batch_size,
             model_name="vae_mlp")

# Decoded images along a line in latent space
plot_results2(models,
              data,
              batch_size=batch_size,
              model_name="vae_mlp")

# NOTE(review): the file names say "100" although `epochs` is set to 20 in
# the parameter cell above - confirm whether the suffix should follow epochs.
vae.save_weights('vae_mnist_weights_100.h5')
encoder.save_weights('encoder_mnist_weights_100.h5')
decoder.save_weights('decoder_mnist_weights_100.h5')

In [None]:
# Show the results: originals (row 1), reconstructions (row 2) and
# difference images (row 3) for the first n test images.
n = 10
decoded_imgs = vae.predict(x_test[:n])

plt.figure(figsize=(10, 4))
for i in range(n):
  # original_image
  orig_img = x_test[i].reshape(image_size, image_size)

  # reconstructed_image
  reconst_img = decoded_imgs[i].reshape(image_size, image_size)

  # diff image: shift/scale the difference before converting to 8-bit
  diff_img = ((orig_img - reconst_img)+2)/4
  diff_img = (diff_img*255).astype(np.uint8)
  orig_img = (orig_img*255).astype(np.uint8)
  reconst_img = (reconst_img*255).astype(np.uint8)
  # NOTE(review): diff_img_color is not used below; the plot colours
  # diff_img with matplotlib's jet colormap instead.
  diff_img_color = cv2.applyColorMap(diff_img, cv2.COLORMAP_JET)

  # display original
  ax = plt.subplot(3, n,  i + 1)
  plt.imshow(orig_img, cmap=plt.cm.gray)
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

  # display reconstruction
  ax = plt.subplot(3, n, i + n + 1)
  plt.imshow(reconst_img, cmap=plt.cm.gray)
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

  # display diff
  ax = plt.subplot(3, n, i + n*2 + 1)
  plt.imshow(diff_img, cmap=plt.cm.jet)
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

plt.savefig("autodetect_all.jpg")
plt.pause(1)
plt.close()

In [None]:
# Save the trained model (architecture + weights in one file)
vae.save('./ae_mnist.h5')

# json and weights: architecture as JSON, weights in a separate file
model_json = vae.to_json()
with open('ae_mnist.json', 'w') as json_file:
  json_file.write(model_json)
vae.save_weights('ae_mnist_weights.h5')

#### <font color=green>**2.3.** </font> 異常検知について

In [None]:
## When training is restricted to the digit 7 as in the code below,
## the z-space shows 7-like shapes over almost its entire region.

# Restrict the data used for training
x_train1 = x_train[y_train==7]
x_test1 = x_test[y_test==7]

batch_size = 128
epochs = 100

# NOTE(review): this calls fit() on the same `vae` object as above, so
# training continues from its current weights unless the model cells are
# re-run first - confirm that this is intended.
vae.fit(x_train1,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(x_test1, None))

In [None]:
models = (encoder, decoder)
data = (x_test, y_test)

plot_results(models,
             data,
             batch_size=batch_size,
             model_name="vae_mlp")

plot_results2(models,
              data,
              batch_size=batch_size,
              model_name="vae_mlp")

vae.save_weights('vae_mnist_weights_100.h5')
encoder.save_weights('encoder_mnist_weights_100.h5')
decoder.save_weights('decoder_mnist_weights_100.h5')

In [None]:
# Show the results: originals (row 1), reconstructions (row 2) and
# difference images (row 3) for the first n test images.
n = 10
decoded_imgs = vae.predict(x_test[:n])

plt.figure(figsize=(10, 4))
for i in range(n):
  # original_image
  orig_img = x_test[i].reshape(image_size, image_size)

  # reconstructed_image
  reconst_img = decoded_imgs[i].reshape(image_size, image_size)

  # diff image: shift/scale the difference before converting to 8-bit
  diff_img = ((orig_img - reconst_img)+2)/4
  diff_img = (diff_img*255).astype(np.uint8)
  orig_img = (orig_img*255).astype(np.uint8)
  reconst_img = (reconst_img*255).astype(np.uint8)
  # NOTE(review): diff_img_color is not used below; the plot colours
  # diff_img with matplotlib's jet colormap instead.
  diff_img_color = cv2.applyColorMap(diff_img, cv2.COLORMAP_JET)

  # display original
  ax = plt.subplot(3, n,  i + 1)
  plt.imshow(orig_img, cmap=plt.cm.gray)
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

  # display reconstruction
  ax = plt.subplot(3, n, i + n + 1)
  plt.imshow(reconst_img, cmap=plt.cm.gray)
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

  # display diff
  ax = plt.subplot(3, n, i + n*2 + 1)
  plt.imshow(diff_img, cmap=plt.cm.jet)
  ax.get_xaxis().set_visible(False)
  ax.get_yaxis().set_visible(False)

plt.savefig("autodetect_all.jpg")
plt.pause(1)
plt.close()

## 44. サンプリング法

### <font color=blue>**1.** </font> NUTS（No-U-Turn Sampler）の簡単な例

In [None]:
## Variational API quickstart
## https://docs.pymc.io/notebooks/variational_api_quickstart.html

In [None]:
!pip install Theano==1.0.5
!pip install arviz
!pip install --upgrade pymc3==3.11.2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import theano

np.random.seed(42)
pm.set_tt_rng(42)

In [None]:
## Basic setup

w = pm.floatX([0.2, 0.8])
mu = pm.floatX([-0.3, 0.5])
sd = pm.floatX([0.1, 0.1])

with pm.Model() as model:
  x = pm.NormalMixture("x", w=w, mu=mu, sigma=sd, dtype=theano.config.floatX)
  x2 = x ** 2
  sin_x = pm.math.sin(x)

In [None]:
with model:
  pm.Deterministic("x2", x2)
  pm.Deterministic("sin_x", sin_x)

In [None]:
with model:
  trace = pm.sample(50000)

In [None]:
#pm.traceplot(trace)

import arviz
arviz.plot_trace(trace);

In [None]:
with pm.Model() as model:
  x = pm.NormalMixture("x", w=w, mu=mu, sigma=sd, dtype=theano.config.floatX)
  x2 = x ** 2
  sin_x = pm.math.sin(x)

In [None]:
with model:
  mean_field = pm.fit(method="advi")

In [None]:
pm.plot_posterior(mean_field.sample(1000), color="LightSeaGreen");

In [None]:
## Checking convergence

help(pm.callbacks.CheckParametersConvergence)

In [None]:
from pymc3.variational.callbacks import CheckParametersConvergence

with model:
  mean_field = pm.fit(method="advi", callbacks=[CheckParametersConvergence()])

In [None]:
plt.figure(figsize=(12,8))
plt.plot(mean_field.hist);

In [None]:
with model:
  mean_field = pm.fit(
      method="advi", callbacks=[pm.callbacks.CheckParametersConvergence(diff="absolute")]
      )

In [None]:
plt.figure(figsize=(12,8))
plt.plot(mean_field.hist);

In [None]:
## Tracking parameters

with model:
  advi = pm.ADVI()

In [None]:
advi.approx

In [None]:
advi.approx.shared_params

In [None]:
advi.approx.mean.eval(), advi.approx.std.eval()

In [None]:
tracker = pm.callbacks.Tracker(
    mean=advi.approx.mean.eval,  # callable that returns mean
    std=advi.approx.std.eval,  # callable that returns std
    )

In [None]:
approx = advi.fit(20000, callbacks=[tracker])

In [None]:
fig = plt.figure(figsize=(16, 9))
mu_ax = fig.add_subplot(221)
std_ax = fig.add_subplot(222)
hist_ax = fig.add_subplot(212)
mu_ax.plot(tracker["mean"])
mu_ax.set_title("Mean track")
std_ax.plot(tracker["std"])
std_ax.set_title("Std track")
hist_ax.plot(advi.hist)
hist_ax.set_title("Negative ELBO track");

In [None]:
advi.refine(100000)

In [None]:
fig = plt.figure(figsize=(16, 9))
mu_ax = fig.add_subplot(221)
std_ax = fig.add_subplot(222)
hist_ax = fig.add_subplot(212)
mu_ax.plot(tracker["mean"])
mu_ax.set_title("Mean track")
std_ax.plot(tracker["std"])
std_ax.set_title("Std track")
hist_ax.plot(advi.hist)
hist_ax.set_title("Negative ELBO track");

In [None]:
import seaborn as sns

plt.figure(figsize=(12,8))
ax = sns.kdeplot(trace["x"], label="NUTS")
sns.kdeplot(approx.sample(10000)["x"], label="ADVI");

In [None]:
with model:
  svgd_approx = pm.fit(
      300,
      method="svgd",
      inf_kwargs=dict(n_particles=1000),
      obj_optimizer=pm.sgd(learning_rate=0.01)
      )

In [None]:
plt.figure(figsize=(12,8))
ax = sns.kdeplot(trace["x"], label="NUTS")
sns.kdeplot(approx.sample(10000)["x"], label="ADVI")
sns.kdeplot(svgd_approx.sample(2000)["x"], label="SVGD");

In [None]:
# recall x ~ NormalMixture
a = x ** 2
b = pm.math.sin(x)

In [None]:
help(svgd_approx.sample_node)

In [None]:
a_sample = svgd_approx.sample_node(a)
a_sample.eval()

In [None]:
a_sample.eval()

In [None]:
a_sample.eval()

In [None]:
plt.figure(figsize=(12,8))
sns.kdeplot(np.array([a_sample.eval() for _ in range(2000)]))
plt.title("$x^2$ distribution");

In [None]:
a_samples = svgd_approx.sample_node(a, size=1000)

In [None]:
plt.figure(figsize=(12,8))
sns.kdeplot(a_samples.eval())
plt.title("$x^2$ distribution");

In [None]:
a_samples.var(0).eval()  # variance

In [None]:
a_samples.mean(0).eval()  # mean

In [None]:
i = theano.tensor.iscalar("i")
i.tag.test_value = 1
a_samples_i = svgd_approx.sample_node(a, size=i)

In [None]:
a_samples_i.eval({i: 100}).shape

In [None]:
a_samples_i.eval({i: 10000}).shape

In [None]:
## Converting a Trace to an Approximation

trace_approx = pm.Empirical(trace, model=model)
trace_approx

In [None]:
pm.plot_posterior(trace_approx.sample(10000));

In [None]:
## Multilabel logistic regression

import pandas as pd
import theano.tensor as tt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# return_X_y must be passed by keyword: scikit-learn made the parameters of
# load_iris keyword-only, so the positional form load_iris(True) raises a
# TypeError on current releases.
X, y = load_iris(return_X_y=True)
# Default split (no fixed random_state, so the split differs between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Shared variables so the training data can later be swapped for test data
# (see the `more_replacements` usage further down).
Xt = theano.shared(X_train)
yt = theano.shared(y_train)

with pm.Model() as iris_model:
  # Coefficients for features: one column per class (4 features x 3 classes)
  β = pm.Normal("β", 0, sigma=1e2, shape=(4, 3))
  # Per-class intercepts with a flat prior
  a = pm.Flat("a", shape=(3,))
  # Softmax turns the linear scores into class probabilities
  p = tt.nnet.softmax(Xt.dot(β) + a)

  observed = pm.Categorical("obs", p=p, observed=yt)

In [None]:
## Applying replacements in practice

with iris_model:
  # We'll use SVGD
  inference = pm.SVGD(n_particles=500, jitter=1)

  # Local reference to approximation
  approx = inference.approx

  # Here we need `more_replacements` to change train_set to test_set
  test_probs = approx.sample_node(p, more_replacements={Xt: X_test}, size=100)

  # For train set no more replacements needed
  train_probs = approx.sample_node(p)

In [None]:
test_ok = tt.eq(test_probs.argmax(-1), y_test)
train_ok = tt.eq(train_probs.argmax(-1), y_train)
test_accuracy = test_ok.mean(-1)
train_accuracy = train_ok.mean(-1)

In [None]:
eval_tracker = pm.callbacks.Tracker(
    test_accuracy=test_accuracy.eval, train_accuracy=train_accuracy.eval
    )

In [None]:
inference.fit(100, callbacks=[eval_tracker]);

In [None]:
_, ax = plt.subplots(1, 1, figsize=(12,8))
df = pd.DataFrame(eval_tracker["test_accuracy"]).T.melt()
sns.lineplot(x="variable", y="value", data=df, color="red", ax=ax)
ax.plot(eval_tracker["train_accuracy"], color="blue")
ax.set_xlabel("epoch")
plt.legend(["test_accuracy", "train_accuracy"])
plt.title("Training Progress")
plt.show()

In [None]:
inference.fit(400, obj_optimizer=pm.adamax(learning_rate=0.1), callbacks=[eval_tracker]);

In [None]:
_, ax = plt.subplots(1, 1, figsize=(12,8))
df = pd.DataFrame(np.asarray(eval_tracker["test_accuracy"])).T.melt()
sns.lineplot(x="variable", y="value", data=df, color="red", ax=ax)
ax.plot(eval_tracker["train_accuracy"], color="blue")
ax.set_xlabel("epoch")
plt.legend(["test_accuracy", "train_accuracy"])
plt.title("Training Progress");

In [None]:
## Minibatches

issubclass(pm.Minibatch, theano.tensor.TensorVariable)

In [None]:
# Raw values
data = np.random.rand(40000, 100)
# Scaled values
data *= np.random.randint(1, 10, size=(100,))
# Shifted values
data += np.random.rand(100) * 10

In [None]:
with pm.Model() as model:
  mu = pm.Flat("mu", shape=(100,))
  sd = pm.HalfNormal("sd", shape=(100,))
  lik = pm.Normal("lik", mu, sd, observed=data)

In [None]:
def stop_after_10(approx, loss_history, i):
  """Fit callback that aborts at every positive multiple of 10 iterations.

  `approx` and `loss_history` are accepted to match the callback signature
  but are not used. Raises StopIteration when i > 0 and i is divisible
  by 10; otherwise returns None.
  """
  if i <= 0:
    return
  if i % 10:
    return
  raise StopIteration("I was slow, sorry")

In [None]:
with model:
  advifit = pm.fit(callbacks=[stop_after_10])

In [None]:
X = pm.Minibatch(data, batch_size=500)

with pm.Model() as model:
  mu = pm.Flat("mu", shape=(100,))
  sd = pm.HalfNormal("sd", shape=(100,))
  likelihood = pm.Normal("likelihood", mu, sd, observed=X, total_size=data.shape)

In [None]:
with model:
  advifit = pm.fit()

In [None]:
plt.figure(figsize=(12,8))
plt.plot(advifit.hist);

In [None]:
print(pm.Minibatch.__doc__)

### <font color=blue>**2.** </font> PyMC3 でベイズ統計モデリング

https://qiita.com/0NE_shoT_/items/2b41ae3e8e8f2d8809c4

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
## データの生成

def generate_sample_data(num, seed=0):
  """Generate a synthetic linear-regression data set.

  Each of the `num` rows has 8 features drawn uniformly from [0, 1). The
  target is the sum over the features of
  (intercept + feature * weight + Gaussian noise with std 0.1).

  Args:
    num: number of rows to generate.
    seed: seed passed to np.random.seed before drawing.

  Returns:
    DataFrame with columns 'target', 'feature0', ..., 'feature7'.
  """
  n_features = 8  # number of features
  bias = 0.2      # intercept (added once per feature term)
  coefs = [0.2, 0.3, 0.5, -0.4, 0.1, 0.2, 0.5, -0.3]  # weight per feature

  np.random.seed(seed=seed)
  targets = []  # target variable, one value per row
  rows = []     # feature vectors, one list per row
  for _ in range(num):
    # Draw order matters for reproducibility: features first, then noise.
    feats = [np.random.rand() for _ in range(n_features)]
    eps = [np.random.normal(0, 0.1) for _ in range(n_features)]
    terms = [bias + x * c + e for x, c, e in zip(feats, coefs, eps)]
    targets.append(sum(terms))
    rows.append(feats)

  names = ['target'] + ['feature{}'.format(k) for k in range(n_features)]
  return pd.DataFrame(np.column_stack((targets, rows)), columns=names)

In [None]:
data = generate_sample_data(num=1000, seed=0)

X = data.drop('target', axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## パラメータの事後分布と検証用データ X_test に対する目的変数の予測分布を算出することを目標とする

In [None]:
# PyMC3 は、Python の文法の枠内で統計モデリングができるライブラリです
# 行列操作や微分などの数式処理ができる Theano を内部で利用することで、確率分布の計算の高速化を図っています
# Stan と同様に、 NUTS アルゴリズムによるサンプリングや ADVI による変分推論が可能です

# Theano -> https://www.sejuku.net/blog/64336

In [None]:
!pip install Theano==1.0.5
!pip install arviz
!pip install --upgrade pymc3==3.11.2

In [None]:
import theano
import arviz
import pymc3 as pm

#print(theano.__version__)
#print(arviz.__version__)
#print(pm.__version__)

In [None]:
## Model construction

## The training data are wrapped in Theano shared variables so that the model
## (= the posterior of the parameters) can be reused when computing the
## predictive distribution for the validation data.
# Explanatory variables of the training data
X_shared = theano.shared(X_train.values)

# Target variable of the training data
y_shared = theano.shared(y_train.values)

with pm.Model() as linear_model:
  ## Flat (improper uniform on (-inf, inf)) priors for w0, w1, ..., wD
  w0 = pm.Flat('w0')
  w = pm.Flat('w', shape=X_shared.get_value().shape[1])
  
  ## Flat prior on (0, inf) for sigma
  sigma = pm.HalfFlat('sigma')

  ## Observation model for the target variable
  y_obs = pm.Normal('y_obs',
                    mu=w0+pm.math.dot(X_shared,w), 
                    sigma=sigma, 
                    observed=y_shared, 
                    shape=y_shared.get_value().shape[0])
  
  ## Run the sampler
  trace = pm.sample(
      500,        ## number of draws kept per chain (500)
      tune=500,   ## number of warm-up (tuning) steps
      cores=4     ## CPU cores used to run chains in parallel; NOTE(review): the chain count is set by `chains`, not given here - confirm its default
      )

In [None]:
## 各パラメータについて得られたサンプル列の取得
trace.get_values('w0', chains=0)

In [None]:
## 各パラメータの事後分布と trace plot を描画

#pm.traceplot(trace)
arviz.plot_trace(trace)
plt.show()

In [None]:
## サンプリングアルゴリズムの収束の度合いを表す指標を確認

#pm.gelman_rubin(trace)
arviz.summary(trace)

In [None]:
## 各パラメータについて、各chain のベイズ信頼区間を表示

#pm.forestplot(trace)
arviz.plot_forest(trace)
plt.show()

In [None]:
# Swap the validation data into the shared variables so the fitted model can
# be reused for prediction.
# NOTE(review): X_test is passed as-is here while training used
# X_train.values - confirm set_value accepts this object.
X_shared.set_value(X_test)
y_shared.set_value(np.zeros(X_test.shape[0],)) # placeholder targets with one entry per test row

with linear_model:
  ## Predictive distribution (= samples of the target for the validation data)
  post_pred = pm.sample_posterior_predictive(trace, samples=1000)

In [None]:
## サンプリング結果から求めた事後平均を予測値とすれば、機械学習による予測タスクと同様に精度検証することも可能
y_pred = post_pred['y_obs'].mean(axis=0)
print('MSE(test) = {:.2f}'.format(mean_squared_error(y_test, y_pred)))
print('R^2(test) = {:.2f}'.format(r2_score(y_test, y_pred)))

### <font color=blue>**3.** </font> ギブスサンプリング（MCMC : Markov chain Monte Carlo methods）による画像のノイズ除去

In [None]:
## 出典: https://ichi.pro/gibusu-sanpuringu-mcmc-niyoru-gazo-no-noizu-jokyo-19602944876883

In [None]:
import matplotlib.pyplot as plt
import cv2

In [None]:
img = cv2.imread("/content/img.png", 0)
img_noisy = cv2.imread("/content/img_noisy.png", 0)

plt.figure(figsize=(16,7))
plt.subplot(1,2,1)
plt.imshow(img, cmap = 'gray')
plt.subplot(1,2,2)
plt.imshow(img_noisy, cmap = 'gray')
plt.show()

In [None]:
import math
import numpy as np
import time

In [None]:
def load_image(filename):
  """Load an image as a zero-padded array of {-1, +1} pixels.

  Args:
    filename: path of the image file (read with plt.imread).

  Returns:
    Float array of shape (H + 2, W + 2): the binarised image surrounded by
    a one-pixel border of zeros.
  """
  ## Read the image into a numpy array
  my_img = plt.imread(filename)

  ## Integer-valued images are rescaled to [0, 1] so the 0.5 threshold
  ## below means the same thing as for float images.
  if np.issubdtype(my_img.dtype, np.integer):
    my_img = my_img / float(np.iinfo(my_img.dtype).max)

  ## Convert to grayscale. A 2-D array is already single channel; slicing
  ## it with [..., :3] would silently pick three image columns instead.
  if my_img.ndim == 2:
    img_gray = my_img
  else:
    img_gray = np.dot(my_img[..., :3], [0.2989, 0.5870, 0.1140])

  ## Rescale the pixels to {-1, 1}
  img_gray = np.where(img_gray > 0.5, 1, -1)

  ## Add zero padding around the edges so that border pixels can be handled
  ## like any other pixel when their neighbours are looked up
  img_padded = np.zeros([img_gray.shape[0] + 2, img_gray.shape[1] + 2])
  img_padded[1:-1, 1:-1] = img_gray
  return img_padded

$\log{P(Y|X)} = \log{P(X|Y)} + \log{P(Y)} - \log{P(X)}$

$\displaystyle P(Y, X) = \dfrac{1}{Z} \exp \left( \eta \sum_{i=1}^{N} \sum_{j=1}^{M}{x_{ij}y_{ij}} + \beta \sum_{i'j' \in N(ij)}^{} {y_{ij}y_{i'j'}} \right)$

In [None]:
def sample_y(i, j, Y, X):
  """Sample pixel y_ij from its conditional distribution (one Gibbs step).

  Args:
    i, j: row and column index of the pixel.
    Y: current restored image array.
    X: noisy image array.

  Returns:
    1 or -1, drawn with P(y_ij = 1) = 1 / (1 + exp(-2 * w)) where
    w = ETA * x_ij + BETA * (sum of the four neighbours of y_ij).
  """
  ## Markov blanket of y_ij: its four neighbours in Y plus the observed x_ij
  markov_blanket = [Y[i - 1, j], Y[i, j - 1], Y[i, j + 1], Y[i + 1, j], X[i, j]]
  w = ETA * markov_blanket[-1] + BETA * sum(markov_blanket[:4])

  ## 1 / (1 + exp(-2w)) written as (1 + tanh(w)) / 2: the same function, but
  ## tanh saturates instead of overflowing for strongly negative w.
  prob = 0.5 * (1.0 + math.tanh(w))

  ## Return the sampled value of y_ij (1 or -1)
  return (np.random.rand() < prob) * 2 - 1

$\displaystyle P(y_{ij} = 1 | y_{N(ij)}, x_{i, j}) = \cdots = \dfrac{1}{1 + \exp (-2w_{ij})}$

$\displaystyle w_{ij} = \eta x_{ij} + \beta \sum_{i'j' \in N(ij)} y_{i'j'}$

In [None]:
def get_posterior(filename, burn_in_steps, total_samples, logfile):
  """Estimate P(y_ij = 1) for every pixel by Gibbs sampling.

  Args:
    filename: path of the noisy image (loaded with load_image).
    burn_in_steps: number of initial sweeps that are not counted.
    total_samples: number of sweeps counted after burn-in.
    logfile: path of a text file that receives one line per sweep,
      "<step>\t<energy>\t<B|S>" (B = burn-in, S = sampling).

  Returns:
    Array with the shape of the padded image holding, per pixel, the
    fraction of counted sweeps in which the pixel was sampled as +1.
  """
  ## Load the noisy image X
  X = load_image(filename)

  posterior = np.zeros(X.shape)
  print("img shape: {}".format(X.shape))

  ## Randomly initialise the restored image Y
  ## NOTE(review): the one-pixel border of Y is also random and is never
  ## resampled below - confirm whether it should be zero like the border of X.
  Y = np.random.choice([1, -1], size=X.shape)
  energy_list = list()

  ## Sweep over Y repeatedly, resampling every interior pixel
  for step in range(burn_in_steps + total_samples):
    if step % 10 == 0:
      print("{}th step start".format(step+1))
    for i in range(1, Y.shape[0]-1):
      for j in range(1, Y.shape[1]-1):
        ## Sample pixel (i, j) and write the value back into Y
        y = sample_y(i, j, Y, X)
        Y[i, j] = y

        ## After burn-in, count how often y_ij = 1 was sampled
        if y == 1 and step >= burn_in_steps:
          posterior[i, j] += 1
    ## Track the energy so that convergence can be visualised. The data
    ## term uses ETA, the constant the notebook defines and sample_y uses
    ## (this line previously referenced an undefined name).
    energy = -np.sum(np.multiply(Y, X))*ETA-(np.sum(np.multiply(Y[:-1], Y[1:]))+np.sum(np.multiply(Y[:, :-1], Y[:, 1:])))*BETA
    phase = "B" if step < burn_in_steps else "S"
    energy_list.append(str(step) + "\t" + str(energy) + "\t" + phase)
  ## Monte Carlo estimate: count of +1 samples divided by the number of
  ## counted sweeps
  posterior = posterior / total_samples

  ## The context manager closes the log file even if a write fails
  with open(logfile, 'w') as file:
    for element in energy_list:
      file.write(element + '\n')
  return posterior

In [None]:
## 入力関数

def denoise_image(filename, burn_in_steps, total_samples, logfile):
  """Denoise an image by thresholding the Gibbs-sampling posterior.

  Args:
    filename: path of the noisy image.
    burn_in_steps: number of burn-in sweeps passed to get_posterior.
    total_samples: number of counted sweeps passed to get_posterior.
    logfile: path of the energy log written by get_posterior.

  Returns:
    float64 array (padding border removed) with 1 where the estimated
    posterior exceeds 0.5 and 0 elsewhere.
  """
  ## Estimated posterior probability of each pixel being +1
  prob_plus = get_posterior(filename, burn_in_steps, total_samples, logfile=logfile)

  ## Threshold the posterior at 0.5 to obtain the restored image
  restored = (prob_plus > 0.5).astype(np.float64)

  ## Drop the one-pixel border of the array
  return restored[1:-1, 1:-1]

In [None]:
def plot_energy(filename):
  """Plot the energy trace from a log file and save it as a PNG.

  Args:
    filename: path of a log with lines "<step>\t<energy>\t<B|S>". The
      figure is saved next to it, with the last four characters of the
      name replaced by ".png".
  """
  x = np.genfromtxt(filename, dtype=None, encoding='utf8')
  its, energies, phases = zip(*x)
  its = np.asarray(its)
  energies = np.asarray(energies)
  phases = np.asarray(phases)
  # Split the trace into the burn-in ('B') and sampling ('S') phases
  burn_mask = (phases == 'B')
  samp_mask = (phases == 'S')
  assert np.sum(burn_mask) + np.sum(samp_mask) == len(x), 'Found bad phase'
  its_burn, energies_burn = its[burn_mask], energies[burn_mask]
  its_samp, energies_samp = its[samp_mask], energies[samp_mask]
  p1, = plt.plot(its_burn, energies_burn, 'r')
  p2, = plt.plot(its_samp, energies_samp, 'b')
  plt.title("energy")
  plt.xlabel('iteration number')
  plt.ylabel('energy')
  plt.legend([p1, p2], ['burn in', 'sampling'])

  # Save before showing: on inline/notebook backends show() finalises the
  # figure, so a later savefig would write an empty image.
  plt.savefig('%s.png' % filename[:-4])
  plt.show()  ###
  plt.close()

In [None]:
def save_image(denoised_image):
  """Display the denoised image and save it to /content/denoise_image.png.

  Args:
    denoised_image: 2-D array drawn with a gray colormap.
  """
  plt.figure(figsize=(8,7))  ###
  plt.imshow(denoised_image, cmap='gray')
  plt.title("denoised image")

  # Save before showing: on inline/notebook backends show() finalises the
  # figure, so a later savefig would write an empty image.
  plt.savefig('/content/denoise_image.png') ###
  plt.show()  ###
  plt.close()

In [None]:
## Hyperparameters eta and beta: ETA weights the x_ij * y_ij term and BETA
## the neighbour term in sample_y
ETA = 1
BETA = 1

## Number of sampling sweeps counted after burn-in
total_samples = 180  ###

## Number of burn-in sweeps (not counted in the posterior estimate)
burn_in_steps = 20 ###

## File that receives the per-sweep energy log
logfile = "/content/log_energy.txt" ###

In [None]:
time1 = time.time()
denoised_img = denoise_image("/content/img_noisy.png",  ###
                             burn_in_steps = burn_in_steps,
                             total_samples = total_samples, 
                             logfile = logfile
                             )
print("total time: {}".format(time.time() - time1))
save_image(denoised_img)

In [None]:
# log = open("/content/log_energy.txt")
plot_energy(logfile)

### <font color=blue>**4.** </font> STS（structural time series） models with non-Gaussian observations

In [None]:
## https://www.tensorflow.org/probability/examples/STS_approximate_inference_for_models_with_non_Gaussian_observations

## Copyright 2019 The TensorFlow Probability Authors.
## Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
## Approximate inference for STS(structural time series) models with non-Gaussian observations

# This notebook demonstrates the use of TFP approximate inference tools to
# incorporate a (non-Gaussian) observation model when fitting and forecasting
# with structural time series (STS) models.
# In this example, we'll use a Poisson observation model to work with discrete count data.

In [None]:
import time
import matplotlib.pyplot as plt
import numpy as np

import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp

from tensorflow_probability import bijectors as tfb
from tensorflow_probability import distributions as tfd

tf.enable_v2_behavior()

In [None]:
## Synthetic Data
# First we'll generate some synthetic count data:

num_timesteps = 30
observed_counts = np.round(3 + np.random.lognormal(np.log(np.linspace(
    num_timesteps, 5, num=num_timesteps)), 0.20, size=num_timesteps)) 
observed_counts = observed_counts.astype(np.float32)
plt.plot(observed_counts)
plt.show()

In [None]:
## Model
# We'll specify a simple model with a randomly walking linear trend:

def build_model(approximate_unconstrained_rates):
  """Build an STS model consisting of a single local linear trend.

  Args:
    approximate_unconstrained_rates: series passed as
      `observed_time_series` to both the trend component and the sum.

  Returns:
    A tfp.sts.Sum model wrapping one LocalLinearTrend component.
  """
  series = approximate_unconstrained_rates
  components = [tfp.sts.LocalLinearTrend(observed_time_series=series)]
  return tfp.sts.Sum(components, observed_time_series=series)

Instead of operating on the observed time series, this model will operate on the series of Poisson rate parameters that govern the observations.

Since Poisson rates must be positive, we'll use a bijector to transform the
real-valued STS model into a distribution over positive values. The `Softplus`
transformation $y = \log(1 + \exp(x))$ is a natural choice, since it is nearly linear for positive values, but other choices such as `Exp` (which transforms the normal random walk into a lognormal random walk) are also possible.

In [None]:
positive_bijector = tfb.Softplus()  # Or tfb.Exp()

# Approximate the unconstrained Poisson rate just to set heuristic priors.
# We could avoid this by passing explicit priors on all model params.
approximate_unconstrained_rates = positive_bijector.inverse(
    tf.convert_to_tensor(observed_counts) + 0.01)
sts_model = build_model(approximate_unconstrained_rates)

To use approximate inference for a non-Gaussian observation model,
we'll encode the STS model as a TFP JointDistribution. The random variables in this joint distribution are the parameters of the STS model, the time series of latent Poisson rates, and the observed counts.


In [None]:
Root = tfd.JointDistributionCoroutine.Root
def sts_with_poisson_likelihood_model():
  """Coroutine describing the joint model for JointDistributionCoroutine.

  Yields, in order: the prior of every STS parameter, the state space model
  over the unconstrained rates, and the Poisson distribution of the counts.
  """
  # Encode the parameters of the STS model as random variables.
  param_vals = []
  for param in sts_model.parameters:
    param_val = yield Root(param.prior)
    param_vals.append(param_val)

  # Use the STS model to encode the log- (or inverse-softplus)
  # rate of a Poisson.
  unconstrained_rate = yield sts_model.make_state_space_model(
      num_timesteps, param_vals)
  # Drop the trailing axis and map the rate to positive values.
  rate = positive_bijector.forward(unconstrained_rate[..., 0])
  # Local name only; it does not rebind the notebook-level observed_counts.
  observed_counts = yield tfd.Independent(tfd.Poisson(rate),
      reinterpreted_batch_ndims=1)
model = tfd.JointDistributionCoroutine(sts_with_poisson_likelihood_model)

In [None]:
## Preparation for inference
# We want to infer the unobserved quantities in the model, given the observed counts.
# First, we condition the joint log density on the observed counts.

# Condition a joint log-prob on the observed counts.
target_log_prob_fn = lambda *args: model.log_prob(args + (observed_counts,))

HMC and VI inference also like to operate over unconstrained real-valued spaces, so we'll construct the list of bijectors that constrains each of the parameters to their respective supports.

In [None]:
constraining_bijectors = ([param.bijector for param in sts_model.parameters] +
                           # `unconstrained_rate` is already unconstrained, but
                           # we can speed up inference by rescaling it.
                           [tfb.Scale(positive_bijector.inverse(
                               np.float32(np.max(observed_counts / 5.))))])

In [None]:
'''Inference with HMC

We'll use HMC (specifically, NUTS) to sample from the joint posterior over model parameters and latent rates.
This will be significantly slower than fitting a standard STS model with HMC, since in addition to the model's
 (relatively small number of) parameters we also have to infer the entire series of Poisson rates. 
 So we'll run for a relatively small number of steps; for applications where inference quality is critical 
 it might make sense to increase these values or to run multiple chains.
 '''

In [None]:
# Sampler configuration

# Allow external control of sampling to reduce test runtimes.
num_results = 100
num_results = int(num_results)

num_burnin_steps = 50
num_burnin_steps = int(num_burnin_steps)

First we specify a sampler, and then use `sample_chain` to run that sampling
kernel to produce samples.

In [None]:
sampler = tfp.mcmc.TransformedTransitionKernel(
    tfp.mcmc.NoUTurnSampler(
        target_log_prob_fn=target_log_prob_fn,
        step_size=0.1),
    bijector=constraining_bijectors)

adaptive_sampler = tfp.mcmc.DualAveragingStepSizeAdaptation(
    inner_kernel=sampler,
    num_adaptation_steps=int(0.8 * num_burnin_steps),
    target_accept_prob=0.75,
    # NUTS inside of a TTK requires custom getter/setter functions.
    step_size_setter_fn=lambda pkr, new_step_size: pkr._replace(
        inner_results=pkr.inner_results._replace(step_size=new_step_size)
        ),
    step_size_getter_fn=lambda pkr: pkr.inner_results.step_size,
    log_accept_prob_getter_fn=lambda pkr: pkr.inner_results.log_accept_ratio,
)

initial_state = [b.forward(tf.random.normal(part_shape))
                 for (b, part_shape) in zip(
                     constraining_bijectors, model.event_shape[:-1])]

In [None]:
# Speed up sampling by tracing with `tf.function`.
# NOTE(review): `experimental_compile` may be deprecated in favour of
# `jit_compile` in newer TensorFlow releases - confirm for the installed
# version.
@tf.function(autograph=False, experimental_compile=True)
def do_sampling():
  """Run the adaptive sampler from `initial_state`.

  Returns whatever tfp.mcmc.sample_chain returns for the notebook-level
  `num_results` and `num_burnin_steps` settings.
  """
  return tfp.mcmc.sample_chain(
      kernel=adaptive_sampler,
      current_state=initial_state,
      num_results=num_results,
      num_burnin_steps=num_burnin_steps)

# Time the sampling run with wall-clock timestamps taken around the call.
t0 = time.time()
samples, kernel_results = do_sampling()
t1 = time.time()
elapsed_seconds = t1 - t0
print("Inference ran in {:.2f}s.".format(elapsed_seconds))

We can sanity-check the inference by examining the parameter traces. In this case they appear to have explored multiple explanations for the data, which is good, although more samples would be helpful to judge how well the chain is mixing.

In [None]:
# One trace panel per STS model parameter, laid out side by side in one row.
f = plt.figure(figsize=(15, 5))
num_params = len(sts_model.parameters)
for i, param in enumerate(sts_model.parameters):
  ax = f.add_subplot(1, num_params, i + 1)
  ax.plot(samples[i])
  panel_title = "{} samples".format(param.name)
  ax.set_title(panel_title)

Now for the payoff: let's see the posterior over Poisson rates! We'll also plot the 80% predictive interval over observed counts, and can check that this interval appears to contain about 80% of the counts we actually observed.

In [None]:
# Split the HMC draws: every part but the last holds parameter samples; the
# last part holds the latent rates in unconstrained space (trailing axis of
# size one indexed away).
param_samples = samples[:-1]
unconstrained_rate_samples = samples[-1][..., 0]
rate_samples = positive_bijector.forward(unconstrained_rate_samples)

plt.figure(figsize=(10, 4))
# 10th/90th percentiles across draws (axis 0) give a central 80% band, both
# for the rate itself and for Poisson counts simulated from each rate draw.
mean_lower, mean_upper = np.percentile(rate_samples, [10, 90], axis=0)
pred_lower, pred_upper = np.percentile(np.random.poisson(rate_samples),
                                       [10, 90], axis=0)

# Derive the x positions from the band length instead of hard-coding 30, so
# the plot keeps working when the observed series has a different length.
timesteps = np.arange(mean_lower.shape[-1])

_ = plt.plot(observed_counts, color="blue", ls='--', marker='o', label='observed', alpha=0.7)
_ = plt.plot(np.mean(rate_samples, axis=0), label='rate', color="green", ls='dashed', lw=2, alpha=0.7)
_ = plt.fill_between(timesteps, mean_lower, mean_upper, color='green', alpha=0.2)
_ = plt.fill_between(timesteps, pred_lower, pred_upper, color='grey', label='counts', alpha=0.2)
plt.xlabel("Day")
plt.ylabel("Daily Sample Size")
plt.title("Posterior Mean")
plt.legend()
plt.show()

In [None]:
## Forecasting
# To forecast the observed counts, we'll use the standard STS tools to build a forecast distribution
# over the latent rates (in unconstrained space, again since STS is designed to model real-valued data),
# then pass the sampled forecasts through a Poisson observation model:

def sample_forecasted_counts(sts_model, posterior_latent_rates,
                             posterior_params, num_steps_forecast,
                             num_sampled_forecasts):
  """Sample forecasts of future counts together with their latent rates.

  Relies on the module-level `positive_bijector` to map unconstrained rates
  to positive Poisson rates.

  Args:
    sts_model: STS model handed to `tfp.sts.forecast`.
    posterior_latent_rates: posterior samples of the latent rates in
      unconstrained space; used as the observed series for the forecast.
    posterior_params: posterior samples of the model parameters.
    num_steps_forecast: number of future steps to forecast.
    num_sampled_forecasts: number of draws taken from the forecast
      distribution.

  Returns:
    A tuple `(sampled_forecast_counts, sampled_latent_rates)`.
  """

  # Forecast the future latent unconstrained rates, given the inferred latent
  # unconstrained rates and parameters. Use the argument rather than the
  # global `unconstrained_rate_samples`, so the function honours its inputs.
  unconstrained_rates_forecast_dist = tfp.sts.forecast(sts_model,
    observed_time_series=posterior_latent_rates,
    parameter_samples=posterior_params,
    num_steps_forecast=num_steps_forecast)

  # Transform the forecast to positive-valued Poisson rates.
  rates_forecast_dist = tfd.TransformedDistribution(
      unconstrained_rates_forecast_dist,
      positive_bijector)

  # Sample from the forecast model following the chain rule:
  # P(counts) = P(counts | latent_rates)P(latent_rates)
  sampled_latent_rates = rates_forecast_dist.sample(num_sampled_forecasts)
  sampled_forecast_counts = tfd.Poisson(rate=sampled_latent_rates).sample()

  return sampled_forecast_counts, sampled_latent_rates

# Draw 100 forecast trajectories from the current posterior samples.
# NOTE: this rebinds `rate_samples` to the forecasted rates.
forecast_samples, rate_samples = sample_forecasted_counts(
    sts_model=sts_model,
    posterior_latent_rates=unconstrained_rate_samples,
    posterior_params=param_samples,
    num_steps_forecast=30,  # Days to forecast.
    num_sampled_forecasts=100)

In [None]:
# Drop every size-one axis from the forecast draws before plotting.
forecast_samples = np.squeeze(a=forecast_samples)

In [None]:
def plot_forecast_helper(data, forecast_samples, CI=90):
  """Plot the observed time series alongside the forecast.

  Args:
    data: observed series, plotted at x = 0 .. len(data) - 1.
    forecast_samples: forecast draws; the median and the interval bounds are
      taken across axis 0, and the forecast is plotted right after `data`.
    CI: width of the central forecast interval, in percent.
  """
  plt.figure(figsize=(10, 4))
  forecast_median = np.median(forecast_samples, axis=0)

  num_steps = len(data)
  num_steps_forecast = forecast_median.shape[-1]

  plt.plot(np.arange(num_steps), data, lw=2, color='blue', linestyle='--', marker='o',
           label='Observed Data', alpha=0.7)

  # Forecast x positions continue where the observed series ends.
  forecast_steps = np.arange(num_steps, num_steps+num_steps_forecast)

  # Central interval: e.g. CI=80 -> 10th and 90th percentiles.
  CI_interval = [(100 - CI)/2, 100 - (100 - CI)/2]
  lower, upper = np.percentile(forecast_samples, CI_interval, axis=0)

  # The line is the median; the shaded band is the interval. Label each
  # artist for what it shows so the legend is not misleading.
  plt.plot(forecast_steps, forecast_median, lw=2, ls='--', marker='o', color='orange',
           label='Forecast Median', alpha=0.7)
  plt.fill_between(forecast_steps,
                   lower,
                   upper, color='orange', alpha=0.2,
                   label=str(CI) + '% Forecast Interval')

  plt.xlim([0, num_steps+num_steps_forecast])
  plt.title('Observed time series with {} Day Forecast'.format(num_steps_forecast))
  plt.xlabel('Day')
  plt.ylabel('Daily Sample Size')
  plt.legend()

In [None]:
# Show the observed counts followed by the forecast with an 80% interval.
plot_forecast_helper(data=observed_counts,
                     forecast_samples=forecast_samples,
                     CI=80)

In [None]:
'''VI inference

Variational inference can be problematic when inferring a full time series, like our approximate counts 
(as opposed to just the *parameters* of a time series, as in standard STS models). 
The standard assumption that variables have independent posteriors is quite wrong, 
since each timestep is correlated with its neighbors, which can lead to underestimating uncertainty. 
For this reason, HMC may be a better choice for approximate inference over full time series. 
However, VI can be quite a bit faster, and may be useful for model prototyping or in cases 
where its performance can be empirically shown to be 'good enough'.

To fit our model with VI, we simply build and optimize a surrogate posterior:
'''

In [None]:
# Infer everything but the observed counts: drop the last event part.
latent_event_shape = model.event_shape[:-1]
surrogate_posterior = tfp.experimental.vi.build_factored_surrogate_posterior(
    event_shape=latent_event_shape,
    constraining_bijectors=constraining_bijectors)

In [None]:
# Number of optimization steps for the surrogate posterior, as a plain int.
num_variational_steps = int(200)

# Time the fit with wall-clock timestamps taken around the call.
t0 = time.time()
losses = tfp.vi.fit_surrogate_posterior(
    target_log_prob_fn,
    surrogate_posterior,
    num_steps=num_variational_steps,
    optimizer=tf.optimizers.Adam(0.1))
t1 = time.time()
vi_elapsed_seconds = t1 - t0
print("Inference ran in {:.2f}s.".format(vi_elapsed_seconds))

In [None]:
# Loss recorded by the variational fit, plotted against the step index.
plt.plot(losses)
plt.title(label="Variational loss")
_ = plt.xlabel(xlabel="Steps")

In [None]:
# Draw 50 samples from the fitted surrogate posterior and split them: every
# part but the last holds parameter samples; the last part holds the latent
# rates in unconstrained space (trailing axis of size one indexed away).
posterior_samples = surrogate_posterior.sample(50)
param_samples = posterior_samples[:-1]
unconstrained_rate_samples = posterior_samples[-1][..., 0]
rate_samples = positive_bijector.forward(unconstrained_rate_samples)

plt.figure(figsize=(10, 4))
# 10th/90th percentiles across draws (axis 0) give a central 80% band, both
# for the rate itself and for Poisson counts simulated from each rate draw.
mean_lower, mean_upper = np.percentile(rate_samples, [10, 90], axis=0)
pred_lower, pred_upper = np.percentile(
    np.random.poisson(rate_samples), [10, 90], axis=0)

# Derive the x positions from the band length instead of hard-coding 30, so
# the plot keeps working when the observed series has a different length.
timesteps = np.arange(mean_lower.shape[-1])

_ = plt.plot(observed_counts, color='blue', ls='--', marker='o',
             label='observed', alpha=0.7)
_ = plt.plot(np.mean(rate_samples, axis=0), label='rate', color='green',
             ls='dashed', lw=2, alpha=0.7)
_ = plt.fill_between(
    timesteps, mean_lower, mean_upper, color='green', alpha=0.2)
_ = plt.fill_between(timesteps, pred_lower, pred_upper, color='grey',
    label='counts', alpha=0.2)
plt.xlabel('Day')
plt.ylabel('Daily Sample Size')
plt.title('Posterior Mean')
plt.legend()
plt.show()

In [None]:
# Draw 100 forecast trajectories from the current posterior samples.
# NOTE: this rebinds `rate_samples` to the forecasted rates.
forecast_samples, rate_samples = sample_forecasted_counts(
    sts_model=sts_model,
    posterior_latent_rates=unconstrained_rate_samples,
    posterior_params=param_samples,
    num_steps_forecast=30,  # Days to forecast.
    num_sampled_forecasts=100)

In [None]:
# Drop every size-one axis from the forecast draws before plotting.
forecast_samples = np.squeeze(a=forecast_samples)

In [None]:
# Show the observed counts followed by the forecast with an 80% interval.
plot_forecast_helper(data=observed_counts,
                     forecast_samples=forecast_samples,
                     CI=80)