# Maximum Likelihood Estimation with Bernoulli Distribution

## Import modules

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import time
import glob

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from IPython import display

import tensorflow as tf
# Turn on eager execution so ops are evaluated immediately; the cells below
# rely on this to call `.numpy()` on tensors and to iterate the dataset in a
# plain Python `for` loop.
tf.enable_eager_execution()

# Restrict the CUDA devices visible to this process to device index 0.
# NOTE(review): this is set after TensorFlow has been imported — confirm it
# takes effect before the GPU is first initialised in your environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Setting hyperparameters

In [None]:
# Training configuration used by the input pipeline, optimizer and loop below.
max_epochs = 10      # number of passes over the training dataset
batch_size = 128     # number of samples per mini-batch
learning_rate = 0.1  # step size handed to the gradient-descent optimizer

## Make a toy dataset (Bernoulli distribution)

**Bernoulli distribution**

$X$ is a random variable that takes the value $1$ with probability $p$ and the value $0$ with probability $q = 1 - p$:
$$\Pr(X=1)=p=1-\Pr(X=0)=1-q$$

**Probability mass function**
$$f(k;p)={\begin{cases}p&{\text{if }}k=1,\\q=1-p&{\text{if }}k=0.\end{cases}}$$
or
$$f(k;p)=p^{k}(1-p)^{1-k} \qquad \qquad {\text{for }}k\in \{0,1\}$$

In [None]:
# Ground-truth success probability and sample count for the toy dataset.
true_p = 0.7
N = 10000

# A binomial with a single trial is a Bernoulli draw. Cast the 0/1 outcomes to
# float32 and add a trailing axis so the array has shape (N, 1).
train_data = (
    np.random.binomial(n=1, p=true_p, size=N)
    .astype(np.float32)[:, np.newaxis]
)

## Set up dataset with `tf.data`

### Create the input pipeline with `tf.data.Dataset`

In [None]:
# Training input pipeline: one element per sample, shuffled with a buffer as
# large as the dataset, then grouped into batches of `batch_size` (a final
# partial batch is dropped).
N = len(train_data)
train_dataset = (
    tf.data.Dataset.from_tensor_slices(train_data)
    .shuffle(buffer_size=N)
    .batch(batch_size=batch_size, drop_remainder=True)
)
print(train_dataset)

## Create the parameters to learn

**Bernoulli distribution**
$$f(k;p)=p^{k}(1-p)^{1-k} \qquad \qquad {\text{for }}k\in \{0,1\}$$

**Variables**

* `logp`: $\log(p)$, the trainable parameter; the estimate of $p$ is recovered as $\exp(\text{logp})$

In [None]:
# Trainable parameter holding log(p); starting at -1.0 means p starts at exp(-1).
logp = tf.Variable(initial_value=-1.0)

In [None]:
def log_pmf(sample, logp):
  """Return the Bernoulli log-probability of `sample` given `logp` = log(p).

  Evaluates log f(k; p) = k * log(p) + (1 - k) * log(1 - p) directly in log
  space. Unlike taking the log of `p**k` and `(1 - p)**(1 - k)` with an
  epsilon added to each factor, this does not bias every term by the epsilon
  and does not produce NaN when `logp` drifts above 0 during training.

  Args:
    sample: observed outcome(s) k, expected to be 0. or 1. (float tensor or
      array; must broadcast against `logp`).
    logp: log of the success probability p (scalar tensor or variable).

  Returns:
    Tensor of element-wise log-probabilities, broadcast from the inputs.
  """
  epsilon = 1e-7
  # Floor (1 - p) at epsilon so the log stays finite when p reaches, or
  # overshoots, 1 (i.e. logp >= 0).
  log_q = tf.log(tf.maximum(1. - tf.math.exp(logp), epsilon))
  return sample * logp + (1. - sample) * log_q

## Define the loss functions and the optimizer

In [None]:
# Plain (non-momentum) gradient descent with the step size configured above.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

## Training

In [None]:
print('Start Training.')
global_step = tf.train.get_or_create_global_step()
num_batches_per_epoch = N // batch_size
loss_history = []

# Report the initial estimate of p before any update has been applied.
print("Epochs: {:.2f} global_step: {}  p: {:.3g}".format(
    0., 0, tf.math.exp(logp).numpy()))

for epoch in range(max_epochs):
  for step, samples in enumerate(train_dataset):
    # Forward pass: mean negative log-likelihood of the mini-batch.
    with tf.GradientTape() as tape:
      negative_log_likelihood = -tf.reduce_mean(log_pmf(samples, logp))
    loss_history.append(negative_log_likelihood)

    # Backward pass and a single optimizer update of `logp`; `global_step`
    # is passed along so the optimizer advances it.
    gradients = tape.gradient(negative_log_likelihood, [logp])
    optimizer.apply_gradients(zip(gradients, [logp]), global_step=global_step)

    # Fractional epoch counter used only for logging.
    epochs = epoch + float(step) / num_batches_per_epoch
    current_step = global_step.numpy()
    if current_step % 50 != 0:
      continue

    # Log every 50 optimizer steps. In a notebook, call
    # display.clear_output(wait=True) here to overwrite the previous line.
    print("Epochs: {:.2f} global_step: {} loss: {:.3g}  p: {:.3g}".format(
        epochs,
        current_step,
        negative_log_likelihood.numpy(),
        tf.math.exp(logp).numpy()))

print('Training Done.')

## Print the results

In [None]:
# Compare the learned probability, exp(logp), with the value used to
# generate the data.
estimated_p = tf.math.exp(logp).numpy()
print("Results")
print("p: {:.3g}".format(estimated_p))
print("true p: {:.3g}".format(true_p))