# Content and Objective

+ Show principle and realization of the Blahut-Arimoto algorithm
+ To this end, channel transition probabilities are fixed, e.g., by randomly sampling them
+ Note: The transition matrix is given as $P = \big( P(Y=j \mid X=i) \big)_{ji}$, i.e., each column corresponds to an input and each row to a possible output, so every column sums to 1

# Import

In [1]:
# importing
import numpy as np

from scipy import optimize

import matplotlib.pyplot as plt
import matplotlib

from time import time

# showing figures inline
%matplotlib inline

In [2]:
# global plotting options: font size, LaTeX text rendering and default figure size
font = dict( size = 20 )
matplotlib.rc( 'font', **font )
matplotlib.rc( 'text', usetex = True )
matplotlib.rc( 'figure', figsize = ( 18, 6 ) )

# Here we go

## Sample channel matrix

In [3]:
# choose channel
# currently: 
#       BSC
#       random 
switch = 'random'  

if switch == 'bsc':
    X = np.arange( 2 ) 
    Y = np.arange( 2 )     
    
    delta = .2
    P_yx = np.array( [ [ 1-delta, delta], [ delta, 1-delta ] ] )

elif switch == 'bec':
    X = np.arange( 2 ) 
    Y = np.arange( 3 )

    delta = .2
    P_yx = np.array( [ [ 1- delta, 0 ], [ delta, delta ], [ 0, 1-delta ] ])

elif switch == 'Z':
    X = np.arange( 2 ) 
    Y = np.arange( 2 )     
    
    delta = .5
    P_yx = np.array( [ [ 1, delta], [ 0, 1-delta ] ] )

elif switch == 'random':
        # random channel
    X = np.arange(0, 20)
    Y = np.arange(0, 20)


    P_yx = np.random.rand( Y.size, X.size)
    for x in X:
        P_yx[ :, x ] /= np.sum( P_yx[ :, x ] )

elif switch == 'symmetrical':

    X = np.arange( 10 )
    Y = np.arange( 10 )

    prob = .5 * 1 / X.size
    P_yx = np.full((Y.size, X.size), prob)

    # Adjust diagonal elements to ensure row and column sums are 1
    for i in range(X.size):
        P_yx[i, i] = 1 - prob * (X.size - 1)

print( P_yx )


[[4.49020442e-02 5.35023127e-02 6.55453465e-02 5.49587565e-02
  4.25592319e-02 2.51130192e-02 4.67272380e-02 3.36813382e-02
  2.65561217e-02 6.61894623e-02 3.15025738e-03 4.23245429e-02
  8.88011859e-02 7.85792433e-03 1.73048698e-02 7.12103118e-03
  9.72520556e-03 6.20196589e-02 4.57265196e-02 8.78317545e-02]
 [1.20460494e-02 6.07103246e-02 4.36503883e-02 5.00884286e-02
  2.85292982e-02 1.88683476e-02 6.58269150e-02 7.88663937e-02
  4.38165654e-02 4.58570319e-02 3.47908155e-03 6.89005639e-02
  1.63381367e-02 4.49255433e-02 9.53352673e-02 3.79132156e-02
  7.55540284e-02 4.42885420e-02 7.80593716e-02 6.96420899e-02]
 [7.30075795e-02 5.79986597e-03 3.33756060e-02 1.36208645e-02
  3.59190057e-02 6.14961372e-02 7.37926416e-02 7.21569817e-02
  3.72779226e-02 4.65792798e-03 3.89847324e-02 7.06422947e-02
  7.69477032e-03 3.49210072e-02 4.90255845e-02 5.34022882e-02
  5.55069048e-02 2.30676868e-02 4.08633644e-02 8.56384560e-02]
 [3.03100238e-02 3.64250716e-02 8.38071502e-02 9.45001505e-02
  7.2

## Functions for determining mutual information, depending on P and p_X

In [4]:
# getting mutual information resulting from a given input distribution
def get_mutual_information( P, p_X ):
    '''
    Evaluating the mutual information (in bit) of a given channel and a given input distribution

    The value is obtained as sum of two terms:
        -(P p_X)^T log2( P p_X ), being the entropy of the output distribution P p_X
        1^T ( P log2( P ) ) p_X, being the negative conditional entropy of the output given the input

    IN: P, channel transition matrix; cols are inputs, rows are outputs
        p_X, input distribution

    OUT: I, value of mutual information
    '''

    # output distribution induced by the input distribution
    p_Y = P @ p_X

    # entropy of the output
    entropy_out = -np.sum( get_P_info( p_Y ) )

    # negative conditional entropy, averaged w.r.t. the input distribution
    neg_entropy_cond = np.sum( get_P_info( P ) @ p_X )

    return entropy_out + neg_entropy_cond


# determining information matrix
def get_P_info( P ):
    '''
    Determining information matrix/vector given by P log2( P ) when P equals input

    Entries that are not strictly positive are mapped to 0, 
    which corresponds to the usual convention 0 log2( 0 ) = 0.

    IN: P vector or matrix (array-like)

    OUT: P_info: float array of same shape and pointwise p log2( p )

    '''

    P_array = np.asarray( P, dtype = float )

    # only strictly positive entries contribute
    positive = P_array > 0

    # replace all other entries by 1 (log2( 1 ) = 0) such that the logarithm is never
    # evaluated at invalid arguments; evaluation is vectorized instead of looping in Python
    P_safe = np.where( positive, P_array, 1.0 )

    return np.where( positive, P_safe * np.log2( P_safe ), 0.0 )

## Optimization with differential evolution for comparison

In [5]:
# define differential_evolution
def diff_evo( func, n, p_cross = 0.9, step_size = 0.8, pop_size = 100, n_trials = 100 ):
    '''
    performing differential evolution on probability vectors (non-negative entries summing to 1)
    description and standard values due to https://en.wikipedia.org/wiki/Differential_evolution

    Differing from the textbook version, candidates are renormalized to sum 1, candidates 
    with entries outside [0,1] are discarded, and one randomly chosen coordinate is always 
    taken over from the parent.

    IN: func: function to be minimized, 
        n: problem dimension n, 
        p_cross: cross-over probability, 
        step_size: step size , 
        pop_size: population size (at least 4, since three donors different from the parent are required),
        n_trials: number of trials for minimization

    OUT: min_val: minimum value
        arg_min: argument of minimum

    RAISES: ValueError if pop_size is smaller than 4
    '''

    if pop_size < 4:
        raise ValueError( 'pop_size has to be at least 4 (parent plus three distinct donors)' )

    # init agents as random probability vectors, i.e., every row sums to 1
    agents = np.random.rand( pop_size, n )
    agents /= np.sum( agents, axis = 1, keepdims = True )

    # objective values of all agents; kept up to date in order to avoid
    # re-evaluating func for the parent in every trial
    values = np.array( [ func( agents[ i, : ] ) for i in range( pop_size ) ], dtype = float )

    for _n in range( n_trials ):

        for ind_agent in range( pop_size ):

            # get parent agent
            x = agents[ ind_agent, : ]

            # select three other agents by sampling three distinct indices unequal to ind_agent;
            # candidates start at index 0, so that the first agent may serve as donor as well
            candidates = np.concatenate( ( np.arange( ind_agent ), np.arange( ind_agent + 1, pop_size ) ) )
            abc_ind = np.random.choice( candidates, size = 3, replace = False )

            a, b, c = agents[ abc_ind ]

            # select dimension to be retained
            R = np.random.randint( n )

            # check whether coefficient should be retained
            retain = np.random.rand( n ) > p_cross

            # get new value of y: retained coefficients are those of x, all others are mutated
            y = np.where( retain, x, a + step_size * ( b - c ) )

            # avoid negative values and values greater 1
            if np.any( y < 0 ) or np.any( y > 1 ):
                continue

            # keep coordinate R of the parent and project back to sum 1
            y[ R ] = x[ R ]
            y /= np.sum( y )

            # check y against x
            value_y = func( y )
            if value_y <= values[ ind_agent ]:
                agents[ ind_agent, : ] = y
                values[ ind_agent ] = value_y

    # get minimum value and return min_value and arg min
    ind_min = np.argmin( values )

    return values[ ind_min ], np.array( agents[ ind_min, : ] )

## Optimize using scipy optimize

In [6]:
# solving optimization problem for getting optimizing input distribution
def solve_for_p_X_scipy( P ):
    '''
    Determining input distribution maximizing mutual information by constrained 
    numerical optimization, starting at a randomly chosen probability vector

    IN: P, transition matrix of channel; cols are inputs, rows are outputs

    OUT: result object as returned by scipy.optimize.minimize; 
         its attribute x holds the optimizing input distribution and 
         its attribute fun the negative mutual information at x
    '''

    # random starting point being a probability vector
    N = np.shape( P )[ 1 ]
    x_0 = np.random.rand( N )
    x_0 /= np.sum( x_0 )

    # P log2( P ) does not depend on the optimization variable, so it is determined only once
    P_info = get_P_info( P )

    # objective: negative mutual information resulting from a given input distribution
    def get_I( p ):

        # -(P p)^T log2( (P p) ) + 1^T ( P log2( P ) p )
        I_1 = -np.sum( get_P_info( P @ p ) )
        I_2 = np.sum( P_info @ p )

        I = I_1 + I_2

        return -I

    # probabilities have to sum up to 1 ...
    def sum_x_eq_1( p ):
        return np.sum( p ) - 1.0

    cons = { 'type': 'eq', 'fun': sum_x_eq_1 }

    # ... and have to be within [0,1]
    bds = optimize.Bounds( 0, 1, keep_feasible = True )

    result = optimize.minimize( get_I, x_0, constraints = cons, bounds = bds )

    return result

## Helper functions for Blahut-Arimoto

In [7]:
# getting Q given P and p_X
def get_Q_xy( P, p_X ):
    '''
        determines Q as provided by Blahut-Arimoto, i.e.,
            Q[ x, y ] = p_X[ x ] P[ y, x ] / ( P p_X )[ y ]

        Only the arguments are used, so the function does not depend on 
        globally defined alphabets or channel matrices.

        IN: P, channel transition matrix; cols are inputs, rows are outputs
            p_X, input distribution
        OUT: Q, |X| x |Y| matrix; columns belonging to outputs 
             of probability zero are set to zero
    '''
    P = np.asarray( P, dtype = float )
    p_X = np.asarray( p_X, dtype = float )

    # numerator p_X[ x ] P[ y, x ], arranged as |X| x |Y| matrix
    joint = ( P * p_X ).T

    # denominator ( P p_X )[ y ]; determined once instead of once per matrix entry
    p_Y = P @ p_X

    # divide columnwise; outputs with probability zero keep the initial value 0 instead of 0/0
    Q = np.zeros( joint.shape )
    np.divide( joint, p_Y, out = Q, where = p_Y > 0 )

    return Q

# getting p_X given P and Q
def get_p_X( P, Q ):
    '''
        determines input distribution as provided by Blahut-Arimoto, i.e.,
            p_X[ x ] = r[ x ] / sum( r )   with   r[ x ] = prod_y Q[ x, y ]**P[ y, x ]

        Only the arguments are used, so the function does not depend on 
        globally defined alphabets.

        IN: P, channel transition matrix; cols are inputs, rows are outputs
            Q, |X| x |Y| matrix
        OUT: p_X, vector of length |X|
    '''
    P = np.asarray( P, dtype = float )
    Q = np.asarray( Q, dtype = float )

    # unnormalized weights r[ x ]; product is taken along the outputs
    # NOTE: numpy evaluates 0**0 as 1, so pairs with P[ y, x ] = 0 do not contribute
    weights = np.prod( np.power( Q.T, P ), axis = 0 )

    # normalize in order to get probabilities
    return weights / np.sum( weights )

    
#Q = get_Q_xy( P_yx, np.ones(X.size)/X.size )
#p_X = get_p_X( P_yx, Q)


## Actual algorithm of Blahut-Arimoto

In [8]:
# define Blahut-Arimoto Algorithm
def Blahut_Arimoto( P, max_iterations = 1e2, tol = None ):
    '''
        performs alg. of Blahut-Arimoto, starting with the uniform input distribution
        and alternately updating Q (by get_Q_xy) and p_X (by get_p_X)

        IN: P, channel transition matrix; cols are inputs, rows are outputs
            max_iterations, max. number of iterations
            tol, optional threshold for stopping early: iteration ends as soon as no entry 
                 of p_X changes by more than tol; default None always performs max_iterations iterations
        OUT: p_X, matrix containing the input distributions of all iterations as rows; 
             first row is the uniform initialization, last row is the final result
    '''
    # size of input alphabet is given by the number of columns of P
    n_inputs = np.shape( P )[ 1 ]

    # initial distribution; iterates are collected in a list and stacked only once at the end
    iterates = [ np.ones( n_inputs ) / n_inputs ]

    i = 0

    # loop for a max. number of times (and stop if tol is given and p has not changed)
    while i < max_iterations:

        # get Q
        Q = get_Q_xy( P, iterates[ -1 ] )

        # get new p_X
        p_X_new = get_p_X( P, Q )

        # append probabilities as new row
        iterates.append( p_X_new )

        # increase counter
        i += 1

        # optional early stopping
        if tol is not None and np.max( np.abs( iterates[ -1 ] - iterates[ -2 ] ) ) <= tol:
            break

    return np.vstack( iterates )


# Comparison of Blahut-Arimoto to differential evolution and scipy optimization

In [9]:
# run Blahut-Arimoto with at most 100 iterations
p_X = Blahut_Arimoto( P_yx, max_iterations = 100 )

# last row is the final iterate; mutual information at this distribution is used as capacity value
p_maximizer_BA = p_X[ -1 ]
C_BA = get_mutual_information( P = P_yx, p_X = p_maximizer_BA )

In [10]:
# parameters of differential evolution
p_cross, step_size = 0.9, 0.8
pop_size, n_trials = 30, 100

# problem dimension equals size of the input alphabet
n = X.size

# objective to be minimized: negative mutual information for the fixed channel P_yx
def func( p ):
    mutual_information = get_mutual_information( P_yx, p )
    return - mutual_information

# mv: minimum value found, ma: according argument
mv, ma = diff_evo( func, n, p_cross = p_cross, step_size = step_size, pop_size = pop_size, n_trials = n_trials )

p_maximizer_de = ma
C_de = get_mutual_information( P_yx, p_maximizer_de )


In [11]:
# numerical optimization by scipy; attribute x of the result holds the optimizing argument
result_sp = solve_for_p_X_scipy( P_yx )
p_maximizer_sp = np.reshape( result_sp.x, X.size )

C_sp = get_mutual_information( P_yx, p_maximizer_sp )

In [12]:
# separator line used below every headline
rule = '-------------------------'

# theoretical reference values are only provided for BSC and BEC
if switch in ( 'bsc', 'bec' ):

    if switch == 'bsc':
        # one minus binary entropy of delta
        h_delta = - delta * np.log2( delta ) - ( 1-delta ) * np.log2( 1-delta )
        val = 1 - h_delta
    else:
        val = 1 - delta

    print( 'Theory:' )
    print( rule )
    print( f'Capacity is \t\tC = {val}' )
    print( f'Capacity achieved by \tp_X = {np.ones( X.size )/X.size}\n\n' )


# results of the three optimization approaches
print( 'Result of Blahut-Arimoto:' )
print( rule )
print( f'Capacity is \t\tC = {C_BA}' )
#print( f'Capacity achieved by \tp_X = { p_maximizer_BA }' )

print( '\n\nResult of differential evolution:' )
print( rule )
print( f'Capacity: \t\tC = {C_de}' )
#print( f'Capacity achieved by \tp_X = { p_maximizer_de }' )

print( '\n\nResult of scipy optimize:' )
print( rule )
print( f'Capacity: \t\tC = {C_sp}' )
#print( f'Capacity achieved by \tp_X = { p_maximizer_sp }' )

# mutual information of the uniform input distribution as reference
n_inputs = len( p_maximizer_de )
p_uni = np.ones_like( p_maximizer_de ) / n_inputs
C_uni = get_mutual_information( P_yx, p_uni )

print( '\n\nResult when assuming uniform distribution:' )
print( rule )
print( f'Mutual information: \tC = {C_uni}' )
#print( f'Mutual information achieved by \tp_X = { p_uni }' )
#print( f'Mutual information achieved by \tp_X = { p_uni }' )



Result of Blahut-Arimoto:
-------------------------
Capacity is 		C = 0.28886635134644933


Result of differential evolution:
-------------------------
Capacity: 		C = 0.2583514781402858


Result of scipy optimize:
-------------------------
Capacity: 		C = 0.2909899321582152


Result when assuming uniform distribution:
-------------------------
Mutual information: 	C = 0.23878784516174711
