# Content and Objective

+ Show principle and realization of Blahut-Arimoto algorithm
+ To this end, the channel transition probabilities are fixed, e.g., by choosing a standard channel or by sampling them randomly
+ Note: The transition matrix is given as $P=\big(P(Y=j\mid X=i)\big)_{j,i}$, i.e., each column corresponds to an input symbol and each row to a possible output symbol, so every column sums to one

# Import

In [3]:
# importing
import numpy as np

from scipy import optimize

import matplotlib.pyplot as plt
import matplotlib

from time import time

# showing figures inline
%matplotlib inline

In [4]:
# global matplotlib settings: font size, LaTeX text rendering and default figure size
font = dict( size = 20 )
plt.rc( 'font', **font )
plt.rc( 'text', usetex = True )

plt.rc( 'figure', figsize = ( 18, 6 ) )

# Here we go

## Sample channel matrix

In [5]:
# choose channel
# available options:
#       'bsc'            binary symmetric channel with cross-over probability delta
#       'bec'            binary erasure channel with erasure probability delta
#       'Z'              Z channel (only the second input is disturbed)
#       'random'         randomly sampled transition matrix
#       'deterministic'  fixed (non-random) 3x3 example matrix
# in all cases: columns of P_yx correspond to inputs, rows to outputs
switch = 'deterministic'  

if switch == 'bsc':
    X = np.arange( 2 ) 
    Y = np.arange( 2 )     
    
    delta = .2
    P_yx = np.array( [ [ 1-delta, delta], [ delta, 1-delta ] ] )

elif switch == 'bec':
    X = np.arange( 2 ) 
    Y = np.arange( 3 )

    delta = .2
    P_yx = np.array( [ [ 1- delta, 0 ], [ delta, delta ], [ 0, 1-delta ] ])

elif switch == 'Z':
    X = np.arange( 2 ) 
    Y = np.arange( 2 )     
    
    delta = .2
    P_yx = np.array( [ [ 1, delta], [ 0, 1-delta ] ] )

elif switch == 'random':
    # random channel
    X = np.arange(0, 300)
    Y = np.arange(0, 50)

    # sample non-negative entries and normalize every column to sum to one
    P_yx = np.random.rand( Y.size, X.size)
    P_yx /= np.sum( P_yx, axis = 0, keepdims = True )

elif switch == 'deterministic':
    X = np.arange(3)
    Y = np.arange(3)
    P_yx = np.array( [ [ 0.7, 0.2, 0.1 ], [ 0.2, 0.1, 0.7 ], [ 0.1, 0.7, 0.2 ] ] )

else:
    # fail early instead of running into an undefined P_yx further below
    raise ValueError( f'Unknown channel type: {switch!r}' )


print( P_yx )

[[0.7 0.2 0.1]
 [0.2 0.1 0.7]
 [0.1 0.7 0.2]]


## Functions for determining mutual information, depending on P and p_X

In [6]:
# getting mutual information resulting from a given input distribution
def get_mutual_information( P, p_X ):
    '''
    Evaluating the mutual information I(X;Y) for a channel and an input distribution
    
    IN: P, channel transition matrix; cols are inputs, rows are outputs
        p_X, input distribution
        
    OUT: I, value of mutual information (in bit, since log2 is used)
    '''
    
    # first term: -(P pX)^T log2( P pX ), i.e., entropy of the output distribution P pX
    output_entropy = -np.sum( get_P_info( P @ p_X ) )

    # second term: 1^T ( P log2( P ) pX ), i.e., pointwise p log2( p ) of the channel weighted by the inputs
    weighted_channel_info = np.sum( get_P_info( P ) @ p_X )

    return output_entropy + weighted_channel_info


# determining information matrix
def get_P_info( P ):
    '''
    Determining information matrix/vector given by P log2( P ) when P equals input
    
    IN: P vector or matrix (anything that can be converted to a numerical array)
    
    OUT: P_info: same shape and pointwise p log2( p ); entries with p <= 0 are mapped to 0.0
    
    ''' 
    
    # transform to array and work on a flat view to support scalars, vectors and matrices alike
    P_array = np.asarray( P, dtype = float )
    P_flat = P_array.ravel()
    
    # evaluate p log2( p ) only where p > 0 (vectorized); all other entries keep value 0.0
    P_info = np.zeros( P_flat.shape )
    positive = P_flat > 0
    P_info[ positive ] = P_flat[ positive ] * np.log2( P_flat[ positive ] )
    
    return P_info.reshape( P_array.shape )

## Optimization with differential evolution for comparison

In [7]:
# define differential_evolution
def diff_evo( func, n, p_cross = 0.9, step_size = 0.8, pop_size = 100, n_trials = 100 ):
    '''
    performing differential evolution on the probability simplex
    description and standard values due to https://en.wikipedia.org/wiki/Differential_evolution
    
    All agents are non-negative vectors normalized to sum 1. Candidates with any component 
    outside [0,1] are discarded; accepted candidates are re-normalized before being evaluated.
    
    IN: func: function to be minimized (mapping a length-n vector to a scalar), 
        n: problem dimension n, 
        p_cross: cross-over probability, 
        step_size: step size , 
        pop_size: population size (at least 4),
        n_trials: number of trials for minimization
        
    OUT: min_val: minimum value
        arg_min: argument of minimum
        
    RAISES: ValueError if pop_size < 4 (three donors different from the parent are needed)
    '''
    
    if pop_size < 4:
        raise ValueError( 'pop_size has to be at least 4 since three donors different from the parent are required' )
    
    # random initial population, every agent normalized to be a probability vector
    agents = np.random.rand( pop_size, n )
    agents /= np.sum( agents, axis = 1, keepdims = True )
    
    # cache objective values such that the parent does not have to be re-evaluated in every comparison
    values = np.array( [ func( agents[ i, : ] ) for i in range( pop_size ) ], dtype = float )
    
    all_indices = np.arange( pop_size )
    
    for _n in range( n_trials ):
        
        for ind_agent in range( pop_size ):
            
            # get parent agent
            x = agents[ ind_agent, : ]
            
            # select three other agents by sampling three distinct indices out of ALL indices 
            # except ind_agent (index 0 has to be included as possible donor)
            abc_ind = np.random.choice( np.delete( all_indices, ind_agent ), size = 3, replace = False )
            
            a = agents[ abc_ind[ 0 ], : ]
            b = agents[ abc_ind[ 1 ], : ]
            c = agents[ abc_ind[ 2 ], : ]

            # select dimension to be retained
            R = np.random.randint( n )
            
            # check whether coefficient should be retained (happens with probability 1 - p_cross)
            retain = 1 * ( np.random.rand( n ) > p_cross )

            # get new value of y by altering only indices as determined before
            y = ( 1 - retain ) * ( a + step_size * ( b - c ) ) + retain * x
            
            # avoid negative values and values greater 1
            if np.any( y < 0 ) or np.any( y > 1 ):
                continue

            # component R is always taken from the parent; afterwards project back to sum 1
            y[ R ] = x[ R ]
            y /= np.sum( y )
            
            # check y against x and keep the candidate if it is not worse
            value_y = func( y )
            if value_y <= values[ ind_agent ]:
                agents[ ind_agent, : ] = y
                values[ ind_agent ] = value_y
    
    # get minimum value and return min_value and arg min
    ind_best = np.argmin( values )
    
    return values[ ind_best ], np.array( agents[ ind_best, : ] )

## Optimize using scipy optimize

In [8]:
# solving optimization problem for getting optimizing input distribution
def solve_for_p_X_scipy( P ):
    '''
    Determining input distribution maximizing mutual information by means of scipy.optimize.minimize
    
    The negative mutual information is minimized subject to sum( p ) = 1 and 0 <= p <= 1, 
    starting from a random point on the probability simplex.
    
    IN: P, transition matrix of channel; cols are inputs, rows are outputs
    
    OUT: result object as returned by optimize.minimize; 
         the optimizing input distribution is contained in its attribute x
    '''
    
    # random starting point on the probability simplex
    N = np.shape(P)[1]
    x_0 = np.random.rand(N)
    x_0 /= np.sum(x_0)
    
    # P log2( P ) does not depend on the optimization variable, so determine it only once
    P_info = get_P_info( P )
    
    # objective: negative mutual information resulting from a given input distribution
    def get_I( p ):    
        
        # -(P p)^T log2( (P p) ) + 1^T ( P log2( P ) p )
        I_1 = -np.sum( get_P_info( P @ p ) )  
        I_2 =  np.sum( P_info @ p ) 

        return -( I_1 + I_2 )
    
    # equality constraint: probabilities have to sum up to 1
    def sum_x_eq_1(p):
        return np.sum(p) - 1.0
    
    cons = {'type':'eq', 'fun': sum_x_eq_1}
    bds = optimize.Bounds( 0, 1, keep_feasible=True )
    
    p_X = optimize.minimize( get_I, x_0, constraints=cons, bounds = bds )

    return p_X

## Helper functions for Blahut-Arimoto

In [9]:
# getting Q given P and p_X
def get_Q_xy( P, p_X ):
    '''
        determines Q as provided by Blahut-Arimoto, i.e.,
        Q[ x, y ] = p_X[ x ] * P[ y, x ] / sum_x' ( p_X[ x' ] * P[ y, x' ] )

        IN: P, channel transition matrix; cols are inputs, rows are outputs
            p_X, input distribution (vector of length |X|)
        OUT: Q, |X| x |Y| matrix; columns belonging to outputs of probability zero are set to 0
    '''
    # only use the arguments (no global channel or alphabets), so any channel can be passed
    P = np.asarray( P, dtype = float )
    p_X = np.asarray( p_X, dtype = float )

    # numerator p_X[ x ] * P[ y, x ], indexed [ y, x ], and output distribution as denominator
    joint_yx = P * p_X
    p_Y = P @ p_X

    # init Q as |X| x |Y| matrix and divide only where the output probability is positive
    Q = np.zeros( np.shape( P )[::-1] )
    np.divide( joint_yx.T, p_Y, out = Q, where = ( p_Y > 0 ) )

    return Q

# getting p_X given P and Q
def get_p_X( P, Q ):
    '''
        determines updated input distribution as provided by Blahut-Arimoto, i.e.,
        p_X[ x ] = prod_y Q[ x, y ]**P[ y, x ] / sum_x' prod_y Q[ x', y ]**P[ y, x' ]

        IN: P, channel transition matrix; cols are inputs, rows are outputs
            Q, |X| x |Y| matrix
        OUT: p_X, vector of length |X|
    '''
    # only use the arguments (no global alphabets), so any channel can be passed
    P = np.asarray( P, dtype = float )
    Q = np.asarray( Q, dtype = float )

    # unnormalized weights prod_y Q[ x, y ]**P[ y, x ]; computed once and used for
    # numerator as well as denominator (note that 0**0 evaluates to 1)
    weights = np.prod( Q.T ** P, axis = 0 )

    return weights / np.sum( weights )

    
#Q = get_Q_xy( P_yx, np.ones(X.size)/X.size )
#p_X = get_p_X( P_yx, Q)


## Actual algorithm of Blahut-Arimoto

In [34]:
# define Blahut-Arimoto Algorithm
def Blahut_Arimoto( P, max_iterations = 1e2, tol = None, verbose = True ):
    '''
        performs alg. of Blahut-Arimoto by alternately updating Q (get_Q_xy) and p_X (get_p_X)

        IN: P, channel transition matrix; cols are inputs, rows are outputs
            max_iterations, max. number of iterations
            tol, if not None: stop as soon as no entry of p_X changes by more than tol 
                 within one iteration (default None: always perform max_iterations iterations)
            verbose, if True: print Q and the current p_X in every iteration
        OUT: p_X, matrix whose rows are the input distributions of all iterations;
             first row is the initial distribution, last row the final estimate
             (capacity is NOT returned; it can be obtained by evaluating the 
             mutual information for the last row)
    '''
    # number of inputs is taken from the channel matrix itself instead of a global alphabet
    n_inputs = np.shape( P )[1]

    # initial distribution: intentionally non-uniform (0.1 everywhere, remaining mass on first input)
    p_X = 0.1 * np.ones( ( 1, n_inputs ) )
    p_X[ 0, 0 ] = 1.0 - ( n_inputs - 1 ) * 0.1

    # for more than 10 inputs the construction above would not be a valid distribution
    # with all-positive entries; use the uniform distribution in this case
    if p_X[ 0, 0 ] <= 0:
        p_X[ 0, : ] = 1.0 / n_inputs
    
    i = 0

    # loop for a max. number of times (and stop early if tol is given and p has not changed)
    while i < max_iterations:

        # get Q
        Q = get_Q_xy( P, p_X[ i ])

        if verbose:
            print(Q)

        # get new p_X (using the channel passed as argument)
        p_X_new = get_p_X( P, Q )

        # append probabilities as row to p_X
        p_X = np.vstack( [ p_X, p_X_new ] )
        
        if verbose:
            print(p_X[i])

        # increase counter
        i += 1

        # optional stopping criterion
        if tol is not None and np.max( np.abs( p_X[ i ] - p_X[ i - 1 ] ) ) <= tol:
            break

    return p_X


# Comparison of Blahut-Arimoto to differential evolution and scipy optimization

In [35]:
# run Blahut-Arimoto for 50 iterations; rows of the result are the input distributions of the iterations
p_X = Blahut_Arimoto( P_yx, max_iterations = 50 )

# last row is used as maximizing input distribution and evaluated w.r.t. mutual information
p_maximizer_BA = p_X[ -1 ]
C_BA = get_mutual_information( P_yx, p_maximizer_BA )

[[0.94915254 0.66666667 0.47058824]
 [0.03389831 0.04166667 0.41176471]
 [0.01694915 0.29166667 0.11764706]]
[0.8 0.1 0.1]
[[0.9086146  0.52704465 0.31936732]
 [0.0625748  0.06351923 0.53886045]
 [0.0288106  0.40943613 0.14177223]]
[0.6835283  0.16475716 0.15171454]
[[0.86239823 0.41599258 0.22778661]
 [0.09496844 0.0801669  0.61456205]
 [0.04263333 0.50384052 0.15765134]]
[0.57754222 0.22259902 0.19985876]
[[0.82006855 0.34037689 0.1767302 ]
 [0.12391619 0.09000695 0.6542666 ]
 [0.05601525 0.56961616 0.1690032 ]]
[0.49825477 0.26351019 0.23823504]
[[0.78611691 0.29174132 0.14781689]
 [0.14644766 0.09511112 0.67466074]
 [0.06743543 0.61314757 0.17752237]]
[0.44395032 0.2894661  0.26658358]
[[0.76074416 0.26065205 0.13071621]
 [0.16280848 0.09761973 0.68538393]
 [0.07644737 0.64172822 0.18389986]]
[0.40775149 0.30542318 0.28682533]
[[0.74245867 0.2405462  0.12017112]
 [0.17431507 0.09883237 0.69123995]
 [0.08322626 0.66062143 0.18858893]]
[0.38367078 0.31527492 0.3010543 ]
[[0.729527   

In [27]:
# apply differential evolution
# parameters: cross-over probability, step size, population size and number of trials
p_cross, step_size = 0.9, 0.8
pop_size, n_trials = 30, 100

# objective with fixed channel P_yx; sign is flipped since diff_evo minimizes
def func( p ):
    return - get_mutual_information( P_yx, p )
    
# problem dimension equals number of input symbols
n = X.size

mv, ma = diff_evo( func, n, p_cross = p_cross, step_size = step_size, 
                   pop_size = pop_size, n_trials = n_trials )

# arg min of the negative mutual information and corresponding mutual information
p_maximizer_de = ma
C_de = get_mutual_information( P_yx, p_maximizer_de )


In [28]:
# apply scipy optimization; optimizer of the returned result is contained in attribute x
p_maximizer_sp = np.reshape( solve_for_p_X_scipy( P_yx ).x, X.size )

# mutual information achieved by the distribution found by scipy
C_sp = get_mutual_information( P_yx, p_maximizer_sp )

In [29]:
# closed-form reference values are only printed for BSC and BEC
if switch in ( 'bsc', 'bec' ):

    if switch == 'bsc':
        # one minus binary entropy of delta
        val = 1 - ( - delta * np.log2( delta ) - ( 1-delta ) * np.log2( 1-delta ) )
    else:
        val = 1 - delta

    print('Theory:')
    print('-------------------------')
    print( f'Capacity is \t\tC = { val }') 
    print( f'Capacity achieved by \tp_X = { np.ones( X.size )/X.size }\n\n' )


# values found by the three numerical methods
for headline, capacity in ( 
        ( 'Result of Blahut-Arimoto:', C_BA ), 
        ( '\n\nResult of differential evolution:', C_de ), 
        ( '\n\nResult of scipy optimize:', C_sp ) ):

    print( headline )
    print('-------------------------')
    print( f'Capacity is \t\tC = { capacity }') 

# mutual information of the uniform input distribution for comparison
p_uni = np.ones_like( p_maximizer_de ) / len( p_maximizer_de )
C_uni = get_mutual_information( P_yx, p_uni )

print('\n\nResult when assuming uniform distribution:')
print('-------------------------')
print( f'Mutual information is \t\tC = { C_uni }') 



Result of Blahut-Arimoto:
-------------------------
Capacity is 		C = 0.4281828512741166


Result of differential evolution:
-------------------------
Capacity is 		C = 0.42818285127411726


Result of scipy optimize:
-------------------------
Capacity is 		C = 0.4281825830138868


Result when assuming uniform distribution:
-------------------------
Mutual information is 		C = 0.4281828512741166
