# Correlation Complexity $\eta$ 
$\eta=\sum_{m=1}^{\infty}(m-1)k_m$  

with  

$k_m=\sum_{\sigma}p(\sigma)K[P_0;P]=\sum_{\sigma_m}p(\sigma_m) \sum_{x_m}p(x_m | \sigma_m)log(\frac{p(x_m | \sigma_m)}{p(x_m | \sigma_{m-1})})$  

$p(a|b) = \frac{p(a \& b)}{p(b)}$

In [27]:
import json
import numpy as np
from NDSparseTensor import NDSparseTensor

## Read a NDSparseMatrix

In [22]:
frequencies_loaded = {}
text_name = 'Pride and Prejudice'
for m in range(1, 10):
    with open(f'frequencies/{text_name} m{m}.json', 'r', encoding="utf8") as f:
        json_text = f.read()

        frequencies_loaded[m] = NDSparseTensor.fromjson(json_text)

with open(f'frequencies/{text_name} unique symbols.json', 'r', encoding="utf-8") as f:
    unique_symbols = json.load(f)

Total number of symbolsequences recorded, $\texttt{total}$

$P(\sigma_m)=\frac{\texttt{frequency of }\sigma_m}{\texttt{total}}$

In [369]:
total = {}
for m in range(1, 10):
    tot = 0
    for key, value in frequencies_loaded[m].getnonzero():
        tot += value
    total[m] = tot
print(total)

{1: 126221, 2: 126220, 3: 126219, 4: 126218, 5: 126217, 6: 126216, 7: 126215, 8: 126214, 9: 126213}


$p(x_m|\sigma_{m-1}) = \frac{p(\sigma_{m-1})}{p(\sigma'_{m-1})}$ with $\sigma_{m} = x_1x_2...x_m$ and $\sigma'_{m} = x_2x_3...x_m$

In [431]:
sequence_length = 100
m = 9
n_unique = len(unique_symbols)

def probability_distribution(m, previous_sequence):
    previous = tuple(previous_sequence)
    f = frequencies_loaded[m][(*previous, slice(None))].todense() / total[m]

    if all([x == 0 for x in f]): 
        f[:] = 1/total[m]
        print('No probabilities')

    return f / sum(f)

def select_symbol(m, previous_sequence=[]):
    if not m-1 == len(previous_sequence):
        raise ValueError(f'previous_sequence must be of length m-1 ({m-1}), given was ({len(previous_sequence)})')
    
    p_distribution = probability_distribution(m, previous_sequence)
    return np.random.choice(n_unique, p=p_distribution)

sequence = []
for i in range(1, sequence_length):
    mc = min(i, m)
    previous_sequence = sequence[-(m-1):] # Last m-1 indicies
    selected = select_symbol(mc, previous_sequence)
    sequence.append(selected)

' '.join([unique_symbols[i] for i in sequence])

'ELIZABETH WAS PLEASED THOUGH WHEN SHE ASKED HERSELF THE REASON SHE HAD VERY LITTLE TO SAY IN REPLY MR GARDINER LEFT THEM SOON AFTER BREAKFAST THE FISHING SCHEME HAD BEEN RENEWED THE DAY BEFORE AND A POSITIVE ENGAGEMENT MADE OF HIS MEETING SOME OF THE GENTLEMEN AT PEMBERLEY BEFORE NOON CHAPTER 45 CONVINCED AS ELIZABETH NOW WAS THAT MISS BINGLEY S DISLIKE OF HER HAD ORIGINATED IN JEALOUSY SHE COULD NOT HELP FEELING HOW UNWELCOME HER APPEARANCE AT PEMBERLEY MUST BE TO HER AND WAS CURIOUS TO KNOW WITH HOW MUCH CIVILITY ON THAT LADY S SIDE THE ACQUAINTANCE'