# Entropy Features

### Loading Libraries

In [1]:
# Randomness
import random

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
from pandas import Timestamp

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.io as pio
%matplotlib inline

# Date & Time
from datetime import datetime, timedelta

# Typing
from typing import Tuple, List, Dict, Union, Optional, Any, Generator

# Scikit-Learn
from sklearn.pipeline import Pipeline
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection._split import _BaseKFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss

# Scientific Statistical Python
import scipy.cluster.hierarchy as sch
from scipy.stats import jarque_bera
from scipy.stats import rv_continuous, kstest, norm

#### Probability Mass Function

In [2]:
def pmf1(msg: Any, w: int) -> dict:
    """
    Estimate the probability mass function of the length-``w`` words in a message.

    A window of ``w`` symbols slides over the message one symbol at a time and
    the relative frequency of every distinct word is returned.

    Args:
        msg: The message. Anything that is not a ``str`` is converted by
            joining ``str(element)`` for each of its elements.
        w: Word (window) length.

    Returns:
        Mapping ``word -> relative frequency``, in order of first occurrence.
        The dictionary is empty when no window is evaluated.

    Note:
        The window end runs over ``range(w, len(msg))``, so the final word
        ``msg[len(msg) - w:]`` is not counted; the normalising constant
        ``len(msg) - w`` equals the number of windows that are counted.
    """
    if not isinstance(msg, str):
        msg = ''.join(map(str, msg))

    # Only the number of occurrences matters, so keep a counter per word
    # instead of rebuilding a list of positions on every hit.
    counts = {}
    for end in range(w, len(msg)):
        word = msg[end - w: end]
        counts[word] = counts.get(word, 0) + 1

    total = float(len(msg) - w)
    return {word: count / total for word, count in counts.items()}

#### Maximum Likelihood Estimate of the Entropy Rate

In [3]:
def plug_in(msg: Any, w: int) -> Tuple[float, dict]:
    """
    Compute the plug-in entropy rate of a message for words of length ``w``.

    The word probabilities come from ``pmf1(msg, w)``; the base-2 entropy of
    that distribution is divided by ``w``.

    Args:
        msg: The message, passed unchanged to ``pmf1``.
        w: Word length used for the probability mass function and as the
            divisor of the entropy.

    Returns:
        A tuple ``(entropy_rate, pmf)`` where ``pmf`` is the dictionary
        returned by ``pmf1``.
    """
    pmf = pmf1(msg, w)

    # Accumulate sum(p * log2(p)) in dictionary order, then flip the sign.
    weighted_logs = 0
    for prob in pmf.values():
        weighted_logs += prob * np.log2(prob)

    return -weighted_logs / w, pmf

#### Lempel-Ziv Algorithm

In [4]:
def lempel_ziv_lib(msg: str) -> list:
    """
    Build the library of non-redundant substrings of a message.

    The first symbol seeds the library. From each subsequent start position
    the candidate substring is extended one symbol at a time until it is not
    yet in the library; it is then added and the scan restarts right after it.

    Args:
        msg: The message to decompose.

    Returns:
        The library, in order of discovery. An empty message yields an empty
        library. If the tail of the message is already in the library it is
        not added again.
    """
    size = len(msg)
    if size == 0:
        # Nothing to seed the library with; msg[0] would raise IndexError.
        return []

    lib = [msg[0]]
    start = 1
    while start < size:
        # `end` is exclusive; the range is never empty because start < size.
        for end in range(start + 1, size + 1):
            phrase = msg[start:end]
            if phrase not in lib:
                lib.append(phrase)
                break
        # Resume after the phrase just added (or at the end of the message
        # when every extension was already known).
        start = end
    return lib

#### Length of the Longest Match

In [5]:
def match_length(msg: str, i: int, n: int) -> Tuple[int, str]:
    """
    Find the longest substring starting at ``i`` that also starts in the
    ``n`` positions before ``i``.

    Candidate substrings ``msg[i:i + size]`` for ``size = 1 .. n`` are compared
    with the equally long slices starting at each position of the look-back
    window ``[i - n, i)``. Matches may overlap position ``i``.

    Args:
        msg: The message to search.
        i: Position at which the substring to be matched starts.
        n: Size of the look-back window and maximum match length.

    Returns:
        A tuple ``(length + 1, substring)`` where ``substring`` is the longest
        match found (``''`` when there is none).
    """
    # Clamp the window to the start of the message: a negative start index
    # would wrap around and compare against symbols at the END of the message.
    window_start = max(i - n, 0)

    longest = ''
    for size in range(1, n + 1):
        candidate = msg[i: i + size]
        found = any(
            msg[j: j + size] == candidate for j in range(window_start, i)
        )
        if not found:
            # A longer candidate can only match where this shorter prefix
            # matched, so no further length can succeed.
            break
        longest = candidate
    return len(longest) + 1, longest

#### Kontoyiannis LZ Estimates

In [6]:
def konto(msg: Any, window: Optional[int] = None) -> dict:
    """
    Entropy estimate built from longest-match lengths along a message.

    For every evaluated position the longest match against the preceding
    symbols is obtained from ``match_length`` and ``log2(lookback + 1) / length``
    is accumulated, where ``lookback`` is the position itself (expanding
    window, ``window is None``) or the fixed ``window``.

    Args:
        msg: The message. Anything that is not a ``str`` is converted by
            joining ``str(element)`` for each of its elements.
        window: Fixed look-back size, capped at ``len(msg) // 2``. ``None``
            selects the expanding window over positions
            ``1 .. len(msg) // 2``.

    Returns:
        A dictionary with keys:
            ``'num'``  - number of evaluated positions,
            ``'sum'``  - accumulated ``log2(lookback + 1) / length`` terms,
            ``'subS'`` - matched substring for each position,
            ``'h'``    - ``sum / num``,
            ``'r'``    - ``1 - h / log2(len(msg))``.

    Raises:
        ZeroDivisionError: If no position is evaluated, since ``'num'`` is
            then 0 when ``'h'`` is computed.
    """
    text = msg if isinstance(msg, str) else ''.join(map(str, msg))
    size = len(text)

    expanding = window is None
    if expanding:
        positions = range(1, size // 2 + 1)
    else:
        window = min(window, size // 2)
        positions = range(window, size - window + 1)

    total = 0
    matches = []
    for pos in positions:
        # Expanding mode looks back over everything before `pos`.
        lookback = pos if expanding else window
        length, matched = match_length(text, pos, lookback)
        total += np.log2(lookback + 1) / length
        matches.append(matched)

    count = len(matches)
    out = {'num': count, 'sum': total, 'subS': matches}
    out['h'] = total / count
    out['r'] = 1 - out['h'] / np.log2(size)
    return out