In [9]:
import numpy as np
import pandas as pd 

class DivergenceScore:
    """Divergence score for probabilistic forecasts of a binary event.

    The score can be evaluated directly from a paired series of observations
    and forecast probabilities, or from a pre-binned table, and is decomposed
    here as ``DS = UNC - RES + REL`` (see ``compute_ds_from_components``).

    Attributes set by the constructor:
        o, f  : the series passed in (``None`` when built from a table)
        df    : table with rows "pk" (forecast probability of the bin),
                "ok" (sum of observations in the bin) and "nk" (number of
                forecasts in the bin); one column per bin
        ks    : column labels of ``df``
        N     : total number of forecasts
        eps   : small constant guarding the logarithms
        base  : logarithm base (2, 10 or "e"); ``log`` is the matching function
    """

    def __init__(self,o=None,f=None,base=2,df=None,eps=None,N=None):
        """
        o, f    :   1-D series of observations and forecasts
        base    :   logarithm base, one of 2, 10 or "e"
        df      :   pre-binned table (rows "pk", "ok", "nk"); used only when
                    `o` is None
        eps     :   guard constant for the logarithms (default: machine epsilon)
        N       :   total number of forecasts behind `df`; when omitted it is
                    taken as the sum of the "nk" row

        Raises ValueError when neither input form is supplied, when `f` is
        missing, or when `o` and `f` are empty or differ in length.
        """
        if o is not None:
            if f is None:
                raise ValueError("Forecasts `f` must be given together with observations `o`.")
            if np.size(o) == 0 or np.size(o) != np.size(f):
                raise ValueError("`o` and `f` must be non-empty and of equal length.")
            self.o = o
            self.f = f
            self.pks = sorted(np.unique(self.f))
            self.df = self.do_binning()
        elif df is not None:
            # No time series available; compute_ds() checks for this explicitly.
            self.o = None
            self.f = None
            self.df = df
            self.N = N if N is not None else np.sum(self.df.loc["nk"])
            self.pks = np.array([self.df.loc["pk"]])
        else:
            raise ValueError("Provide either `o` and `f`, or a binned table `df`.")

        if not self.N > 0:
            raise ValueError("The total number of forecasts N must be positive.")

        self.eps = self._determine_eps(eps)
        self.base = base
        self.log = self._determine_log(self.base)
        self.ks = self.df.columns

    @staticmethod
    def _determine_log(base):
        """Return the numpy logarithm for `base` (2, 10 or "e"); ValueError otherwise."""
        if base == 2:
            return np.log2
        elif base == "e":
            return np.log
        elif base == 10:
            return np.log10
        else:
            raise ValueError("Choose log base from 2, 10, or e")

    def do_binning(self):
        """Group the time series by unique forecast probability.

        Returns a DataFrame with rows "pk", "ok", "nk" and integer column
        labels 1..K (one per distinct value of `f`, in ascending order).
        Also sets ``self.N`` to the length of the series.
        """
        obs = np.asarray(self.o, dtype=float).ravel()
        fcst = np.asarray(self.f, dtype=float).ravel()

        # `inverse` maps every forecast to the index of its bin in `pk`.
        pk, inverse = np.unique(fcst, return_inverse=True)
        inverse = np.ravel(inverse)
        nk = np.bincount(inverse, minlength=len(pk)).astype(float)
        ok = np.bincount(inverse, weights=obs, minlength=len(pk))

        df = pd.DataFrame(
            np.vstack([pk, ok, nk]),
            index=["pk", "ok", "nk"],
            columns=np.arange(1, len(pk) + 1),
        )
        self.N = len(obs)
        return df

    @classmethod
    def compute_dkl(cls,q,p,base,eps=None):
        """Divergence between Bernoulli distributions with parameters q and p.

        Computes ``q*log(q/(p+eps)) + (1-q)*log((1-q)/(1-p+eps))`` element-wise,
        using the convention 0*log(0) = 0 so that q = 0 and q = 1 (e.g. binary
        observations) give finite values instead of NaN.

        Returns a numpy scalar for scalar input, otherwise a numpy array.
        """
        eps = cls._determine_eps(eps)
        log = cls._determine_log(base)
        q = np.asarray(q, dtype=float)
        p = np.asarray(p, dtype=float)
        # np.where evaluates both branches, so silence the log(0) warnings
        # produced by the entries that are masked out anyway.
        with np.errstate(divide="ignore", invalid="ignore"):
            event_term = np.where(q != 0, q*log(q/(p+eps)), 0.0)
            nonevent_term = np.where(q != 1, (1-q)*log((1-q)/(1-p+eps)), 0.0)
        dkl = event_term + nonevent_term
        return dkl[()] if np.ndim(dkl) == 0 else dkl

    @staticmethod
    def _determine_eps(eps):
        """Return `eps`, defaulting to machine epsilon; ValueError if it is smaller than that."""
        machine_eps = np.finfo(float).eps
        if eps is None:
            return machine_eps
        if eps < machine_eps:
            raise ValueError("eps must not be smaller than machine epsilon")
        return eps

    @classmethod
    def compute_ds_from_components(cls,rel=None,res=None,unc=None):
        """Combine the components as ``unc - res + rel``.

        All three must be supplied; a component equal to 0 is valid.
        Raises ValueError when any component is None.
        """
        if rel is None or res is None or unc is None:
            raise ValueError("rel, res and unc must all be provided")
        return unc - res + rel

    def compute_ds(self,from_components=False,return_all=False):
        """Return the divergence score.

        from_components : combine reliability, resolution and uncertainty
                          computed from the binned table
        return_all      : with from_components=False, return the per-forecast
                          scores instead of their mean; ignored otherwise

        Raises ValueError when the per-forecast path is requested but the
        instance was built from a table rather than a time series.
        """
        if from_components:
            rel = self.compute_rel()
            res = self.compute_res()
            unc = self.compute_unc()
            if return_all:
                print("Note that return_all is ignored outputting components")
            return self.compute_ds_from_components(rel=rel,res=res,unc=unc)

        if getattr(self, "o", None) is None or getattr(self, "f", None) is None:
            raise ValueError("You can only ask for DS computed from a time series.")
        ds_series = self.compute_dkl(self.o,self.f,self.base,eps=self.eps)
        if return_all:
            return ds_series
        return np.mean(ds_series)

    def compute_rel(self):
        """Reliability: nk-weighted mean divergence of each bin's observed frequency from its pk."""
        total_dkl = 0
        for k in self.ks:
            nk = self.df.loc["nk",k]
            if nk == 0:
                # An empty bin has zero weight and no defined frequency.
                continue
            ok_bar = self.df.loc["ok",k]/nk
            dkl = self.compute_dkl(ok_bar,self.df.loc["pk",k],self.base,eps=self.eps)
            total_dkl += dkl * nk
        return total_dkl/self.N

    def compute_res(self):
        """Resolution: nk-weighted mean divergence of each bin's observed frequency from the base rate."""
        o_bar = np.sum(self.df.loc["ok"])/self.N
        total_dkl = 0
        for k in self.ks:
            nk = self.df.loc["nk",k]
            if nk == 0:
                continue
            ok_bar = self.df.loc["ok",k]/nk
            dkl = self.compute_dkl(ok_bar,o_bar,base=self.base,eps=self.eps)
            total_dkl += dkl * nk
        return total_dkl/self.N

    def compute_unc(self):
        """Uncertainty: entropy of the observed base rate.

        The base rate is nudged by eps away from 0 and capped at 1 - eps so
        that neither logarithm receives a non-positive argument.
        """
        o_bar = min(self.eps + np.sum(self.df.loc["ok"])/self.N, 1 - self.eps)
        unc = -(o_bar * self.log(o_bar) + (1-o_bar) * self.log(1-o_bar))
        return unc

        
        

In [10]:
# Worked example: score a pre-binned table of 11 forecast-probability bins.
N = 346
pks = [0.05, *np.arange(0.1, 0.95, 0.1), 0.95]
ks = np.arange(1, len(pks) + 1)
ks_str = [format(k, "d") for k in ks]

# Start from an empty table (one column per bin) and fill it row by row.
df = pd.DataFrame(index=["pk", "ok", "nk"], columns=ks)
bin_rows = {
    "pk": pks,
    "ok": [1, 1, 5, 5, 4, 8, 6, 16, 16, 8, 11],
    "nk": [46, 55, 59, 41, 19, 22, 22, 34, 24, 11, 13],
}
for row_label, row_values in bin_rows.items():
    df.loc[row_label] = row_values

# Natural-log score: the three components, then their combination.
ds = DivergenceScore(df=df, N=N, base="e")
rel = ds.compute_rel()
res = ds.compute_res()
unc = ds.compute_unc()
ds_comp = ds.compute_ds(from_components=True)
print(f"{rel=}\n{res=}\n{unc=}\n{ds_comp=}")

rel=0.0712252672993584
res=0.16834409203691228
unc=0.5441879501594111
ds_comp=0.4470691254218573


In [11]:
# Spot-check a single entry of the table: row "ok", column label 2.
df.loc["ok",2]

1

In [12]:
# Display the binned table held on the `ds` instance.
ds.df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11
pk,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95
ok,1.0,1.0,5.0,5.0,4.0,8.0,6.0,16.0,16.0,8.0,11.0
nk,46.0,55.0,59.0,41.0,19.0,22.0,22.0,34.0,24.0,11.0,13.0


In [13]:
# TODO:
# - Convert a time series of (f, o) pairs into the binned table format above.
# - Apply a sigmoid-style squashing to forecasts so exact 0s and 1s are removed.
# - Allow binning of forecast probabilities into user-specified bins.
# - Add visualisations: performance diagram, reliability diagram, etc.
# - Build a 2x2 contingency table from a probability threshold so that
#   deterministic statistics can be computed.
# - Finish the Brier Score class.
# - Move the code into a script.

In [14]:
class BrierScore(DivergenceScore):
    """Brier score and its decomposition, computed from the inherited bin table.

    Uses the table ``self.df`` (rows "pk", "ok", "nk") and ``self.N`` set up by
    the parent constructor, which is inherited unchanged. The components
    follow the usual Brier decomposition ``BS = REL - RES + UNC``.
    """

    def _bin_statistics(self):
        """Return (pk, ok_bar, nk, N, o_bar) as floats / float arrays."""
        pk = np.asarray(self.df.loc["pk"], dtype=float)
        ok = np.asarray(self.df.loc["ok"], dtype=float)
        nk = np.asarray(self.df.loc["nk"], dtype=float)
        total = self.N if self.N is not None else nk.sum()
        # Empty bins have zero weight; divide by 1 there to avoid 0/0.
        ok_bar = ok / np.where(nk > 0, nk, 1.0)
        o_bar = ok.sum() / total
        return pk, ok_bar, nk, total, o_bar

    def compute_RES(self):
        """Resolution: nk-weighted mean squared distance of bin frequencies from the base rate."""
        _, ok_bar, nk, total, o_bar = self._bin_statistics()
        return np.sum(nk * (ok_bar - o_bar) ** 2) / total

    def compute_REL(self):
        """Reliability: nk-weighted mean squared distance of bin frequencies from pk."""
        pk, ok_bar, nk, total, _ = self._bin_statistics()
        return np.sum(nk * (pk - ok_bar) ** 2) / total

    def compute_UNC(self):
        """Uncertainty: variance of the binary observations, o_bar * (1 - o_bar)."""
        o_bar = self._bin_statistics()[4]
        return o_bar * (1 - o_bar)

    def compute_SS(self):
        """Skill score relative to the base-rate forecast: (RES - REL) / UNC.

        Returns NaN when UNC is zero (every observation identical), where the
        skill score is undefined.
        """
        unc = self.compute_UNC()
        if unc == 0:
            return np.nan
        return (self.compute_RES() - self.compute_REL()) / unc

    def compute_BS(self,with_components):
        """Return the Brier score.

        with_components : if true, combine the binned components as
                          REL - RES + UNC; otherwise average (f - o)**2 over
                          the time series.

        Raises ValueError when the time-series form is requested but the
        instance holds no `o`/`f` series.
        """
        if with_components:
            return self.compute_REL() - self.compute_RES() + self.compute_UNC()
        obs = getattr(self, "o", None)
        fcst = getattr(self, "f", None)
        if obs is None or fcst is None:
            raise ValueError("You can only ask for BS computed from a time series.")
        diff = np.asarray(fcst, dtype=float) - np.asarray(obs, dtype=float)
        return np.mean(diff ** 2)
        

In [15]:
class BregmanViz:
    """Container for Brier-score (bs_*) and divergence-score (ds_*) results to be plotted.

    Each argument is stored unchanged on the instance under the same name.
    All of them default to None, meaning "not supplied".
    """

    def __init__(self,bs_rel=None,bs_res=None,bs_unc=None,bs=None,
                    ds_rel=None,ds_res=None,ds_unc=None,ds=None):
        """
        bs_rel, bs_res, bs_unc, bs  :   Brier-score components and total
        ds_rel, ds_res, ds_unc, ds  :   divergence-score components and total
        """
        # bs_res / bs_unc previously defaulted to False, unlike their six
        # siblings; None is used throughout so "missing" has one spelling.
        self.bs_rel = bs_rel
        self.bs_res = bs_res
        self.bs_unc = bs_unc
        self.bs = bs

        self.ds_rel = ds_rel
        self.ds_res = ds_res
        self.ds_unc = ds_unc
        self.ds = ds

    def viz(self):
        """Placeholder for the visualisation; currently does nothing and returns None."""
        return