In [None]:
from sys import getsizeof

import numpy as np
import sparse as sp
from loguru import logger
from numpy.typing import NDArray

from peerannot.models.aggregation.DS import DawidSkene


class DawidSkeneSparse(DawidSkene):
    """Dawid-Skene aggregation whose crowd tensor is stored with ``sparse``.

    The methods below override the initialisation, M-step, E-step and label
    extraction so that they operate on a ``sparse`` COO crowd tensor.
    NOTE(review): ``answers``, ``n_task``, ``n_workers``, ``n_classes`` and
    ``inv_labels`` are read from ``self`` but are not set in this class;
    presumably the ``DawidSkene`` base class provides them -- confirm.
    """

    def _init_crowd_matrix(self) -> None:
        """Transform the dictionary of labels into a sparse boolean tensor.

        The tensor has shape ``(n_task, n_workers, n_classes)`` and holds
        ``True`` at ``[task, worker, label]`` for every ``(worker, label)``
        entry found under ``task`` in ``self.answers``. The result is stored
        in ``self.crowd_matrix`` in COO format.
        """
        # TODO: the DOK -> COO round trip may not be the most efficient way
        #  to build this (usually very sparse) tensor.
        crowd_matrix = sp.DOK(
            (self.n_task, self.n_workers, self.n_classes),
            dtype=bool,
        )

        for task, ans in self.answers.items():
            for worker, label in ans.items():
                crowd_matrix[task, worker, label] = True

        self.crowd_matrix = crowd_matrix.to_coo()
        # ``sys.getsizeof`` is shallow: it would not account for the
        # coordinate/data buffers referenced by the COO object. ``nbytes``
        # reports the bytes used by those buffers.
        logger.debug(
            f"Size of sparse crowd matrix: {self.crowd_matrix.nbytes}",
        )

    def _init_T(self) -> None:
        """Initialise ``self.T`` with normalised per-task vote counts.

        The crowd tensor is summed over the worker axis, giving an
        ``(n_task, n_classes)`` table of vote counts, and every row is then
        divided by its total number of votes.
        """
        # T shape is n_task, n_classes
        T = self.crowd_matrix.sum(axis=1)

        # Per-task vote totals, densified and kept as a column for
        # broadcasting against T.
        tdim = T.sum(1, keepdims=True).todense()
        # Rows whose total is not positive are set to 0.
        # NOTE(review): ``T / tdim`` is evaluated for every row before
        # ``np.where`` applies the mask -- confirm the behaviour for tasks
        # without any vote.
        self.T = np.where(tdim > 0, T / tdim, 0)

    def _m_step(
        self,
    ) -> None:
        """Maximise the log likelihood (eq. 2.3 and 2.4, Dawid and Skene 1979).

        Nothing is returned; two attributes are updated in place:

        * ``self.rho``: column sums of ``self.T`` divided by ``self.n_task``
          (class marginals, i.e. the probability that a task drawn at random
          has true response ``j``).
        * ``self.pi``: tensor indexed ``[worker, true class, answered class]``
          built from ``sum_t T[t, q] * crowd_matrix[t, w, c]`` and normalised
          over the answered-class axis.
        """
        # NOTE: pi could get large; at least its inner 2d matrices should
        # stay sparse.
        self.rho = self.T.sum(axis=0) / self.n_task

        # pi[w, q, c] = sum over tasks t of T[t, q] * crowd_matrix[t, w, c]
        pi = sp.einsum("tq,twc->wqc", self.T, self.crowd_matrix)
        denom = pi.sum(axis=2, keepdims=True)
        # Non-positive denominators are replaced by -1e9 so that the division
        # below never divides by zero.
        self.pi = pi / sp.where(denom <= 0, -1e9, denom)

    def _e_step(self) -> None:
        """Estimate indicator variables (eq. 2.5, Dawid and Skene 1979).

        Updates ``self.T`` and stores the per-task normalisation constant in
        ``self.denom_e_step``.
        """
        # exp_pi[t, w, q, c] = pi[w, q, c] ** crowd_matrix[t, w, c]
        exp_pi = np.power(
            self.pi[np.newaxis, :, :, :],
            self.crowd_matrix[:, :, np.newaxis, :],
        )
        # Numerator: product over the answered-class axis (3), then over the
        # worker axis (1), weighted by the class marginals.
        num = np.prod(exp_pi, axis=3).prod(axis=1) * self.rho[np.newaxis, :]
        self.denom_e_step = num.sum(axis=1, keepdims=True).todense()
        # Rows with a non-positive normaliser keep their unnormalised values.
        self.T = np.where(
            self.denom_e_step > 0,
            num / self.denom_e_step,
            num,
        )

    def get_answers(self) -> NDArray:
        """Get most probable labels.

        Takes the argmax of ``self.T`` along the class axis, densifies it and
        maps every index through ``self.inv_labels.get``.
        """
        return np.vectorize(self.inv_labels.get)(
            sp.argmax(self.T, axis=1).todense(),
        )


In [1]:
#%%
from toy_data import (
    N_CLASSES,
    N_WORKERS,
    votes,
)

# Build the sparse Dawid-Skene aggregator from the toy votes, then call its
# ``run`` method.
dss = DawidSkeneSparse(
    answers=votes,
    n_workers=N_WORKERS,
    n_classes=N_CLASSES,
)
dss.run()

NameError: name 'DawidSkeneSparse' is not defined