# Dawid-Skene Algorithm using the shared diagonal of the confucion matrix



In [40]:

from types import MethodType

import numpy as np

# load data
from toy_data import N_CLASSES, N_WORKERS, votes

from peerannot.models import DawidSkene

In [None]:
class PooledDawidSkeneDiagonal(DawidSkene):
    """
    =============================
    Dawid and Skene model (1979)
    =============================

    Assumptions:
    - independent workers

    Using:
    - EM algorithm

    Estimating:
    - One diagonal matrix that is the same for each worker
    """

    def _m_step(self) -> None:
        """Maximizing log likelihood with only diagonal elements of pi."""
        self.rho = self.T.sum(0) / self.n_task

        diag_votes = np.einsum("tq, tiq -> q", self.T, self.crowd_matrix)
        denom = np.einsum("tq, tij -> q", self.T, self.crowd_matrix)

        self.pi = diag_votes/denom
        self.pi_non_diag_values = (np.ones_like(self.pi) -self.pi) / (self.n_classes - 1)


    def _e_step(self) -> None:
        """Estimate indicator variables (see eq. 2.5 Dawid and Skene 1979)

        Returns:
            T: New estimate for indicator variables (n_task, n_worker)
            denom: value used to compute likelihood easily
        """
        T = np.zeros((self.n_task, self.n_classes))

        for i in range(self.n_task):
            for j in range(self.n_classes):
                worker_labels = self.crowd_matrix[i]
                diag_contrib = np.prod(np.power(self.pi, worker_labels))
                mask = np.ones(self.n_classes, dtype=bool)
                mask[j] = False
                off_diag_contrib = np.prod(np.power(self.pi_non_diag_values[mask], worker_labels[:, mask]))

                T[i,j] = diag_contrib * off_diag_contrib * self.rho[j]

        self.denom_e_step = T.sum(1, keepdims=True)
        self.T = np.where(self.denom_e_step > 0, T/ self.denom_e_step, T)


In [42]:
pdsd = PooledDawidSkeneDiagonal(
    answers=votes, n_workers=N_WORKERS, n_classes=N_CLASSES,
)

pdsd.run(maxiter=100)
pdsd.get_answers()

[32m2025-03-28 15:32:58.247[0m | [34m[1mDEBUG   [0m | [36mpeerannot.models.aggregation.DS[0m:[36minit_crowd_matrix[0m:[36m105[0m - [34m[1mDense crowd matrix  5904[0m
[32m2025-03-28 15:32:58.248[0m | [34m[1mDEBUG   [0m | [36mpeerannot.models.aggregation.DS[0m:[36m__init__[0m:[36m83[0m - [34m[1mDense Crowd matrix5904[0m
[32m2025-03-28 15:32:58.248[0m | [34m[1mDEBUG   [0m | [36mpeerannot.models.aggregation.DS[0m:[36minit_T[0m:[36m112[0m - [34m[1mSize of T before calc: 1568[0m
[32m2025-03-28 15:32:58.249[0m | [34m[1mDEBUG   [0m | [36mpeerannot.models.aggregation.DS[0m:[36minit_T[0m:[36m116[0m - [34m[1mSize of T: 1568[0m


[A[A

Finished:  12%|█▏        | 12/100 [00:00<00:00, 310.62it/s]


array([2, 2, 8, 1, 4, 0, 3, 3, 7, 1, 0, 3, 7, 3, 4, 7, 4, 7, 3, 7])