recover_cells_and_kzg_proofs & matrix refactor (ethereum#3788)
* Recover cells and proofs & matrix clean up

* Fix table of contents

* Update reference tests generator

* Update test format

* Remove unused imports

* Fix some minor nits

* Rename MatrixEntry's proof to kzg_proof

* Move RowIndex & ColumnIndex to das-core
jtraglia committed Jun 11, 2024
1 parent 5633417 commit 5ace424
Showing 7 changed files with 255 additions and 164 deletions.
78 changes: 51 additions & 27 deletions specs/_features/eip7594/das-core.md
@@ -17,6 +17,7 @@
- [Custody setting](#custody-setting)
- [Containers](#containers)
- [`DataColumnSidecar`](#datacolumnsidecar)
- [`MatrixEntry`](#matrixentry)
- [Helper functions](#helper-functions)
- [`get_custody_columns`](#get_custody_columns)
- [`compute_extended_matrix`](#compute_extended_matrix)
@@ -53,12 +54,10 @@ The following values are (non-configurable) constants used throughout the specif

## Custom types

We define the following Python custom types for type hinting and readability:

| Name | SSZ equivalent | Description |
| - | - | - |
| `DataColumn` | `List[Cell, MAX_BLOB_COMMITMENTS_PER_BLOCK]` | The data of each column in EIP-7594 |
| `ExtendedMatrix` | `List[Cell, MAX_CELLS_IN_EXTENDED_MATRIX]` | The full data of one-dimensional erasure coding extended blobs (in row major format). |
| `RowIndex` | `uint64` | Row identifier in the matrix of cells |
| `ColumnIndex` | `uint64` | Column identifier in the matrix of cells |

## Configuration

@@ -79,7 +78,7 @@

| Name | Value | Description |
| - | - | - |
| `SAMPLES_PER_SLOT` | `8` | Number of `DataColumn` random samples a node queries per slot |
| `SAMPLES_PER_SLOT` | `8` | Number of `DataColumnSidecar` random samples a node queries per slot |
| `CUSTODY_REQUIREMENT` | `1` | Minimum number of subnets an honest node custodies and serves samples from |
| `TARGET_NUMBER_OF_PEERS` | `70` | Suggested minimum peer count |

@@ -90,13 +89,23 @@
```python
class DataColumnSidecar(Container):
    index: ColumnIndex  # Index of column in extended matrix
    column: DataColumn
    column: List[Cell, MAX_BLOB_COMMITMENTS_PER_BLOCK]
    kzg_commitments: List[KZGCommitment, MAX_BLOB_COMMITMENTS_PER_BLOCK]
    kzg_proofs: List[KZGProof, MAX_BLOB_COMMITMENTS_PER_BLOCK]
    signed_block_header: SignedBeaconBlockHeader
    kzg_commitments_inclusion_proof: Vector[Bytes32, KZG_COMMITMENTS_INCLUSION_PROOF_DEPTH]
```

#### `MatrixEntry`

```python
class MatrixEntry(Container):
    cell: Cell
    kzg_proof: KZGProof
    column_index: ColumnIndex
    row_index: RowIndex
```
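For intuition (not part of the spec), the extended matrix is flattened in row-major order, so a `MatrixEntry`'s position in the flat list and its `(row_index, column_index)` pair are interconvertible. A minimal sketch, using an illustrative value for `CELLS_PER_EXT_BLOB` (the real constant is preset-dependent):

```python
CELLS_PER_EXT_BLOB = 128  # illustrative value only; the spec constant is preset-dependent


def flat_index(row_index: int, column_index: int) -> int:
    """Position of an entry in the flattened (row-major) extended matrix."""
    return row_index * CELLS_PER_EXT_BLOB + column_index


def row_and_column(index: int) -> tuple:
    """Inverse mapping: recover (row_index, column_index) from a flat index."""
    return index // CELLS_PER_EXT_BLOB, index % CELLS_PER_EXT_BLOB
```

This is exactly the layout `compute_extended_matrix` produces below: blob `i` occupies flat indices `i * CELLS_PER_EXT_BLOB` through `(i + 1) * CELLS_PER_EXT_BLOB - 1`.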

### Helper functions

#### `get_custody_columns`
@@ -132,37 +141,52 @@ def get_custody_columns(node_id: NodeID, custody_subnet_count: uint64) -> Sequen
#### `compute_extended_matrix`

```python
def compute_extended_matrix(blobs: Sequence[Blob]) -> ExtendedMatrix:
def compute_extended_matrix(blobs: Sequence[Blob]) -> List[MatrixEntry, MAX_CELLS_IN_EXTENDED_MATRIX]:
    """
    Return the full ``ExtendedMatrix``.
    This helper demonstrates the relationship between blobs and ``ExtendedMatrix``.
    The data structure for storing cells is implementation-dependent.
    """
    extended_matrix = []
    for blob in blobs:
        extended_matrix.extend(compute_cells(blob))
    return ExtendedMatrix(extended_matrix)
    for blob_index, blob in enumerate(blobs):
        cells, proofs = compute_cells_and_kzg_proofs(blob)
        for cell_id, (cell, proof) in enumerate(zip(cells, proofs)):
            extended_matrix.append(MatrixEntry(
                cell=cell,
                kzg_proof=proof,
                row_index=blob_index,
                column_index=cell_id,
            ))
    return extended_matrix
```
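A self-contained toy version of the new loop — with strings standing in for blobs, cells, and proofs, and a mock in place of the real `compute_cells_and_kzg_proofs` — shows how entries are appended row by row (all names here are illustrative, not from the spec):

```python
from dataclasses import dataclass
from typing import List, Tuple

CELLS_PER_EXT_BLOB = 4  # toy value for illustration only


@dataclass
class Entry:
    """Stand-in for the spec's MatrixEntry container."""
    cell: str
    kzg_proof: str
    row_index: int
    column_index: int


def mock_compute_cells_and_kzg_proofs(blob: str) -> Tuple[List[str], List[str]]:
    # Stand-in: the real cells and proofs come from the KZG machinery.
    cells = [f"{blob}-cell{i}" for i in range(CELLS_PER_EXT_BLOB)]
    proofs = [f"{blob}-proof{i}" for i in range(CELLS_PER_EXT_BLOB)]
    return cells, proofs


def compute_extended_matrix(blobs: List[str]) -> List[Entry]:
    # Same shape as the spec helper: row-major, one row per blob.
    matrix = []
    for blob_index, blob in enumerate(blobs):
        cells, proofs = mock_compute_cells_and_kzg_proofs(blob)
        for cell_id, (cell, proof) in enumerate(zip(cells, proofs)):
            matrix.append(Entry(cell, proof, blob_index, cell_id))
    return matrix
```

Running it on two toy blobs yields `2 * CELLS_PER_EXT_BLOB` entries, with each entry's flat position equal to `row_index * CELLS_PER_EXT_BLOB + column_index`.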

#### `recover_matrix`

```python
def recover_matrix(cells_dict: Dict[Tuple[BlobIndex, CellID], Cell], blob_count: uint64) -> ExtendedMatrix:
def recover_matrix(partial_matrix: Sequence[MatrixEntry],
                   blob_count: uint64) -> List[MatrixEntry, MAX_CELLS_IN_EXTENDED_MATRIX]:
    """
    Return the recovered ``ExtendedMatrix``.
    Return the recovered extended matrix.
    This helper demonstrates how to apply ``recover_all_cells``.
    This helper demonstrates how to apply ``recover_cells_and_kzg_proofs``.
    The data structure for storing cells is implementation-dependent.
    """
    extended_matrix: List[Cell] = []
    extended_matrix = []
    for blob_index in range(blob_count):
        cell_ids = [cell_id for b_index, cell_id in cells_dict.keys() if b_index == blob_index]
        cells = [cells_dict[(BlobIndex(blob_index), cell_id)] for cell_id in cell_ids]

        all_cells_for_row = recover_all_cells(cell_ids, cells)
        extended_matrix.extend(all_cells_for_row)
    return ExtendedMatrix(extended_matrix)
        cell_ids = [e.column_index for e in partial_matrix if e.row_index == blob_index]
        cells = [e.cell for e in partial_matrix if e.row_index == blob_index]
        proofs = [e.kzg_proof for e in partial_matrix if e.row_index == blob_index]

        recovered_cells, recovered_proofs = recover_cells_and_kzg_proofs(cell_ids, cells, proofs)
        for cell_id, (cell, proof) in enumerate(zip(recovered_cells, recovered_proofs)):
            extended_matrix.append(MatrixEntry(
                cell=cell,
                kzg_proof=proof,
                row_index=blob_index,
                column_index=cell_id,
            ))
    return extended_matrix
```
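The three list comprehensions in the new `recover_matrix` each scan the whole partial matrix once per blob; since the spec says the storage data structure is implementation-dependent, an implementation could instead group entries by row in a single pass. A sketch of that grouping, with a namedtuple standing in for the spec's `MatrixEntry` (names here are illustrative):

```python
from collections import defaultdict, namedtuple

# Minimal stand-in for the spec's MatrixEntry container.
Entry = namedtuple("Entry", ["cell", "kzg_proof", "row_index", "column_index"])


def group_by_row(partial_matrix):
    """Collect each row's (column_index, cell, proof) triples in one pass,
    instead of re-scanning the whole partial matrix once per blob."""
    rows = defaultdict(list)
    for e in partial_matrix:
        rows[e.row_index].append((e.column_index, e.cell, e.kzg_proof))
    return rows
```

Each `rows[blob_index]` then yields the `cell_ids`, `cells`, and `proofs` arguments for one `recover_cells_and_kzg_proofs` call.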

#### `get_data_column_sidecars`
@@ -182,15 +206,15 @@ def get_data_column_sidecars(signed_block: SignedBeaconBlock,
    proofs = [cells_and_proofs[i][1] for i in range(blob_count)]
    sidecars = []
    for column_index in range(NUMBER_OF_COLUMNS):
        column = DataColumn([cells[row_index][column_index]
                             for row_index in range(blob_count)])
        kzg_proof_of_column = [proofs[row_index][column_index]
                               for row_index in range(blob_count)]
        column_cells = [cells[row_index][column_index]
                        for row_index in range(blob_count)]
        column_proofs = [proofs[row_index][column_index]
                         for row_index in range(blob_count)]
        sidecars.append(DataColumnSidecar(
            index=column_index,
            column=column,
            column=column_cells,
            kzg_commitments=block.body.blob_kzg_commitments,
            kzg_proofs=kzg_proof_of_column,
            kzg_proofs=column_proofs,
            signed_block_header=signed_block_header,
            kzg_commitments_inclusion_proof=kzg_commitments_inclusion_proof,
        ))
@@ -283,7 +307,7 @@ Such trailing techniques and their analysis will be valuable for any DAS constru

### Row (blob) custody

In the one-dimension construction, a node samples the peers by requesting the whole `DataColumn`. In reconstruction, a node can reconstruct all the blobs from 50% of the columns. Note that nodes can still download the row via `blob_sidecar_{subnet_id}` subnets.
In the one-dimension construction, a node samples the peers by requesting the whole `DataColumnSidecar`. In reconstruction, a node can reconstruct all the blobs from 50% of the columns. Note that nodes can still download the row via `blob_sidecar_{subnet_id}` subnets.

The potential benefits of having row custody could include:

49 changes: 33 additions & 16 deletions specs/_features/eip7594/polynomial-commitments-sampling.md
@@ -1,4 +1,4 @@
# EIP-7594 -- Polynomial Commitments
# EIP-7594 -- Polynomial Commitments Sampling

## Table of contents

@@ -46,7 +46,7 @@
- [`construct_vanishing_polynomial`](#construct_vanishing_polynomial)
- [`recover_shifted_data`](#recover_shifted_data)
- [`recover_original_data`](#recover_original_data)
- [`recover_all_cells`](#recover_all_cells)
- [`recover_cells_and_kzg_proofs`](#recover_cells_and_kzg_proofs)

<!-- END doctoc generated TOC please keep comment here to allow auto update -->
<!-- /TOC -->
@@ -67,9 +67,7 @@ Public functions MUST accept raw bytes as input and perform the required cryptog
| `Coset` | `Vector[BLSFieldElement, FIELD_ELEMENTS_PER_CELL]` | The evaluation domain of a cell |
| `CosetEvals` | `Vector[BLSFieldElement, FIELD_ELEMENTS_PER_CELL]` | The internal representation of a cell (the evaluations over its Coset) |
| `Cell` | `ByteVector[BYTES_PER_FIELD_ELEMENT * FIELD_ELEMENTS_PER_CELL]` | The unit of blob data that can come with its own KZG proof |
| `CellID` | `uint64` | Cell identifier |
| `RowIndex` | `uint64` | Row identifier |
| `ColumnIndex` | `uint64` | Column identifier |
| `CellID` | `uint64` | Validation: `x < CELLS_PER_EXT_BLOB` |

## Constants

@@ -660,32 +658,39 @@ def recover_original_data(eval_shifted_extended_evaluation: Sequence[BLSFieldEle
    return reconstructed_data
```

### `recover_all_cells`
### `recover_cells_and_kzg_proofs`

```python
def recover_all_cells(cell_ids: Sequence[CellID], cells: Sequence[Cell]) -> Sequence[Cell]:
def recover_cells_and_kzg_proofs(cell_ids: Sequence[CellID],
                                 cells: Sequence[Cell],
                                 proofs_bytes: Sequence[Bytes48]) -> Tuple[
        Vector[Cell, CELLS_PER_EXT_BLOB],
        Vector[KZGProof, CELLS_PER_EXT_BLOB]]:
    """
    Recover all of the cells in the extended blob from FIELD_ELEMENTS_PER_EXT_BLOB evaluations,
    half of which can be missing.
    This algorithm uses FFTs to recover cells faster than using Lagrange implementation, as can be seen here:
    Given at least 50% of cells/proofs for a blob, recover all the cells/proofs.
    This algorithm uses FFTs to recover cells faster than using Lagrange
    implementation, as can be seen here:
    https://ethresear.ch/t/reed-solomon-erasure-code-recovery-in-n-log-2-n-time-with-ffts/3039
    A faster version thanks to Qi Zhou can be found here:
    https://github.com/ethereum/research/blob/51b530a53bd4147d123ab3e390a9d08605c2cdb8/polynomial_reconstruction/polynomial_reconstruction_danksharding.py
    Public method.
    """
    assert len(cell_ids) == len(cells)
    assert len(cell_ids) == len(cells) == len(proofs_bytes)
    # Check we have enough cells to be able to perform the reconstruction
    assert CELLS_PER_EXT_BLOB / 2 <= len(cell_ids) <= CELLS_PER_EXT_BLOB
    # Check for duplicates
    assert len(cell_ids) == len(set(cell_ids))
    # Check that each cell is the correct length
    for cell in cells:
        assert len(cell) == BYTES_PER_CELL
    # Check that the cell ids are within bounds
    for cell_id in cell_ids:
        assert cell_id < CELLS_PER_EXT_BLOB
    # Check that each cell is the correct length
    for cell in cells:
        assert len(cell) == BYTES_PER_CELL
    # Check that each proof is the correct length
    for proof_bytes in proofs_bytes:
        assert len(proof_bytes) == BYTES_PER_PROOF

    # Get the extended domain
    roots_of_unity_extended = compute_roots_of_unity(FIELD_ELEMENTS_PER_EXT_BLOB)
@@ -716,9 +721,21 @@ def recover_all_cells(cell_ids: Sequence[CellID], cells: Sequence[Cell]) -> Sequ
        end = (cell_id + 1) * FIELD_ELEMENTS_PER_CELL
        assert reconstructed_data[start:end] == coset_evals

    reconstructed_data_as_cells = [
    recovered_cells = [
        coset_evals_to_cell(reconstructed_data[i * FIELD_ELEMENTS_PER_CELL:(i + 1) * FIELD_ELEMENTS_PER_CELL])
        for i in range(CELLS_PER_EXT_BLOB)]

    polynomial_eval = reconstructed_data[:FIELD_ELEMENTS_PER_BLOB]
    polynomial_coeff = polynomial_eval_to_coeff(polynomial_eval)
    recovered_proofs = [None] * CELLS_PER_EXT_BLOB
    for i, cell_id in enumerate(cell_ids):
        recovered_proofs[cell_id] = bytes_to_kzg_proof(proofs_bytes[i])
    for i in range(CELLS_PER_EXT_BLOB):
        if recovered_proofs[i] is None:
            coset = coset_for_cell(CellID(i))
            proof, ys = compute_kzg_proof_multi_impl(polynomial_coeff, coset)
            assert coset_evals_to_cell(ys) == recovered_cells[i]
            recovered_proofs[i] = proof

    return reconstructed_data_as_cells
    return recovered_cells, recovered_proofs
```
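The proof-recovery bookkeeping at the end of the new function — slot the known proofs into place by cell ID, then recompute only the gaps — can be illustrated in isolation. A toy sketch with strings standing in for real KZG proofs and an illustrative `CELLS_PER_EXT_BLOB` of 8 (not the spec value):

```python
CELLS_PER_EXT_BLOB = 8  # toy value for illustration only


def place_known_proofs(cell_ids, proofs):
    """Slot known proofs into their positions; return the slot list plus the
    IDs of the cells whose proofs still need to be recomputed."""
    recovered = [None] * CELLS_PER_EXT_BLOB
    for i, cell_id in enumerate(cell_ids):
        recovered[cell_id] = proofs[i]
    missing = [i for i in range(CELLS_PER_EXT_BLOB) if recovered[i] is None]
    return recovered, missing
```

In the spec function, each index in `missing` is the `CellID` passed to `coset_for_cell` and `compute_kzg_proof_multi_impl`; the known proofs are reused as-is rather than recomputed.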
51 changes: 24 additions & 27 deletions tests/core/pyspec/eth2spec/test/eip7594/unittests/das/test_das.py
@@ -9,6 +9,11 @@
)


def chunks(lst, n):
    """Helper that splits a list into chunks of size ``n``."""
    return [lst[i:i + n] for i in range(0, len(lst), n)]
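For reference, the helper's behavior when the list length is not a multiple of `n` (the function is restated here so the snippet runs standalone):

```python
def chunks(lst, n):
    """Split lst into successive chunks of size n (the last may be shorter)."""
    return [lst[i:i + n] for i in range(0, len(lst), n)]


# The last chunk is shorter when len(lst) is not a multiple of n:
assert chunks([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]
```

In the tests below it is always called with `n = spec.CELLS_PER_EXT_BLOB` on a matrix whose length is an exact multiple, so every chunk is one full blob row.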


@with_eip7594_and_later
@spec_test
@single_phase
@@ -20,15 +25,15 @@ def test_compute_extended_matrix(spec):
    extended_matrix = spec.compute_extended_matrix(input_blobs)
    assert len(extended_matrix) == spec.CELLS_PER_EXT_BLOB * blob_count

    rows = [extended_matrix[i:(i + spec.CELLS_PER_EXT_BLOB)]
            for i in range(0, len(extended_matrix), spec.CELLS_PER_EXT_BLOB)]
    rows = chunks(extended_matrix, spec.CELLS_PER_EXT_BLOB)
    assert len(rows) == blob_count
    assert len(rows[0]) == spec.CELLS_PER_EXT_BLOB
    for row in rows:
        assert len(row) == spec.CELLS_PER_EXT_BLOB

    for blob_index, row in enumerate(rows):
        extended_blob = []
        for cell in row:
            extended_blob.extend(spec.cell_to_coset_evals(cell))
        for entry in row:
            extended_blob.extend(spec.cell_to_coset_evals(entry.cell))
        blob_part = extended_blob[0:len(extended_blob) // 2]
        blob = b''.join([spec.bls_field_to_bytes(x) for x in blob_part])
        assert blob == input_blobs[blob_index]
@@ -43,27 +48,19 @@ def test_recover_matrix(spec):
    # Number of samples we will be recovering from
    N_SAMPLES = spec.CELLS_PER_EXT_BLOB // 2

    # Compute an extended matrix with two blobs
    blob_count = 2
    cells_dict = {}
    original_cells = []
    for blob_index in range(blob_count):
        # Get the data we will be working with
        blob = get_sample_blob(spec, rng=rng)
        # Extend data with Reed-Solomon and split the extended data in cells
        cells = spec.compute_cells(blob)
        original_cells.append(cells)
        cell_ids = []
        # First figure out just the indices of the cells
        for _ in range(N_SAMPLES):
            cell_id = rng.randint(0, spec.CELLS_PER_EXT_BLOB - 1)
            while cell_id in cell_ids:
                cell_id = rng.randint(0, spec.CELLS_PER_EXT_BLOB - 1)
            cell_ids.append(cell_id)
            cell = cells[cell_id]
            cells_dict[(blob_index, cell_id)] = cell
        assert len(cell_ids) == N_SAMPLES
    blobs = [get_sample_blob(spec, rng=rng) for _ in range(blob_count)]
    extended_matrix = spec.compute_extended_matrix(blobs)

    # Construct a matrix with some entries missing
    partial_matrix = []
    for blob_entries in chunks(extended_matrix, spec.CELLS_PER_EXT_BLOB):
        rng.shuffle(blob_entries)
        partial_matrix.extend(blob_entries[:N_SAMPLES])

    # Given the partial matrix, recover the missing entries
    recovered_matrix = spec.recover_matrix(partial_matrix, blob_count)

    # Recover the matrix
    recovered_matrix = spec.recover_matrix(cells_dict, blob_count)
    flatten_original_cells = [cell for cells in original_cells for cell in cells]
    assert recovered_matrix == flatten_original_cells
    # Ensure that the recovered matrix matches the original matrix
    assert recovered_matrix == extended_matrix
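The shuffle-then-take-a-prefix pattern in the new test is a standard way to draw distinct indices without the rejection loop the old test used. A minimal standalone illustration (function name is illustrative, not from the test suite):

```python
import random


def sample_distinct(population, n, rng):
    """Draw n distinct items by shuffling a copy and taking a prefix."""
    items = list(population)
    rng.shuffle(items)
    return items[:n]
```

Unlike rejection sampling, this runs in time linear in the population size regardless of how large `n` is relative to the population.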
@@ -64,7 +64,7 @@ def test_verify_cell_kzg_proof_batch(spec):
@with_eip7594_and_later
@spec_test
@single_phase
def test_recover_all_cells(spec):
def test_recover_cells_and_kzg_proofs(spec):
    rng = random.Random(5566)

    # Number of samples we will be recovering from
@@ -74,7 +74,7 @@ def test_recover_all_cells(spec):
    blob = get_sample_blob(spec)

    # Extend data with Reed-Solomon and split the extended data in cells
    cells = spec.compute_cells(blob)
    cells, proofs = spec.compute_cells_and_kzg_proofs(blob)

    # Compute the cells we will be recovering from
    cell_ids = []
@@ -84,19 +84,21 @@
        while j in cell_ids:
            j = rng.randint(0, spec.CELLS_PER_EXT_BLOB - 1)
        cell_ids.append(j)
    # Now the cells themselves
    # Now the cells/proofs themselves
    known_cells = [cells[cell_id] for cell_id in cell_ids]
    known_proofs = [proofs[cell_id] for cell_id in cell_ids]

    # Recover all of the cells
    recovered_cells = spec.recover_all_cells(cell_ids, known_cells)
    # Recover the missing cells and proofs
    recovered_cells, recovered_proofs = spec.recover_cells_and_kzg_proofs(cell_ids, known_cells, known_proofs)
    recovered_data = [x for xs in recovered_cells for x in xs]

    # Check that the original data match the non-extended portion of the recovered data
    blob_byte_array = [b for b in blob]
    assert blob_byte_array == recovered_data[:len(recovered_data) // 2]

    # Check that the recovered cells match the original cells
    # Check that the recovered cells/proofs match the original cells/proofs
    assert cells == recovered_cells
    assert proofs == recovered_proofs


@with_eip7594_and_later
23 changes: 0 additions & 23 deletions tests/formats/kzg_7594/recover_all_cells.md

This file was deleted.
