In [1]:
import contextlib
from csv import DictReader
from functools import cached_property
from typing import Iterable, Callable


class logger:  # noqa
    """Minimal stand-in for a real logger: every message goes straight to stdout."""

    @staticmethod
    def log(*args, **kwargs):
        """Forward all positional and keyword arguments to the built-in ``print``."""
        print(*args, **kwargs)

In [2]:
import doctest
from abc import ABC, abstractmethod
from collections import defaultdict


class AbstractSerialMapper(ABC):
    """
    Base class for mappers that keep their entries in a list.

    An entry's ID is its position in the internal ``_data`` list. Subclasses
    decide how values are stored by implementing ``add``.
    """

    def __init__(self):
        self._data = []

    @abstractmethod
    def add(self, value, *args, **kwargs):
        """Store ``value``; concrete subclasses define the storage layout."""

    def __len__(self):
        """Number of entries currently held in ``_data``."""
        return len(self._data)

    def __getitem__(self, key):
        """Return the entry at ``key``, or ``None`` when the index is out of range."""
        try:
            return self._data[key]
        except IndexError:
            # Out-of-range lookups are deliberately silent and yield None.
            return None

    def __iter__(self):
        """Iterates through the indices from 0 to len(self._data)"""
        ids = range(len(self._data))
        return iter(ids)


class SerialUnidirectionalMapper(AbstractSerialMapper):
    """One-to-many mapper: every ID (list position) owns a bucket (list) of values."""

    def add(self, data, *args, key=None, **kwargs):
        """
        Store ``data`` in the bucket identified by ``key``.

        With ``key=None`` a new bucket containing only ``data`` is appended;
        otherwise ``data`` is appended to the existing bucket at index ``key``.
        """
        if key is not None:
            self._data[key].append(data)
            return
        self._data.append([data])


class SerialBidirectionalMapper(AbstractSerialMapper):
    """
    Two-way mapping between values and their serial IDs.

    ``mapper[id]`` gives the value stored under ``id``; ``mapper.inverse[value]``
    gives the ID of ``value``, or ``None`` when the value was never added.
    """

    def __init__(self):
        super().__init__()  # creates the ``_data`` list (ID -> value)
        # value -> ID; unknown values resolve to None instead of raising KeyError.
        self._inverse_data = defaultdict(lambda: None)

    def __getitem__(self, key):
        """
        Return the value stored under ``key``.

        Unlike the base class, an out-of-range index raises ``IndexError``.
        """
        return self._data[key]

    def add(self, data, *args, **kwargs):
        """
        Register ``data`` and return its ID.

        Adding a value that is already mapped returns its existing ID and stores
        nothing, so each value keeps exactly one ID and the mapping stays
        bijective.
        """
        # .get() does not trigger the defaultdict factory, so the lookup itself
        # never inserts a placeholder entry.
        known_id = self._inverse_data.get(data)
        if known_id is not None:
            return known_id

        new_id = len(self._data)
        self._inverse_data[data] = new_id
        self._data.append(data)
        return new_id

    @property
    def inverse(self):
        """
        The value -> ID mapping.

        It is a ``defaultdict``: indexing it with an unknown value returns
        ``None`` and also stores a ``None`` entry for that value; use ``.get()``
        for a lookup without that side effect.
        """
        return self._inverse_data


# This class comes in because I want to be able to generalize the system to any data later.
class InMemoryCsv2DIndexer(object):
    """
    An in-memory 2D indexer that efficiently stores rows, columns and their related data from a CSV file.

    This class provides an efficient mechanism to map and store large datasets
    using bijective mappings between row/column names and their corresponding IDs.

    Attrs:
        LIMIT_TO_LOAD_IN_MEMORY (int): Maximum number of lines to load into memory.
        DEFAULT_CSV_DATA_INSTANTIATOR (callable): Default instantiator; it returns
            each CSV line unchanged.

    Args:
        row_name (str): The name representing the rows.
        column_name (str): The name representing the columns.
        file_path (str, optional): When given, this CSV file is indexed right away,
            using ``row_name`` and ``column_name`` as the CSV headers.
        data_columns (Iterable, optional): Default second argument handed to the
            instantiator.
        data_constructor (Callable, optional): Default instantiator used by
            ``index_from_csv``.
        limit (int, optional): Default maximum number of lines to index; falls
            back to ``LIMIT_TO_LOAD_IN_MEMORY``.
        verbose (bool): When true, every indexed line is logged.
        matrix_value_column_name (str, optional): Key, in the stored data, of the
            value placed in the cells of ``matrix``.

    Methods:
        index_from_csv(file_path, row_header, column_header, data_columns, instantiator, limit, verbose):
            Indexes data from a CSV file, mapping rows and columns by their headers.

    Example usage:
        >>> indexer2 = InMemoryCsv2DIndexer(row_name="userId", column_name="movieId")
        >>> # Testing with correct row and column headers
        >>> indexer2.index_from_csv(
        ...     file_path="./ml-32m/ratings.csv",
        ...     row_header="userId",
        ...     column_header="movieId",
        ...     limit=10
        ... )  # doctest:+ELLIPSIS
        Limit of entries (.i.e 10) to load has been reached. Exiting without loading the rest...
        <__main__.InMemoryCsv2DIndexer object at ...
        >>> # Testing with correct row and column headers
        >>> indexed_data2 = InMemoryCsv2DIndexer(row_name="userId", column_name="movieId")
        >>> indexed_data2.index_from_csv(
        ...     file_path="./ml-32m/ratings.csv",
        ...     row_header="userId",
        ...     column_header="movieId",
        ...     limit=400
        ... )  # doctest:+ELLIPSIS
        Limit of entries (.i.e 400) to load has been reached. Exiting without loading the rest...
        <__main__.InMemoryCsv2DIndexer object at ...
        >>> # Testing the content of the indexer
        >>> len(indexed_data2._id_to_row_bmap._data) == 5
        True
    """

    LIMIT_TO_LOAD_IN_MEMORY = 1_000_000_000_000

    # Looked up through an instance, this lambda is bound like a method: `_cls`
    # receives the instance and callers pass only (data, columns).
    DEFAULT_CSV_DATA_INSTANTIATOR = lambda _cls, data, _columns: data
    # tuple(data[k] for k in (columns if columns else data))

    # Names of the cached properties that must be recomputed after (re)indexing.
    _CACHED_VIEWS = (
        "matrix",
        "numpy_matrix",
        "row_id_degree_map",
        "column_id_degree_map",
    )

    def create_from_csv(self):
        """Placeholder, not implemented yet."""
        pass

    def __init__(
        self,
        *,
        row_name: str,
        column_name: str,
        file_path: str = None,
        data_columns: Iterable = None,
        data_constructor: Callable = None,
        limit: int = None,
        verbose: bool = False,
        matrix_value_column_name: str = None,
    ):  # typing
        self._row_name = row_name
        self._column_name = column_name
        self._matrix_value_column_name = matrix_value_column_name
        self._verbose = verbose
        self._data_columns = data_columns
        self._data_constructor = data_constructor
        self._limit = limit or self.LIMIT_TO_LOAD_IN_MEMORY

        self._matrix_value_constructor = float
        self._id_to_column_bmap = SerialBidirectionalMapper()  # bijective mapping
        self._id_to_row_bmap = SerialBidirectionalMapper()
        self._data_by_row_id = SerialUnidirectionalMapper()  # one-to-many mapping
        self._data_by_column_id = SerialUnidirectionalMapper()

        # Defined up front so that `matrix` can report a clear error before any indexing.
        self.is_indexed = False

        # Indexing at construction time is optional: without a file the caller
        # is expected to chain `.index_from_csv(...)` explicitly.
        if file_path is not None:
            self.index_from_csv(file_path=file_path)

    def index_from_csv(
        self,
        *,
        file_path: str,
        row_header: str = None,
        column_header: str = None,
        data_columns=None,
        instantiator=None,  # Must return a dict
        # data_constructor=None,
        limit=None,
        verbose=False,
    ):  # typing
        """
        Index the lines of a CSV file by row and by column.

        Every omitted option falls back to the value given to the constructor.

        Args:
            file_path (str): Path of the CSV file; its first line must be the header.
            row_header (str, optional): CSV header holding the row name;
                defaults to ``row_name``.
            column_header (str, optional): CSV header holding the column name;
                defaults to ``column_name``.
            data_columns (optional): Passed as second argument to the instantiator.
            instantiator (callable, optional): Called as ``instantiator(line, data_columns)``;
                its result is what gets stored for the line.
            limit (int, optional): Maximum number of lines to index.
            verbose (bool): Log every indexed line.

        Returns:
            InMemoryCsv2DIndexer: ``self``, to enable method chaining.
        """
        row_header = row_header or self._row_name
        column_header = column_header or self._column_name
        if data_columns is None:
            data_columns = self._data_columns
        instantiator = (
            instantiator
            or self._data_constructor
            or self.DEFAULT_CSV_DATA_INSTANTIATOR
        )
        limit_to_load = limit or self._limit
        verbose = verbose or self._verbose
        indexed_count = 0

        # The cached views describe the previous content; drop them so that they
        # are rebuilt from the freshly indexed data on next access.
        for cached_name in self._CACHED_VIEWS:
            self.__dict__.pop(cached_name, None)

        row_ids = self._id_to_row_bmap.inverse
        column_ids = self._id_to_column_bmap.inverse

        with open(file_path, mode="r", newline="") as csvfile:
            for line_count, line in enumerate(DictReader(csvfile)):
                row, column = line[row_header], line[column_header]
                # Unlikely in this dataset but better have this guard
                if not all([row, column]):
                    logger.log(
                        f"Cannot process the line {line_count}, cause expects `row` and `column` to be defined but got {row} and {column} for them respectively, skipping..."
                    )
                    continue

                # .get() returns None for unknown names without inserting a
                # placeholder entry into the inverse mapping.
                row_id = row_ids.get(row)
                column_id = column_ids.get(column)

                if row_id is None:
                    # This row is a new one, so add it
                    self._id_to_row_bmap.add(row)

                if column_id is None:
                    # This column is a new one, so add it
                    self._id_to_column_bmap.add(column)

                # Add the data without the useless columns for indexing
                data = instantiator(line, data_columns)

                # A None key opens a new bucket, whose position matches the ID
                # just given to the new row/column.
                self._data_by_row_id.add(data=data, key=row_id)
                self._data_by_column_id.add(data=data, key=column_id)

                if verbose:
                    logger.log(
                        f"Indexed the line {indexed_count} of {file_path} successfully"
                    )

                indexed_count += 1
                if indexed_count == limit_to_load:
                    logger.log(
                        f"Limit of entries (.i.e {limit_to_load}) to load has been reached. Exiting without loading the rest... "
                    )
                    break
        self.is_indexed = True
        return self  # To enable method chaining (Fluent pattern)

    def find_data_by_row_id(self, row_id):
        """Return the list of data stored for the row with ID ``row_id``."""
        return self._data_by_row_id[row_id]

    def find_data_by_column_id(self, column_id):
        """Return the list of data stored for the column with ID ``column_id``."""
        return self._data_by_column_id[column_id]

    @cached_property
    def matrix(self):  # typing
        """
        Dense ``rows_count`` x ``columns_count`` matrix, as a list of lists.

        A cell holds the converted value of the data indexed for that
        (row, column) pair, or ``None`` when no data was indexed for it.

        Raises:
            AttributeError: If nothing has been indexed yet, or if no
                ``matrix_value_column_name`` was given to the constructor.
        """
        if not getattr(self, "is_indexed", False):
            raise AttributeError(
                "Cannot define matrix if the indexer is not indexed, ensure you called `index_from_csv()`"
            )
        value_column = getattr(self, "_matrix_value_column_name", None)
        if value_column is None:
            raise AttributeError(
                "Cannot define matrix without a value column, ensure you passed `matrix_value_column_name`"
            )

        rows_count, columns_count = self.rows_count, self.columns_count
        matrix = [[None] * columns_count for _ in range(rows_count)]
        column_ids = self._id_to_column_bmap.inverse
        for row_id in range(rows_count):
            for data in self._data_by_row_id[row_id]:
                # FIXME: Depends on the instantiator
                column_id = column_ids.get(data[self._column_name])
                matrix[row_id][column_id] = self._matrix_value_constructor(
                    data[value_column]
                )
        return matrix

    @cached_property
    def numpy_matrix(self):
        """``matrix`` as a float NumPy array; cells without data become NaN."""
        # Imported here so that the rest of the class only needs the standard library.
        import numpy as np

        return np.array(self.matrix, dtype=float)

    @property
    def rows_count(self) -> int:
        """Number of distinct rows indexed so far."""
        return len(self._id_to_row_bmap)

    @property
    def columns_count(self) -> int:
        """Number of distinct columns indexed so far."""
        return len(self._id_to_column_bmap)

    @cached_property
    def row_id_degree_map(self):
        """Mapping of each row ID to the number of data entries indexed for it."""
        return {
            row_id: len(self._data_by_row_id[row_id]) for row_id in self._id_to_row_bmap
        }

    @cached_property
    def column_id_degree_map(self):
        """Mapping of each column ID to the number of data entries indexed for it."""
        return {
            column_id: len(self._data_by_column_id[column_id])
            for column_id in self._id_to_column_bmap
        }

    def _index_data(self, file_path):
        """Placeholder, not implemented yet."""
        pass

    def _index_from_csv(self, file_path):
        """Placeholder, not implemented yet."""
        pass


# FIXME: Replace the doctests with proper unit tests at the end
# doctest.testmod()  # Skip these tests for the moment

# Build the indexer first, then load the first 400 ratings into it.
ratings_indexer = InMemoryCsv2DIndexer(
    row_name="userId", column_name="movieId", matrix_value_column_name="rating"
)
indexed_data = ratings_indexer.index_from_csv(
    file_path="./ml-32m/ratings.csv",
    row_header="userId",
    column_header="movieId",
    data_columns=("rating",),
    limit=400,
)

# doctest.testmod()
# TODO: Implement these methods if the need shows up
# indexed_data.find_row_by_row_id()
# indexed_data.find_column_by_colum_id()
# indexed_data.find_column_by_id()
# indexed_data.row_id_degree_map

# Displayed as the cell output: the row names in ID order.
indexed_data._id_to_row_bmap._data

Limit of entries (.i.e 400) to load has been reached. Exiting without loading the rest... 


['1', '2', '3', '4', '5']

In [3]:
# From here on, we use the domain vocabulary.
data_matrix = indexed_data.matrix
data_np_matrix = indexed_data.numpy_matrix
# Prints "<rows_count> <columns_count>".
print(f"{indexed_data.rows_count} {indexed_data.columns_count}")
# indexed_data.find_data_by_column_id(400)
# squared = indexed_data.numpy_matrix ** 2

5 327


In [4]:
# Inspect the raw per-row buckets: one list of stored records per row ID.
# NOTE(review): this displays every indexed record; consider slicing
# (e.g. the first bucket only) to keep the output short.
indexed_data._data_by_row_id._data

[[{'userId': '1', 'movieId': '17', 'rating': '4.0', 'timestamp': '944249077'},
  {'userId': '1', 'movieId': '25', 'rating': '1.0', 'timestamp': '944250228'},
  {'userId': '1', 'movieId': '29', 'rating': '2.0', 'timestamp': '943230976'},
  {'userId': '1', 'movieId': '30', 'rating': '5.0', 'timestamp': '944249077'},
  {'userId': '1', 'movieId': '32', 'rating': '5.0', 'timestamp': '943228858'},
  {'userId': '1', 'movieId': '34', 'rating': '2.0', 'timestamp': '943228491'},
  {'userId': '1', 'movieId': '36', 'rating': '1.0', 'timestamp': '944249008'},
  {'userId': '1', 'movieId': '80', 'rating': '5.0', 'timestamp': '944248943'},
  {'userId': '1', 'movieId': '110', 'rating': '3.0', 'timestamp': '943231119'},
  {'userId': '1', 'movieId': '111', 'rating': '5.0', 'timestamp': '944249008'},
  {'userId': '1', 'movieId': '161', 'rating': '1.0', 'timestamp': '943231162'},
  {'userId': '1', 'movieId': '166', 'rating': '5.0', 'timestamp': '943228442'},
  {'userId': '1', 'movieId': '176', 'rating': '4