In [30]:
import abc
import enum
import random
import typing as t
import uuid

import numpy as np
import pandas as pd
from rich import print


class Columns(str, enum.Enum):
    """
    Names of the non-feature columns used by the dataset classes.

    Members are also plain strings, so they compare equal to their values.
    """

    # Row key column.
    ID = "id"
    # Observation time column.
    TIMESTAMP = "timestamp"


class BaseDataset:

    def __init__(self, seed: int | None = None, **kwargs):
        self.seed = seed
        if seed is not None:
            np.random.seed(seed)

    @abc.abstractmethod
    def generate(self) -> pd.DataFrame:
        """
        Generate pandas dataframe.
        """
        raise NotImplementedError

    def _generate_ids(self, sample_size: int) -> list[str]:
        return [str(uuid.uuid4()) for _ in range(sample_size)]


class TimeSeriesDataset(BaseDataset):
    """
    Engagement score dataset.

    Produces one row per (id, timestamp) pair with ``feature_size`` numeric
    feature columns named ``feature_1`` .. ``feature_N``.
    """

    key_column = Columns.ID
    timestamp_column = Columns.TIMESTAMP

    def __init__(
        self,
        sample_size: int,
        feature_size: int,
        start_date: str,
        end_date: str,
        missing_ratio: float,
        seed: int | None = 42,
    ):
        """
        Args:
            sample_size: Number of ids to generate.
            feature_size: Number of feature columns.
            start_date: First timestamp, passed to ``pd.date_range``.
            end_date: Last timestamp, passed to ``pd.date_range``.
            missing_ratio: Probability that a single feature value is NaN.
            seed: Seed for NumPy's global random state; ``None`` disables seeding.
        """
        super().__init__(seed)
        self.sample_size = sample_size
        self.feature_size = feature_size
        self.start_date = start_date
        self.end_date = end_date
        self.missing_ratio = missing_ratio

    def _draw_features(self, feature_names: list[str]) -> dict[str, float]:
        """Draw one value per feature: NaN with probability ``missing_ratio``, else a standard normal."""
        return {
            name: np.nan if np.random.rand() < self.missing_ratio else np.random.randn()
            for name in feature_names
        }

    def generate(self) -> pd.DataFrame:
        """
        Generate time series dataset.

        Every column except the key column holds numeric data or timestamps;
        the timestamp column holds the values of ``pd.date_range``.

        Returns:
            A frame with the key column, the timestamp column and the feature
            columns, in that order. The columns are present even when no rows
            are produced (``sample_size == 0`` or an empty date range).
        """
        # Reseed here so repeated calls, and calls made after other NumPy
        # random use, give the same feature values for the same seed.
        if self.seed is not None:
            np.random.seed(self.seed)

        ids = self._generate_ids(self.sample_size)
        date_range = pd.date_range(start=self.start_date, end=self.end_date)
        feature_names = [f"feature_{idx}" for idx in range(1, self.feature_size + 1)]

        # Draw order (id -> timestamp -> feature) is kept so seeded output is stable.
        rows: list[dict[str, t.Any]] = [
            {
                self.key_column.value: id_,
                self.timestamp_column.value: timestamp,
                **self._draw_features(feature_names),
            }
            for id_ in ids
            for timestamp in date_range
        ]
        # Explicit columns keep the schema intact for an empty ``rows`` list.
        columns = [self.key_column.value, self.timestamp_column.value, *feature_names]
        return pd.DataFrame(rows, columns=columns)


class CategoryDataset(BaseDataset):
    """
    Customer profile dataset.

    Intended to hold categorical feature columns next to the key column.
    """

    key_column = Columns.ID

    def __init__(self, sample_size: int, feature_size: int, missing_ratio: float, seed: int = 42):
        """Store the generation parameters and hand the seed to the base class."""
        super().__init__(seed)
        self.missing_ratio = missing_ratio
        self.feature_size = feature_size
        self.sample_size = sample_size

    def generate(self) -> pd.DataFrame:
        """
        Generate category dataset.

        The dataset is meant to contain only categorical data in every column
        except the key column.

        NOTE: generation is not implemented yet; an empty frame is returned.
        """
        placeholder = pd.DataFrame()
        return placeholder

In [31]:
# Generation parameters for the demo frame.
sample_size = 100
feature_size = 10
start_date = "2020-01-01"
end_date = "2020-12-31"
missing_ratio = 0.1

# Build the generator with explicit keywords and materialise the frame at once.
time_series_dataset = TimeSeriesDataset(
    sample_size=sample_size,
    feature_size=feature_size,
    start_date=start_date,
    end_date=end_date,
    missing_ratio=missing_ratio,
).generate()

In [32]:
# Number of distinct ids in the generated frame.
len(time_series_dataset["id"].unique())

100

In [33]:
# Number of distinct timestamps in the generated frame.
len(time_series_dataset["timestamp"].unique())

366