In [10]:
import os
import sys
from pathlib import Path



In [11]:
%pwd

'd:\\Time Series Forecasting\\Time-Series-Forecasting-with-XGBoost\\notebooks'

In [12]:
# Project directories, resolved relative to the kernel's working directory.
# PROJECT_ROOT is the directory one level above the current working directory;
# data_path is its data/raw subdirectory.
PROJECT_ROOT = Path.cwd().parents[0]
data_path = PROJECT_ROOT.joinpath('data', 'raw')

In [13]:
# Data validation 

import pandas as pd
import numpy as np
from typing import Any, Optional, Union, Generator, List
from pathlib import Path
from myapp.config.config_manager import ConfigManager
from myapp.utils.logger import CustomLogger
# from myapp.components.data_validation import DataValidator


In [None]:
from typing import Dict, Any, Optional
import pandas as pd


class CleaningRules:
    """
    Define cleaning rules and configurations for energy usage data.
    Similar to data_schema but focused on cleaning/repairing data.

    All methods are static, take a DataFrame and return the cleaned
    DataFrame. None of them modifies the frame passed in, so a cell calling
    them can be re-run safely against the same input.
    """

    # Columns expected to be cleaned and their data types
    expected_columns: Dict[str, str] = {
        "datetime": "datetime64[ns]",
        "aep_mw": "float64",
    }

    # Columns allowed to have nulls or not
    nullable_columns = set()  # e.g. {"aep_mw"} if missing allowed

    # Default fill values for missing data (if any)
    default_fill_values: Dict[str, Any] = {
        # e.g., "aep_mw": 0.0  # fill missing with 0 if appropriate
    }

    # Rules for invalid or erroneous values (min, max, etc.)
    value_constraints: Dict[str, Dict[str, Optional[float]]] = {
        "aep_mw": {"min": 0.0, "max": None},  # energy should be non-negative
    }

    @staticmethod
    def fix_dtypes(df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert columns to expected data types.

        Args:
            df: Frame to convert. Columns listed in ``expected_columns`` that
                are absent from ``df`` are skipped.

        Returns:
            A copy of ``df`` with every present expected column cast to its
            configured dtype.

        Raises:
            ValueError: If a column cannot be cast; the original exception is
                attached as the cause.
        """
        converted = df.copy()
        for col, dtype in CleaningRules.expected_columns.items():
            if col in converted.columns:
                try:
                    converted[col] = converted[col].astype(dtype)
                except Exception as e:
                    raise ValueError(f"Error converting column '{col}' to {dtype}: {e}") from e
        return converted

    @staticmethod
    def fill_missing(df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill missing values based on default_fill_values or drop if not nullable.

        For each expected column present in ``df``:
          * a configured default fills the missing values (nullable or not);
          * otherwise, a non-nullable column has its missing rows dropped;
          * otherwise (nullable, no default) the column is left untouched.

        Args:
            df: Frame to clean.

        Returns:
            A new frame; ``df`` itself is left unchanged.
        """
        cleaned = df.copy()
        for col in CleaningRules.expected_columns:
            if col not in cleaned.columns:
                continue
            if col in CleaningRules.default_fill_values:
                # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
                # the chained in-place form acts on a temporary Series and does
                # not update the frame under pandas Copy-on-Write.
                cleaned[col] = cleaned[col].fillna(CleaningRules.default_fill_values[col])
            elif col not in CleaningRules.nullable_columns:
                # Not nullable and no default: drop rows missing this column.
                cleaned = cleaned.dropna(subset=[col])
        return cleaned

    @staticmethod
    def remove_invalid_values(df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove rows with invalid values based on constraints.

        Rows are kept only when the value satisfies every configured bound
        (``min`` and ``max`` are inclusive; ``None`` means unbounded). Rows
        whose value is missing fail the comparison and are removed as well.

        Args:
            df: Frame to filter. Constrained columns absent from ``df`` are
                skipped.

        Returns:
            The filtered frame.
        """
        for col, constraints in CleaningRules.value_constraints.items():
            if col in df.columns:
                min_val = constraints.get("min")
                max_val = constraints.get("max")
                if min_val is not None:
                    df = df[df[col] >= min_val]
                if max_val is not None:
                    df = df[df[col] <= max_val]
        return df

    @staticmethod
    def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove duplicate rows based on 'datetime' column.

        The first row for each distinct 'datetime' value is kept. Frames
        without a 'datetime' column are returned as-is.
        """
        if 'datetime' in df.columns:
            df = df.drop_duplicates(subset=['datetime'])
        return df


In [None]:
import pandas as pd
from datetime import datetime

# Mock readings seeded with problem rows: a missing reading, a negative
# reading, and a row whose timestamp is missing.
data = {
    "datetime": [
        "2023-01-01 00:00:00",
        "2023-01-01 01:00:00",
        "2023-01-01 02:00:00",
        None,
    ],
    "aep_mw": [100.0, None, -5.0, 50.0],
}

# Build the frame and parse the timestamp strings in one expression.
df_mock = pd.DataFrame(data).assign(
    datetime=lambda frame: pd.to_datetime(frame["datetime"])
)

df_mock


In [None]:
import pandas as pd
from myapp.preprocessing.cleaning_rules import CleaningRules
from myapp.preprocessing.data_preprocessing import DataPreprocessing

# Create mock data: one clean row, one row with a missing reading, and one row
# with a missing timestamp plus a negative reading.
# NOTE: this rebinds `df_mock`, replacing the frame built in the previous cell.
df_mock = pd.DataFrame({
    "datetime": pd.to_datetime(["2023-01-01 00:00:00", "2023-01-01 01:00:00", None]),
    "aep_mw": [100.0, None, -10.0]
})

# Instantiate and run cleaning (no logger is supplied for this ad-hoc check)
preprocessor = DataPreprocessing(logger=None)
cleaned_df = preprocessor.clean_data(df_mock)

# End the cell on the frame itself so the notebook renders it with its rich
# (HTML table) representation instead of plain printed text.
cleaned_df


In [None]:
class CleaningRules:
    """
    Cleaning rules and configuration for energy usage data.

    All methods are static. The transforming methods (``fix_dtypes``,
    ``fill_missing``, ``remove_invalid_values``) work on a copy and leave the
    frame passed in unchanged.
    """

    # Columns expected to be cleaned and their data types
    expected_columns: Dict[str, str] = {
        "datetime": "datetime64[ns]",
        "aep_mw": "float64",
    }

    # Columns allowed to contain nulls (none by default)
    nullable_columns = set()

    # Default fill values for missing data, keyed by column name
    default_fill_values: Dict[str, Any] = {
        "aep_mw": 0.0  # example: fill missing energy with 0, adjust as needed
    }

    # Inclusive bounds per column; None means unbounded on that side
    value_constraints: Dict[str, Dict[str, Optional[float]]] = {
        "aep_mw": {"min": 0.0, "max": None},
    }

    @staticmethod
    def validate_required_columns(df: pd.DataFrame) -> None:
        """
        Ensure every column named in ``expected_columns`` exists in ``df``.

        Raises:
            ValueError: Listing the expected columns that are absent.
        """
        missing = [col for col in CleaningRules.expected_columns if col not in df.columns]
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

    @staticmethod
    def fix_dtypes(df: pd.DataFrame) -> pd.DataFrame:
        """
        Cast each present expected column to its configured dtype.

        Args:
            df: Frame to convert. Expected columns absent from ``df`` are
                skipped.

        Returns:
            A copy of ``df`` with the casts applied.

        Raises:
            ValueError: If a column cannot be cast; the original exception is
                attached as the cause.
        """
        # Work on a copy, matching fill_missing and remove_invalid_values.
        df_copy = df.copy()
        for col, dtype in CleaningRules.expected_columns.items():
            if col in df_copy.columns:
                try:
                    df_copy[col] = df_copy[col].astype(dtype)
                except Exception as e:
                    raise ValueError(f"Error converting column '{col}' to {dtype}: {e}") from e
        return df_copy

    @staticmethod
    def has_missing(df: pd.DataFrame) -> bool:
        """
        Check if dataframe has any missing values in expected columns.

        Returns:
            A plain Python ``bool``: True when at least one expected column
            contains a null.

        Raises:
            KeyError: If an expected column is absent from ``df``.
        """
        # Select with a list (not a dict view) and convert the NumPy boolean
        # produced by pandas into the built-in bool promised by the signature.
        expected = list(CleaningRules.expected_columns)
        return bool(df[expected].isnull().any().any())

    @staticmethod
    def fill_missing(df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill nulls in expected columns that have a configured default.

        Columns without an entry in ``default_fill_values`` are left as they
        are, including any nulls they contain.

        Returns:
            A copy of ``df`` with the defaults applied.
        """
        df_copy = df.copy()
        for col in CleaningRules.expected_columns.keys():
            if col in df_copy.columns and col in CleaningRules.default_fill_values:
                df_copy[col] = df_copy[col].fillna(CleaningRules.default_fill_values[col])
        return df_copy

    @staticmethod
    def remove_invalid_values(df: pd.DataFrame) -> pd.DataFrame:
        """
        Keep only rows whose values satisfy the configured bounds.

        Bounds are inclusive. Rows whose value is missing fail the comparison
        and are removed as well. Constrained columns absent from ``df`` are
        skipped.

        Returns:
            A filtered copy of ``df``.
        """
        df_copy = df.copy()
        for col, constraints in CleaningRules.value_constraints.items():
            if col in df_copy.columns:
                min_val = constraints.get("min")
                max_val = constraints.get("max")
                if min_val is not None:
                    df_copy = df_copy[df_copy[col] >= min_val]
                if max_val is not None:
                    df_copy = df_copy[df_copy[col] <= max_val]
        return df_copy
