# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class EDAAggregator:
    """Exploratory-data-analysis helper for a daily dataset.

    The aggregator keeps its own copy of the input frame, enriched with
    calendar columns ('month', 'year', 'quarter', 'day_of_week') derived
    from 'pay_date'. Analysis methods may add further columns ('z_score',
    'is_holiday') to that private copy; the caller's DataFrame is never
    modified.
    """

    def __init__(self, data: pd.DataFrame, target_col: str) -> None:
        """
        Initialize the EDAAggregator class.

        :param data: Daily level dataset as a DataFrame. Must contain a
            'pay_date' column and the column named by ``target_col``.
        :param target_col: Target column for EDA ('volume' or 'value').
        :raises KeyError: If 'pay_date' or ``target_col`` is not a column
            of ``data``.
        """
        # Fail fast with a clear message instead of a late KeyError from
        # deep inside one of the analysis methods.
        missing = [col for col in ('pay_date', target_col) if col not in data.columns]
        if missing:
            raise KeyError(f"Missing required column(s): {missing}")

        # Work on a private copy: the derived columns added below (and by the
        # analysis methods) must not leak into the caller's DataFrame, and two
        # aggregators built from the same frame must not overwrite each other.
        self.data = data.copy()
        self.target_col = target_col
        self.data['pay_date'] = pd.to_datetime(self.data['pay_date'])
        self.data['month'] = self.data['pay_date'].dt.month
        self.data['year'] = self.data['pay_date'].dt.year
        self.data['quarter'] = self.data['pay_date'].dt.quarter
        # pandas convention: Monday=0 ... Sunday=6.
        self.data['day_of_week'] = self.data['pay_date'].dt.dayofweek

    def aggregate_to_monthly(self) -> pd.DataFrame:
        """
        Aggregate daily data to monthly level.

        :return: DataFrame with one row per (year, month) and the columns
            'year', 'month', '<target>_sum', '<target>_mean', '<target>_std'.
        """
        monthly_data = (
            self.data.groupby(['year', 'month'])[self.target_col]
            .agg(['sum', 'mean', 'std'])
            .reset_index()
        )
        # Replace the generic statistic names with target-prefixed ones.
        monthly_data.columns = [
            'year',
            'month',
            f'{self.target_col}_sum',
            f'{self.target_col}_mean',
            f'{self.target_col}_std',
        ]
        return monthly_data

    def check_anomalies(self, threshold: float = 3) -> pd.DataFrame:
        """
        Check for anomalies in the target column using z-scores.

        Stores the z-scores in a 'z_score' column of ``self.data`` as a side
        effect.

        :param threshold: Threshold for identifying anomalies (default is 3).
        :return: Rows of ``self.data`` whose absolute z-score exceeds
            ``threshold``.
        """
        values = self.data[self.target_col]
        # If the standard deviation is zero or undefined the z-scores become
        # NaN; NaN never compares greater than the threshold, so no row is
        # reported as an anomaly in that case.
        self.data['z_score'] = (values - values.mean()) / values.std()
        anomalies = self.data[self.data['z_score'].abs() > threshold]
        return anomalies

    def analyze_holiday_trends(self, holiday_dates) -> pd.DataFrame:
        """
        Analyze trends around holidays.

        Adds a boolean 'is_holiday' column to ``self.data`` as a side effect.

        :param holiday_dates: List of holiday dates as strings (e.g., ['2024-01-01', '2024-12-25']).
        :return: DataFrame with columns 'is_holiday' and the target column,
            holding the mean of the target for holiday and non-holiday rows.
        """
        # Compare calendar days only: normalize both sides to midnight so that
        # a 'pay_date' carrying a time-of-day still matches its holiday.
        holiday_days = pd.DatetimeIndex(pd.to_datetime(holiday_dates)).normalize()
        pay_days = self.data['pay_date'].dt.normalize()
        self.data['is_holiday'] = pay_days.isin(holiday_days)
        holiday_trends = self.data.groupby('is_holiday')[self.target_col].mean().reset_index()
        return holiday_trends

    def visualize_trends(self) -> None:
        """Visualize trends over months, years, quarters, and days of the week."""
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # NOTE(review): `ci=None` is kept for compatibility with older seaborn
        # releases; newer releases deprecate it in favour of `errorbar=None`.

        # Monthly trend
        sns.lineplot(data=self.data, x='month', y=self.target_col, estimator='mean', ci=None, ax=axes[0, 0])
        axes[0, 0].set_title('Monthly Trend')

        # Yearly trend
        sns.lineplot(data=self.data, x='year', y=self.target_col, estimator='mean', ci=None, ax=axes[0, 1])
        axes[0, 1].set_title('Yearly Trend')

        # Quarterly trend
        sns.boxplot(data=self.data, x='quarter', y=self.target_col, ax=axes[1, 0])
        axes[1, 0].set_title('Quarterly Trend')

        # Day of week trend
        sns.boxplot(data=self.data, x='day_of_week', y=self.target_col, ax=axes[1, 1])
        axes[1, 1].set_title('Day of Week Trend')

        plt.tight_layout()
        plt.show()

    def run_full_analysis(self, holiday_dates) -> None:
        """
        Run the full analysis pipeline.

        Prints the monthly aggregation, the detected anomalies and the
        holiday comparison, then shows the trend plots.

        :param holiday_dates: List of holiday dates for holiday analysis.
        """
        print("Aggregating data to monthly level...")
        monthly_data = self.aggregate_to_monthly()
        print(monthly_data)

        print("\nChecking for anomalies...")
        anomalies = self.check_anomalies()
        print(anomalies)

        print("\nAnalyzing trends around holidays...")
        holiday_trends = self.analyze_holiday_trends(holiday_dates)
        print(holiday_trends)

        print("\nVisualizing trends...")
        self.visualize_trends()

# Example Usage
if __name__ == "__main__":
    # Sample daily data: one row per calendar day of 2024.
    dates = pd.date_range(start='2024-01-01', end='2024-12-31', freq='D')
    # Size the random columns from the date range itself rather than a
    # hard-coded day count, so editing the range cannot cause a length
    # mismatch when the DataFrame is built.
    n_days = len(dates)
    data = pd.DataFrame({
        'pay_date': dates,
        'volume': np.random.randint(50, 500, size=n_days),
        'value': np.random.randint(1000, 10000, size=n_days)
    })

    # Define holiday dates
    holiday_dates = ['2024-01-01', '2024-12-25']

    # Initialize and run analysis for 'volume'
    print("EDA for Volume")
    eda_volume = EDAAggregator(data, 'volume')
    eda_volume.run_full_analysis(holiday_dates)

    # Initialize and run analysis for 'value'
    print("\nEDA for Value")
    eda_value = EDAAggregator(data, 'value')
    eda_value.run_full_analysis(holiday_dates)