In [5]:
import pandas as pd
import os

# Input CSVs to summarise, given relative to the notebook's working directory:
# two from the min_max_scaling folder and two from the standardscaler folder.
# Each file must contain a 'Market_Label' column to be processed.
file_paths = [
    '../data/min_max_scaling/cleaned_minmax_scaled_combined_data_nasdaq.csv',
    '../data/min_max_scaling/cleaned_minmax_scaled_combined_data_sp500.csv',
    '../data/standardscaler/cleaned_normalized_combined_data_sp500.csv',
    '../data/standardscaler/cleaned_normalized_combined_data_nasdaq.csv',
]

def compute_statistics(file_paths, output_dir):
    """Compute per-Market_Label mean/median/std for each CSV and save them.

    For every path in ``file_paths`` the CSV is loaded, restricted to its
    numeric columns, grouped by ``Market_Label`` and aggregated with mean,
    median and standard deviation. The resulting table is written to
    ``<output_dir>/<file stem>_grouped_statistics.csv`` and also kept in
    memory.

    Processing is best-effort: a missing file, a file without a
    ``Market_Label`` column, or any error raised while processing a file is
    reported with ``print`` and that file is skipped.

    Args:
        file_paths: Iterable of CSV file paths to process.
        output_dir: Directory for the output CSVs; created if absent.

    Returns:
        Dict mapping each processed file's stem (base name without its final
        extension) to its grouped-statistics DataFrame, whose columns are
        named ``<column>_<statistic>``.
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists

    results = {}
    for file_path in file_paths:
        try:
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue

            data = pd.read_csv(file_path)

            if 'Market_Label' not in data.columns:
                print(f"Missing 'Market_Label' column in {file_path}")
                continue

            # Strip only the final extension so that names containing extra
            # dots (e.g. "prices.v2.csv") are not truncated to "prices" and
            # cannot collide with a sibling such as "prices.v1.csv".
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            # Only numeric columns can be aggregated with mean/median/std
            numeric_data = data.select_dtypes(include=['number'])

            # A non-numeric Market_Label is dropped by the selection above;
            # re-attach it so it can still serve as the grouping key.
            if 'Market_Label' not in numeric_data.columns:
                numeric_data['Market_Label'] = data['Market_Label']

            grouped_stats = numeric_data.groupby('Market_Label').agg(['mean', 'median', 'std'])

            # Flatten the (column, statistic) MultiIndex into "column_statistic"
            grouped_stats.columns = ['_'.join(col) for col in grouped_stats.columns]

            output_file = os.path.join(output_dir, f"{file_name}_grouped_statistics.csv")
            grouped_stats.to_csv(output_file)

            results[file_name] = grouped_stats

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next file instead of aborting the whole batch.
            print(f"Error processing {file_path}: {e}")

    return results

# Directory that receives the per-file grouped-statistics CSVs
output_dir = '../data/EDA'

# Run the aggregation over every configured input file
results = compute_statistics(file_paths, output_dir)

# Preview the first computed table, if at least one file was processed
if results:
    print(next(iter(results.values())).head())


              GDP Growth_mean  GDP Growth_median  GDP Growth_std  CPI_mean  \
Market_Label                                                                 
-1                   0.645897           0.689830        0.157979  0.478745   
 1                   0.502338           0.472063        0.258224  0.390279   

              CPI_median   CPI_std  Interest Rate_mean  Interest Rate_median  \
Market_Label                                                                   
-1              0.494612  0.148301            0.586914              0.609250   
 1              0.377335  0.122223            0.421155              0.392344   

              Interest Rate_std  M2 Money Supply_mean  ...  \
Market_Label                                           ...   
-1                     0.217035              0.164991  ...   
 1                     0.204279              0.145690  ...   

              M2 Money Supply_std  PPI_mean  PPI_median   PPI_std  \
Market_Label                                    