In [7]:
import pandas as pd
from scipy.stats import zscore
import os
import matplotlib.pyplot as plt

def handle_outliers_iqr(data, columns):
    """
    Caps extreme values in the given columns using the IQR rule.

    For each listed column, values below Q1 - 1.5 * IQR are raised to that
    bound and values above Q3 + 1.5 * IQR are lowered to that bound. The
    input DataFrame is not modified; a capped copy is returned.

    Args:
        data (DataFrame): The dataset containing the columns to cap.
        columns (list): Names of the columns to cap.

    Returns:
        DataFrame: A copy of `data` with the listed columns capped.
    """
    result = data.copy()

    for name in columns:
        series = result[name]

        # Quartiles of the column as it currently stands in the copy
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)

        # Distance beyond each quartile that is still considered "normal"
        whisker = 1.5 * (third_quartile - first_quartile)

        # Pull anything outside [Q1 - whisker, Q3 + whisker] back to the edge
        result[name] = series.clip(
            lower=first_quartile - whisker,
            upper=third_quartile + whisker,
        )

    return result

def create_boxplots(data, numeric_columns, output_path):
    """
    Creates horizontal boxplots for numeric columns to visualize outliers.

    One subplot is drawn per column, three per row. The grid always has at
    least three rows (so figures for up to nine columns look the same as a
    fixed 3x3 layout) and grows by whole rows when more columns are given.
    Missing values are dropped per column before plotting, because a column
    containing NaN would otherwise be drawn as an empty box.

    Args:
        data (DataFrame): The dataset containing numeric columns.
        numeric_columns (list): List of numeric columns to create boxplots for.
        output_path (str): Path to save the boxplot image.

    Returns:
        None
    """
    numeric_columns = list(numeric_columns)
    plots_per_row = 3

    # Ceiling division, floored at 3 rows to keep the historical layout
    row_count = max(3, -(-len(numeric_columns) // plots_per_row))

    # Keep the 15x10 size for three rows and add height proportionally beyond
    fig = plt.figure(figsize=(15, 10 * row_count / 3))
    try:
        for i, column in enumerate(numeric_columns, 1):
            plt.subplot(row_count, plots_per_row, i)
            # Plain array without NaN: avoids empty boxes and index-label lookups
            values = data[column].dropna().to_numpy()
            plt.boxplot(values, vert=False, patch_artist=True)
            plt.title(column)
            plt.xlabel('Value')

        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(fig)

def process_csv_files(input_folder, output_folder, numeric_columns):
    """
    Processes multiple CSV files to handle outliers and save cleaned files.

    For every `.csv` file in `input_folder`, a boxplot image of the original
    data is saved as `<filename>_boxplot.png` and an IQR-capped copy of the
    data is saved as `cleaned_<filename>`, both in `output_folder`.

    When `input_folder` and `output_folder` resolve to the same directory,
    files whose name already starts with `cleaned_` are skipped. Otherwise a
    second run would treat the previous run's outputs as inputs and produce
    `cleaned_cleaned_...` files with the capping applied twice.

    Args:
        input_folder (str): Path to the folder containing input CSV files.
        output_folder (str): Path to the folder to save cleaned CSV files.
        numeric_columns (list): List of numeric columns to process for outliers.

    Returns:
        None
    """
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    cleaned_prefix = "cleaned_"
    writes_into_input = os.path.realpath(input_folder) == os.path.realpath(output_folder)

    # Sorted so the processing (and log) order does not depend on the filesystem
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith('.csv'):
            continue

        # Do not re-clean our own outputs when reading and writing one folder
        if writes_into_input and filename.startswith(cleaned_prefix):
            continue

        input_path = os.path.join(input_folder, filename)
        cleaned_output_name = f"cleaned_{filename}"
        output_path = os.path.join(output_folder, cleaned_output_name)

        # Load the CSV file
        data = pd.read_csv(input_path)

        # Create and save boxplots for the original (uncapped) data
        boxplot_path = os.path.join(output_folder, f"{filename}_boxplot.png")
        create_boxplots(data, numeric_columns, boxplot_path)

        # Handle outliers
        cleaned_data = handle_outliers_iqr(data, numeric_columns)

        # Save the cleaned data to a new CSV file
        cleaned_data.to_csv(output_path, index=False)

        print(f"Processed and saved cleaned file: {cleaned_output_name}")


# Columns to cap; the same set is used for both scaled variants of the data
numeric_columns = ['GDP Growth', 'CPI', 'Interest Rate', 'M2 Money Supply', 'PPI', 'Unemployment Rate', 'VIX_Close']

# Min-max-scaled datasets: outputs are written into the same folder as the inputs
min_max_input_folder = '../data/min_max_scaling'
min_max_output_folder = '../data/min_max_scaling'

# Standard-scaled datasets: outputs are written into the same folder as the inputs
standardscaled_input_folder = '../data/standardscaler'
standardscaled_output_folder = '../data/standardscaler'

# Apply outlier handling to each dataset folder in turn
for _source_dir, _target_dir in (
    (min_max_input_folder, min_max_output_folder),
    (standardscaled_input_folder, standardscaled_output_folder),
):
    process_csv_files(_source_dir, _target_dir, numeric_columns)

Processed and saved cleaned file: cleaned_minmax_scaled_combined_data_nasdaq.csv
Processed and saved cleaned file: cleaned_minmax_scaled_combined_data_sp500.csv
Processed and saved cleaned file: cleaned_normalized_combined_data_nasdaq.csv
Processed and saved cleaned file: cleaned_normalized_combined_data_sp500.csv
