In [5]:
import numpy as np
import pandas as pd
from factor_analyzer.factor_analyzer import calculate_kmo

In [6]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import glob
import re

# Root directory holding one sub-folder per dataset listed in `folders`
base_dir = "/Users/laurenzschneeberger/Documents/Collection/Thesis/Code/Final_Data/Walkforward_Sets"

# Sub-folders of `base_dir` to process; each one becomes its own Excel sheet
folders = [
    "25_stocks_seed42",
    "50_stocks_seed42",
    "75_stocks_seed42",
    "100_stocks_seed42",
    "250_stocks_seed42",
]

# Function to calculate statistics for each dataframe
def calculate_stats(df):
    """Summarise a DataFrame with column-averaged moments and a condition number.

    Each moment is computed per column with the pandas defaults and then
    averaged across columns.

    Args:
        df: A pandas DataFrame of numeric columns.

    Returns:
        A dict with the keys 'Average Mean', 'Average Std Dev',
        'Average Skewness', 'Average Kurtosis' and 'Condition Number'.
        'Condition Number' is NaN when the frame has fewer rows than columns
        or when NumPy cannot compute it.
    """
    # 1-4. Per-column moments, averaged across columns
    avg_mean = df.mean().mean()
    avg_std = df.std().mean()
    avg_skew = df.skew().mean()
    avg_kurt = df.kurtosis().mean()

    # 5. Condition number of the raw data matrix.
    # Only computed when there are at least as many rows as columns (the
    # matrix need not be square); wider matrices are reported as NaN.
    n_rows, n_cols = df.shape
    cond_num = np.nan
    if n_rows >= n_cols:
        try:
            cond_num = np.linalg.cond(df.values)
        except (np.linalg.LinAlgError, TypeError, ValueError):
            # Empty input, non-convergent SVD or non-numeric data: report NaN
            # instead of aborting, but let KeyboardInterrupt/SystemExit through.
            cond_num = np.nan

    return {
        'Average Mean': avg_mean,
        'Average Std Dev': avg_std,
        'Average Skewness': avg_skew,
        'Average Kurtosis': avg_kurt,
        'Condition Number': cond_num
    }

# Function to extract the numeric prefix from filenames for sorting
def extract_prefix(filename):
    """Return the leading ``<digits>[<lowercase letter>]`` prefix of *filename*.

    The prefix must be immediately followed by an underscore. It is returned
    as a string (e.g. ``"12a"``); when *filename* does not start with such a
    prefix, *filename* itself is returned unchanged.
    """
    found = re.match(r'(\d+[a-z]?)_', filename)
    return filename if found is None else found.group(1)

def _natural_key(name):
    """Order result names by the integer value of their prefix, then its letter.

    ``extract_prefix`` returns the prefix as text, so using it directly as a
    sort key compares character by character and places "10" before "2".
    """
    prefix = extract_prefix(name)
    parsed = re.fullmatch(r'(\d+)([a-z]?)', prefix)
    if parsed is None:
        # Names without a recognised prefix go after all prefixed ones.
        return (1, 0, prefix)
    return (0, int(parsed.group(1)), parsed.group(2))


def _load_section(folder_path, pattern):
    """Read every CSV in *folder_path* matching *pattern* and tabulate its stats.

    Returns a ``(frames, results, results_df)`` triple: the loaded DataFrames
    keyed by file name without ``.csv``, the ``calculate_stats`` dict for each
    of them, and those dicts as a DataFrame ordered by ``_natural_key``.
    """
    frames = {}
    for file_path in glob.glob(os.path.join(folder_path, pattern)):
        file_name = os.path.basename(file_path).replace('.csv', '')
        frames[file_name] = pd.read_csv(file_path, index_col=0)

    results = {name: calculate_stats(frame) for name, frame in frames.items()}
    results_df = pd.DataFrame.from_dict(results, orient='index')
    if not results_df.empty:
        results_df = results_df.loc[sorted(results_df.index, key=_natural_key)]
    return frames, results, results_df


def _section_header(title, columns):
    """Build a one-row frame with *title* in the first column and blanks elsewhere."""
    return pd.DataFrame([[title] + [""] * (len(columns) - 1)],
                        columns=columns,
                        index=[""])


# Create Excel writer to save multiple sheets (one sheet per folder)
with pd.ExcelWriter("descriptive_stats.xlsx") as writer:

    # Process each folder separately
    for folder in folders:
        folder_path = os.path.join(base_dir, folder)

        # Load, summarise and sort the in-sample and out-of-sample residual files
        is_dfs, is_results, is_results_df = _load_section(folder_path, "*varresid_is.csv")
        os_dfs, os_results, os_results_df = _load_section(folder_path, "*varresid_os.csv")

        # Stack a title row followed by the statistics for each non-empty section
        parts = []
        for title, results_df in (("In-Sample Data", is_results_df),
                                  ("Out-of-Sample Data", os_results_df)):
            if results_df.empty:
                # No files matched: a header cannot be built without columns
                continue
            parts.append(_section_header(title, results_df.columns))
            parts.append(results_df)

        if not parts:
            print(f"No varresid files found in folder {folder}; skipping")
            continue

        # Single concat avoids seeding the result with an empty DataFrame
        combined_df = pd.concat(parts)

        # Export to a sheet in the Excel file
        sheet_name = folder.replace('_stocks_seed42', '')  # Shorter sheet name
        combined_df.to_excel(writer, sheet_name=sheet_name)

        print(f"Processed {len(is_dfs)} in-sample and {len(os_dfs)} out-of-sample dataframes from folder {folder}")

print("All statistics have been exported to descriptive_stats.xlsx with separate sections for in-sample and out-of-sample data")


Processed 16 in-sample and 16 out-of-sample dataframes from folder 25_stocks_seed42
Processed 16 in-sample and 16 out-of-sample dataframes from folder 50_stocks_seed42
Processed 16 in-sample and 16 out-of-sample dataframes from folder 75_stocks_seed42
Processed 16 in-sample and 16 out-of-sample dataframes from folder 100_stocks_seed42
Processed 16 in-sample and 16 out-of-sample dataframes from folder 250_stocks_seed42
All statistics have been exported to descriptive_stats.xlsx with separate sections for in-sample and out-of-sample data
