# Preprocessing

This notebook documents the creation of the aggregate-level VAR cleaned correlation and return matrices used to estimate the number of factors for the evaluation. It is not the same as the generation of the walkforward test sets which is in preprocessing_eval_walkforward.ipynb.

Input: cleaned no_multicoll return matrices 
Output: return and correlation matrices for (00, 05, 10, 15) timeframes and (25, 50, 75, 100) stocks

In [4]:
import pandas as pd
import numpy as np

from sklearn.covariance import LedoitWolf
from functions import random_subsample2

Set `stock_sample` to the desired number of stocks (25, 50, 75 or 100), check that the file paths below are correct, and then run the notebook.

In [5]:
# Number of stocks requested from the random subsampling step; also embedded in
# the names of the files this notebook writes.
# NOTE(review): the notebook text lists (25, 50, 75, 100) as the intended values,
# but this is set to 250 — confirm which one is meant before running.
stock_sample = 250

In [6]:
# Each CSV is read in long format and pivoted into a wide matrix with one row
# per date, one column per ticker, and the 'ret' column as values.
df00 = (
    pd.read_csv('Final_Data/2000_2004_clean_nomulticoll.csv')
    .pivot(index='date', columns='ticker', values='ret')
)

# The 2005-2009 file is de-duplicated on the (date, ticker) combination,
# keeping the first occurrence, before it is pivoted.
df05 = (
    pd.read_csv('Final_Data/2005_2009_clean_nomulticoll.csv')
    .drop_duplicates(subset=['date', 'ticker'], keep='first')
    .reset_index(drop=True)
    .pivot(index='date', columns='ticker', values='ret')
)

df10 = (
    pd.read_csv('Final_Data/2010_2014_clean_nomulticoll.csv')
    .pivot(index='date', columns='ticker', values='ret')
)

df15 = (
    pd.read_csv('Final_Data/2015_2019_clean_nomulticoll.csv')
    .pivot(index='date', columns='ticker', values='ret')
)

## Random Subsampling 

In [7]:
# Subsample the four period matrices with a fixed seed so the draw is repeatable.
# NOTE(review): the exact sampling semantics live in functions.random_subsample2 — confirm there.
df00, df05, df10, df15 = random_subsample2(
    [df00, df05, df10, df15],
    n_samples=stock_sample,
    seed=42,
)

## Imputation

Analyze the missing data and impute it using linear interpolation.

In [8]:
def analyze_missing_data(df):
    """
    Print a missing-data report for a pandas DataFrame and return the summary.

    Parameters:
    df: pandas DataFrame to inspect. If it contains a 'date' column, a
        per-date breakdown of missing values is printed as well.

    Returns:
    pandas DataFrame indexed by the columns of `df`, with the columns
    'Missing Values', 'Missing %' and 'Data Type', sorted by 'Missing %'
    in descending order.
    """
    # Compute the null mask once and reuse it for every statistic below
    null_mask = df.isnull()

    # Basic missing data info
    print("\n=== Missing Data Summary ===")
    missing = null_mask.sum()
    missing_pct = (missing / len(df)) * 100

    summary = pd.DataFrame({
        'Missing Values': missing,
        'Missing %': missing_pct,
        'Data Type': df.dtypes
    }).sort_values('Missing %', ascending=False)

    print(summary[summary['Missing Values'] > 0])

    # Temporal patterns
    if 'date' in df.columns:
        print("\n=== Missing Values Over Time ===")
        # A GroupBy object has no isnull(); group the boolean mask of the
        # non-date columns by the date values and count the True entries.
        missing_by_date = null_mask.drop(columns='date').groupby(df['date']).sum()
        print("Dates with most missing values:")
        print(missing_by_date.sum(axis=1).sort_values(ascending=False).head())

    # Missing value patterns
    rows_with_missing = null_mask.any(axis=1).sum()
    cols_with_missing = null_mask.any(axis=0).sum()
    print("\n=== Missing Value Patterns ===")
    print(f"Total missing values: {null_mask.sum().sum()}")
    print(f"Rows with any missing values: {rows_with_missing} ({rows_with_missing/len(df)*100:.2f}%)")
    print(f"Columns with any missing values: {cols_with_missing} ({cols_with_missing/len(df.columns)*100:.2f}%)")

    return summary

# Report on each period in chronological order; the returned summary table is
# printed after the report the function itself prints.
for period_returns in (df00, df05, df10, df15):
    print(analyze_missing_data(period_returns))


=== Missing Data Summary ===
        Missing Values  Missing % Data Type
ticker                                     
WLP                 22   1.751592   float64
TGT                 19   1.512739   float64

=== Missing Value Patterns ===
Total missing values: 41
Rows with any missing values: 41 (3.26%)
Columns with any missing values: 2 (0.80%)
        Missing Values  Missing % Data Type
ticker                                     
WLP                 22   1.751592   float64
TGT                 19   1.512739   float64
CA                   0   0.000000   float64
SLE                  0   0.000000   float64
CMVT                 0   0.000000   float64
...                ...        ...       ...
RTN                  0   0.000000   float64
LU                   0   0.000000   float64
EMR                  0   0.000000   float64
PEG                  0   0.000000   float64
BSX                  0   0.000000   float64

[250 rows x 3 columns]

=== Missing Data Summary ===
        Missing Values  Mis

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def impute_returns(df, method='linear', max_gap=100, plot_sample=True):
    """
    Fill missing values by interpolating down each column and print a summary.

    Parameters:
    df: pandas DataFrame to impute; it is copied, not modified
    method: interpolation method passed to DataFrame.interpolate
    max_gap: passed to DataFrame.interpolate as its `limit` argument
    plot_sample: accepted but not used; this function does no plotting

    Returns:
    A new DataFrame holding the interpolated values.
    """
    # Count the gaps before touching anything so the report can compare
    n_missing_before = df.isnull().sum().sum()

    # Interpolate along the rows (axis 0), filling in both directions
    filled = df.copy().interpolate(
        axis=0,
        method=method,
        limit=max_gap,
        limit_direction='both',
    )

    # Diagnostic report
    n_missing_after = filled.isnull().sum().sum()
    n_filled = n_missing_before - n_missing_after
    print(f"\nImputation Summary:")
    print(f"Original missing values: {n_missing_before}")
    print(f"Values imputed: {n_filled}")
    print(f"Remaining missing values: {n_missing_after}")

    return filled

# Impute every period in chronological order, rebinding each name to its imputed matrix.
df00, df05, df10, df15 = [
    impute_returns(frame) for frame in (df00, df05, df10, df15)
]


Imputation Summary:
Original missing values: 41
Values imputed: 41
Remaining missing values: 0

Imputation Summary:
Original missing values: 91
Values imputed: 91
Remaining missing values: 0

Imputation Summary:
Original missing values: 21
Values imputed: 21
Remaining missing values: 0

Imputation Summary:
Original missing values: 150
Values imputed: 150
Remaining missing values: 0


## Splitting into Test Sets
We begin by splitting the data into 20% chunks to get 4 test periods. 

## VAR Model

For purposes of walk-forward testing, we estimate a VAR(1) model on each training set and use its in-sample residuals as training data. We then apply the same fitted parameters to the following period (the test set) and use the resulting out-of-sample residuals for testing. (Open question: would a VARMA specification be more appropriate?)

In [10]:
from statsmodels.tsa.api import VAR
import pandas as pd
import os

# Directory that receives every residual file written by this cell
output_dir = "Final_Data/Walkforward_Sets/aggregate_level"
os.makedirs(output_dir, exist_ok=True)

# Process in-sample residuals for all dataframes, including df15
datasets = {
    "df00": df00,
    "df05": df05,
    "df10": df10,
    "df15": df15
}
dfs = list(datasets.values())
prefixes = list(datasets.keys())


def var1_out_of_sample_residuals(results, training, test):
    """
    Residuals of `test` under a VAR(1) model that was fitted on `training`.

    Parameters:
    results: fitted statsmodels VAR results with one lag and a constant, whose
        `params` has the constant in row 0 and the lag-1 coefficients below it
        (rows = lagged variables, columns = equations)
    training: DataFrame the model was fitted on; its last row is the lag of
        the first test observation
    test: DataFrame for the following period, holding the same columns

    Returns:
    DataFrame shaped like `test` containing test minus the one-step prediction.

    Raises:
    ValueError: if `test` and `training` do not hold the same set of columns.
    """
    if set(test.columns) != set(training.columns):
        raise ValueError(
            "training and test periods must contain the same tickers to apply "
            "the fitted VAR coefficients out of sample"
        )

    # Work in the column order the model was estimated in
    ordered_test = test[training.columns]

    # Prepare test lag by shifting one period; seed the first row with the
    # last training observation
    lagged = ordered_test.shift(1)
    lagged.iloc[0] = training.iloc[-1].to_numpy()

    intercept = results.params.iloc[0].to_numpy()
    # Full coefficient matrix: every equation depends on ALL lagged variables,
    # so the prediction is a matrix product, not an element-wise scaling.
    lag_coefs = results.params.iloc[1:].to_numpy()

    predicted = pd.DataFrame(
        lagged.to_numpy() @ lag_coefs + intercept,
        index=ordered_test.index,
        columns=training.columns,
    )
    # Hand the residuals back in the caller's original column order
    return (ordered_test - predicted)[test.columns]


# Compute and save in-sample residuals for each dataframe; keep each fit so the
# out-of-sample step below does not have to re-estimate the same model.
var_fits = {}
for prefix, df in datasets.items():
    model = VAR(df)
    results = model.fit(maxlags=1)
    var_fits[prefix] = results
    resid_is = results.resid.copy()
    varname_is = f"{prefix}_varresid_is_{stock_sample}"
    # Expose the residuals under a dynamic global name for use further down
    globals()[varname_is] = resid_is
    filepath_is = os.path.join(output_dir, f"{varname_is}.csv")
    resid_is.to_csv(filepath_is)

# Compute and save out-of-sample residuals for each consecutive pair: the model
# fitted on df00/df05/df10 is applied to df05/df10/df15 respectively.
for i in range(len(dfs) - 1):
    training = dfs[i]
    test = dfs[i + 1]
    results = var_fits[prefixes[i]]

    resid_os = var1_out_of_sample_residuals(results, training, test)

    next_prefix = prefixes[i + 1]
    # NOTE(review): unlike the in-sample files, this name carries no stock_sample
    # suffix, so runs with different sample sizes overwrite each other — confirm
    # what downstream readers expect before renaming.
    varname_os = f"{next_prefix}_varresid_os"
    globals()[varname_os] = resid_os
    filepath_os = os.path.join(output_dir, f"{varname_os}.csv")
    resid_os.to_csv(filepath_os)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


## Covariance Matrix Estimation with Ledoit-Wolf

In [11]:
from sklearn.covariance import ledoit_wolf
import numpy as np
import pandas as pd
import os

def cov_to_corr(cov):
    """
    Convert a covariance matrix into a correlation matrix.

    Each entry (i, j) is divided by the product of the square roots of the
    i-th and j-th diagonal entries.
    """
    std = np.sqrt(np.diag(cov))
    # Broadcasting a column against a row yields the same matrix as an outer product
    scale = std[:, np.newaxis] * std[np.newaxis, :]
    return cov / scale

def process_ledoit_wolf_correlations():
    """
    Save a Ledoit-Wolf correlation matrix of the in-sample VAR residuals per period.

    For each period prefix the residual DataFrame is looked up under the global
    name '<prefix>_varresid_is_<stock_sample>', its Ledoit-Wolf covariance is
    converted to a correlation matrix, and the result is written to
    'Final_Data/Walkforward_Sets/aggregate_level/'.

    The correlations are computed on the VAR residuals rather than on the
    imputed return matrices, so that the contents match the
    '*_varresid_is_*_corr_*' file names.

    Raises:
    NameError: if a residual DataFrame has not been created yet.
    """
    names = ['df00', 'df05', 'df10', 'df15']

    for name in names:
        resid_name = f"{name}_varresid_is_{stock_sample}"
        try:
            resid = globals()[resid_name]
        except KeyError:
            raise NameError(
                f"{resid_name} is not defined; compute the in-sample VAR "
                "residuals before estimating correlations"
            ) from None

        X = resid.values

        # Compute Ledoit-Wolf covariance and correlation (shrinkage value is not needed)
        cov, _ = ledoit_wolf(X)
        corr = cov_to_corr(cov)

        # Save correlation matrix
        corr_filename = f"Final_Data/Walkforward_Sets/aggregate_level/{name}_varresid_is_{stock_sample}_corr_{stock_sample}.csv"
        pd.DataFrame(corr, index=resid.columns, columns=resid.columns).to_csv(corr_filename)

process_ledoit_wolf_correlations()