Dangers of Walk-Forward Out-of-Sample (OOS) Test Results When There Is Data Leakage

In [None]:
import math
import numpy as np
import pandas as pd

# ------------------------------
# Standard normal CDF (full floating-point accuracy via math.erfc)
# ------------------------------
def normal_cdf(z):
    """
    Return the standard normal cumulative distribution P(Z <= z), Z ~ N(0, 1).

    Uses the identity  Phi(z) = 0.5 * erfc(-z / sqrt(2)).  The complementary
    error function from the standard library is used instead of a polynomial
    approximation, so the result is accurate to floating-point precision.

    Parameters
    ----------
    z : float
        Point at which the CDF is evaluated (a scalar).

    Returns
    -------
    float
        Probability in [0, 1].
    """
    return 0.5 * math.erfc(-z / math.sqrt(2.0))

# ------------------------------
# Generate a random walk of log prices using NumPy.
# ------------------------------
def generate_random_walk(nprices, rng=None):
    """
    Generate a random walk of log prices.

    x[0] = 0, then for each step:
      x[i] = x[i-1] + (u1 + u2 - u3 - u4)
    where u1..u4 are independent uniform draws on [0, 1).

    Parameters
    ----------
    nprices : int
        Number of prices to generate (must be at least 1).
    rng : optional
        Object exposing ``uniform(low, high, size)`` (for example a
        ``numpy.random.Generator`` or ``numpy.random.RandomState``).
        When omitted, NumPy's global ``np.random`` state is used, so
        ``np.random.seed`` still controls the result.

    Returns
    -------
    numpy.ndarray
        Array of length ``nprices`` whose first element is 0.0.

    Raises
    ------
    ValueError
        If ``nprices`` is less than 1.
    """
    if nprices < 1:
        raise ValueError("nprices must be at least 1.")

    source = np.random if rng is None else rng
    # (nprices-1) x 4 uniform random numbers in [0,1): four draws per step
    increments = source.uniform(0, 1, (nprices - 1, 4))
    deltas = increments[:, 0] + increments[:, 1] - increments[:, 2] - increments[:, 3]

    x = np.empty(nprices)
    x[0] = 0.0
    x[1:] = np.cumsum(deltas)
    return x

# ------------------------------
# Compute dataset: indicator and target
#
# For each valid time index i, compute:
#   - indicator: the weighted linear slope over the window [i-lookback+1, i]
#   - target: price change from x[i] to x[i+lookahead]
# ------------------------------
def compute_dataset(x, lookback, lookahead):
    """
    Build the indicator/target dataset from a price series.

    For every index i from (lookback - 1) to (len(x) - lookahead - 1):
      - indicator: dot product of the window x[i-lookback+1 .. i] with
        weights linearly spaced from -1 to 1, divided by the sum of the
        squared weights;
      - target: x[i + lookahead] - x[i].

    Parameters
    ----------
    x : array-like
        One-dimensional series of (log) prices.
    lookback : int
        Window length for the indicator (must be at least 1).
    lookahead : int
        Number of bars into the future for the target (must be >= 0).

    Returns
    -------
    pandas.DataFrame
        Columns 'indicator' and 'target', one row per valid index.  The
        frame is empty (with float columns) when the series is shorter
        than lookback + lookahead.

    Raises
    ------
    ValueError
        If ``lookback < 1`` or ``lookahead < 0``.
    """
    if lookback < 1:
        raise ValueError("lookback must be at least 1.")
    if lookahead < 0:
        raise ValueError("lookahead must be non-negative.")

    prices = np.asarray(x, dtype=float)
    n = len(prices)
    # Number of indices i with a full lookback window and a lookahead target.
    ncases = n - lookback - lookahead + 1
    if ncases <= 0:
        return pd.DataFrame({'indicator': np.empty(0), 'target': np.empty(0)})

    # Weights linearly spaced between -1 and 1
    weights = np.linspace(-1, 1, lookback)
    denom = np.sum(weights ** 2)

    # 'valid' correlation element k is sum_j prices[k + j] * weights[j], i.e.
    # the weighted window ending at index k + lookback - 1.  Only the first
    # ncases windows also have a target available.
    indicators = np.correlate(prices, weights, mode='valid')[:ncases] / denom

    first = lookback - 1
    targets = (prices[first + lookahead: first + lookahead + ncases]
               - prices[first: first + ncases])

    return pd.DataFrame({'indicator': indicators, 'target': targets})

# ------------------------------
# Compute regression parameters (beta and constant)
# for a simple linear regression: target ~ indicator.
# ------------------------------
def regression_params(df):
    """
    Fit the least-squares line  target = beta * indicator + constant.

    `df` must provide the columns 'indicator' and 'target'.  Returns the
    pair (beta, constant).
    """
    indicator = df['indicator'].values
    target = df['target'].values

    ind_center = np.mean(indicator)
    targ_center = np.mean(target)

    ind_dev = indicator - ind_center
    cross = np.sum(ind_dev * (target - targ_center))
    spread = np.sum(ind_dev ** 2)

    # The tiny additive term keeps the division defined when the indicator
    # has zero spread; beta then comes out as 0.
    beta = cross / (spread + 1e-60)
    return beta, targ_center - beta * ind_center

# ------------------------------
# Walkforward evaluation using pandas and numpy.
# ------------------------------
def walkforward_evaluation(df, ntrain, ntest, omit, extra):
    """
    Walk forward through the dataset in sequential training/test folds.

    Each fold trains on the `ntrain` cases starting at the fold origin
    (minus the last `omit` of them), fits target ~ indicator with
    `regression_params`, and scores the next `ntest` cases (fewer at the end
    of the data).  A test case contributes its target when the prediction is
    positive and the negated target otherwise.  The fold origin then advances
    by the number of cases just tested plus `extra`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'indicator' and 'target' columns.
    ntrain : int
        Number of cases in each training fold (>= 1).
    ntest : int
        Number of cases in each test fold (>= 1).
    omit : int
        Number of cases dropped from the end of each training fold (>= 0).
        If `omit >= ntrain` nothing would be left to train on; in that case
        the full training fold is used and a RuntimeWarning is issued.
    extra : int
        Extra cases skipped between folds (>= 0).

    Returns
    -------
    numpy.ndarray
        Out-of-sample outcomes in time order (empty if no fold fits).

    Raises
    ------
    ValueError
        If ntrain < 1, ntest < 1, omit < 0 or extra < 0.  Without this check
        a non-positive step (ntest + extra <= 0) would never terminate, and a
        negative `omit` would pull test cases into the training fold.
    """
    import warnings

    if ntrain < 1 or ntest < 1 or omit < 0 or extra < 0:
        raise ValueError(
            "Invalid fold parameters: require ntrain >= 1, ntest >= 1, "
            "omit >= 0 and extra >= 0.")

    if omit >= ntrain:
        # Honouring the request would leave an empty training set; keep the
        # full-fold fallback but make it visible to the caller.
        warnings.warn(
            "omit >= ntrain leaves no training cases; omit is being ignored "
            "and the full training fold is used.",
            RuntimeWarning, stacklevel=2)
        ntrain_used = ntrain
    else:
        ntrain_used = ntrain - omit

    # Pull the columns out once so each fold works on plain array slices
    # instead of copying a DataFrame.
    indicator = df['indicator'].values
    target = df['target'].values
    ncases = len(df)

    folds = []
    trn_ptr = 0
    while trn_ptr + ntrain < ncases:
        test_ptr = trn_ptr + ntrain
        beta, constant = regression_params(df.iloc[trn_ptr: trn_ptr + ntrain_used])

        # The last fold may be truncated by the end of the data.
        test_end = min(test_ptr + ntest, ncases)
        pred = beta * indicator[test_ptr:test_end] + constant
        fold_target = target[test_ptr:test_end]
        folds.append(np.where(pred > 0, fold_target, -fold_target))

        # Advance by the number of cases actually tested, plus the extra gap.
        trn_ptr += (test_end - test_ptr) + extra

    return np.concatenate(folds) if folds else np.array([])


# Parameters:
#   nprices   : Total number of prices (bars in history)
#   lookback  : Window length for indicator computation
#   lookahead : Bars into future for target computation
#   ntrain    : Number of cases in training set
#   ntest     : Number of cases in test set
#   omit      : Number of cases to omit from end of training set
#   extra     : Extra bars jumped for the next fold
#   nreps     : Number of replications (forced to be odd)
#
# Returns a dictionary containing:
#   - n_OOS    : Last count of out-of-sample cases
#   - median_t : Median t-score across replications
#   - fraction : Fraction of replications with p-value <= 0.1
#   - t_scores : Array of t-scores from each replication
# ------------------------------
def run_overlap(nprices, lookback, lookahead, ntrain, ntest, omit, extra, nreps,
                verbose=True, seed=None):
    """
    Monte Carlo study of walkforward out-of-sample results on random walks.

    Each replication generates a random walk, builds the indicator/target
    dataset, runs the walkforward evaluation, and converts the OOS outcomes
    into a t-score (sqrt(n) * mean / population std) with a right-tail
    p-value from the normal CDF.

    Parameters
    ----------
    nprices : int
        Total number of prices (bars in history).
    lookback : int
        Window length for indicator computation (>= 2).
    lookahead : int
        Bars into the future for target computation (>= 1).
    ntrain : int
        Number of cases in each training set (>= 2).
    ntest : int
        Number of cases in each test set (>= 1).
    omit : int
        Number of cases to omit from the end of each training set (>= 0).
    extra : int
        Extra bars jumped for the next fold (>= 0).
    nreps : int
        Number of replications (>= 0); forced to be odd, so an even value
        n runs n + 1 replications.
    verbose : bool, optional
        When True (default) print the parameter header, one line per
        replication and the summary line.  When False nothing is printed;
        the results are still returned.
    seed : int, optional
        If given, passed to ``np.random.seed`` before the first replication
        so the whole run is reproducible.  This reseeds NumPy's global
        random state.

    Returns
    -------
    dict
        'n_OOS'    : OOS case count of the last replication
        'median_t' : median t-score across replications
        'fraction' : fraction of replications with p-value <= 0.1
        't_scores' : numpy array of the t-score of each replication

    Raises
    ------
    ValueError
        If the parameters are invalid, or a replication yields no OOS cases.
    """
    if nreps < 0:
        # A negative count would run zero replications and report a NaN
        # median and a negative fraction.
        raise ValueError("Invalid parameters. nreps must be non-negative.")

    # Ensure nreps is odd
    nreps = (nreps // 2) * 2 + 1

    # Check validity of parameters
    if (nprices < lookback + lookahead + ntrain + ntest + 10 or
        nprices < 2 or lookback < 2 or lookahead < 1 or
        ntrain < 2 or ntest < 1 or omit < 0 or extra < 0):
        raise ValueError("Invalid parameters. Ensure nprices >= lookback + lookahead + ntrain + ntest + 10 and other parameters are valid.")

    if seed is not None:
        np.random.seed(seed)

    if verbose:
        print(f"\nnprices={nprices}  lookback={lookback}  lookahead={lookahead}  ntrain={ntrain}  ntest={ntest}  omit={omit}  extra={extra}")

    t_scores = []
    p1_count = 0
    last_oos_count = None

    for irep in range(nreps):
        # Generate random walk of log prices.
        x = generate_random_walk(nprices)
        # Build the dataset (indicator and target).
        df = compute_dataset(x, lookback, lookahead)
        # Evaluate out-of-sample (OOS) outcomes.
        oos = walkforward_evaluation(df, ntrain, ntest, omit, extra)
        if len(oos) == 0:
            raise ValueError("No out-of-sample cases computed!")

        oos_mean = np.mean(oos)
        oos_std = np.std(oos, ddof=0)
        # Floor the spread so a constant OOS series does not divide by zero.
        if oos_std < 1e-20:
            oos_std = 1e-20
        n_OOS = len(oos)
        t_score = math.sqrt(n_OOS) * oos_mean / oos_std
        rtail = 1.0 - normal_cdf(t_score)
        if verbose:
            print(f"\nIteration {irep+1}: Mean = {oos_mean:.4f}  StdDev = {oos_std:.4f}  t = {t_score:.4f}  p = {rtail:.4f}")
        t_scores.append(t_score)
        if rtail <= 0.1:
            p1_count += 1
        last_oos_count = n_OOS

    t_scores = np.array(t_scores)
    median_t = np.median(t_scores)
    fraction = p1_count / nreps
    if verbose:
        print(f"\nn OOS = {last_oos_count}  Median t = {median_t:.4f}  Fraction with p<= 0.1 = {fraction:.3f}")

    # Return the key results as a dictionary.
    return {"n_OOS": last_oos_count, "median_t": median_t, "fraction": fraction, "t_scores": t_scores}

# Example usage in a notebook cell:
# results = run_overlap(nprices=100000, lookback=1000, lookahead=1, ntrain=100, ntest=1, omit=0, extra=0, nreps=51)
# print(results)


Optimistic Bias from In-Sample/Out-of-Sample (IS/OOS) Overlap with a Large Test Set

In [32]:
# Large test folds (ntest = ntrain = 50) with nothing omitted from the
# training set and no gap between folds.
run_overlap(
    nprices=10000,
    lookback=20,
    lookahead=5,
    ntrain=50,
    ntest=50,
    omit=0,
    extra=0,
    nreps=1001,
)


nprices=10000  lookback=20  lookahead=5  ntrain=50  ntest=50  omit=0  extra=0

Iteration 1: Mean = -0.0010  StdDev = 1.2617  t = -0.0762  p = 0.5304

Iteration 2: Mean = 0.0193  StdDev = 1.2963  t = 1.4838  p = 0.0689

Iteration 3: Mean = 0.0264  StdDev = 1.2518  t = 2.0990  p = 0.0179

Iteration 4: Mean = 0.0130  StdDev = 1.2844  t = 1.0088  p = 0.1565

Iteration 5: Mean = 0.0529  StdDev = 1.3103  t = 4.0190  p = 0.0000

Iteration 6: Mean = 0.0016  StdDev = 1.2848  t = 0.1258  p = 0.4499

Iteration 7: Mean = 0.0001  StdDev = 1.2869  t = 0.0115  p = 0.4954

Iteration 8: Mean = 0.0006  StdDev = 1.2967  t = 0.0430  p = 0.4829

Iteration 9: Mean = -0.0238  StdDev = 1.2727  t = -1.8621  p = 0.9687

Iteration 10: Mean = 0.0139  StdDev = 1.2876  t = 1.0794  p = 0.1402

Iteration 11: Mean = 0.0570  StdDev = 1.3059  t = 4.3511  p = 0.0000

Iteration 12: Mean = -0.0082  StdDev = 1.2667  t = -0.6464  p = 0.7410

Iteration 13: Mean = 0.0303  StdDev = 1.3167  t = 2.2919  p = 0.0110

Iteration 14:

{'n_OOS': 9926,
 'median_t': 0.5733346175010996,
 'fraction': 0.3596403596403596,
 't_scores': array([-0.07620695,  1.4837993 ,  2.0990322 , ...,  4.58394381,
        -2.2164894 ,  0.85669788])}