In [9]:
"""
Unbiased Advanced Pattern Evolution Predictor
==============================================
Rigorous, statistically-validated pattern detection with minimal bias.
All thresholds are data-driven, not hardcoded.
"""

import numpy as np
import pandas as pd
from scipy import stats, signal, optimize
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from typing import Dict, List, Tuple, Optional, Any
import warnings
warnings.filterwarnings('ignore')

class UnbiasedPatternPredictor:
    """
    Statistically rigorous pattern predictor with minimal bias.
    All parameters are discovered from data, not assumed.
    """

    def __init__(self,
                 confidence_level: float = 0.95,
                 min_sample_size: int = 100,
                 bootstrap_iterations: int = 1000):
        """
        Initialize with statistical parameters only.

        Args:
            confidence_level: Statistical confidence level for tests
            min_sample_size: Minimum data points for valid analysis
            bootstrap_iterations: Number of bootstrap samples for confidence intervals
        """
        # Statistical parameters
        self.confidence_level = confidence_level
        self.alpha = 1 - confidence_level
        self.min_sample_size = min_sample_size
        self.bootstrap_iterations = bootstrap_iterations

        # Discovered parameters (will be populated from data)
        self.discovered_constants = {}
        self.discovered_windows = []
        self.discovered_lags = []
        self.discovered_attractors = {}
        self.empirical_thresholds = {}

        # Known mathematical constants for reference (not assumed to appear)
        self.MATHEMATICAL_CONSTANTS = {
            'phi': 1.618033988749895,
            'inv_phi': 0.618033988749895,
            'inv_phi_squared': 0.381966011250105,
            'pi': 3.141592653589793,
            'e': 2.718281828459045,
            'sqrt_2': 1.414213562373095,
            'sqrt_3': 1.732050807568877,
            'feigenbaum': 4.669201609102990,
            'fine_structure': 137.035999084,
            'inv_fine_structure': 0.007297352566
        }

        # Phase states (discovered empirically)
        self.phases = {}

    def discover_parameters(self, data: pd.DataFrame) -> Dict:
        """
        Discover all parameters from the data without assumptions.
        """
        print("Discovering parameters from data...")

        # Ensure we have required columns
        if 'Close' not in data.columns:
            raise ValueError("Data must contain 'Close' column")

        # Calculate returns if not present
        if 'returns' not in data.columns:
            data['returns'] = data['Close'].pct_change()

        discoveries = {
            'windows': self._discover_optimal_windows(data),
            'lags': self._discover_significant_lags(data),
            'attractors': self._discover_attractors(data),
            'thresholds': self._discover_thresholds(data),
            'constants': self._discover_constants(data),
            'phase_boundaries': self._discover_phase_boundaries(data)
        }

        # Store discoveries
        self.discovered_windows = discoveries['windows']['significant_windows']
        self.discovered_lags = discoveries['lags']['significant_lags']
        self.discovered_attractors = discoveries['attractors']['attractors']
        self.empirical_thresholds = discoveries['thresholds']
        self.discovered_constants = discoveries['constants']

        return discoveries

    def _discover_optimal_windows(self, data: pd.DataFrame) -> Dict:
        """
        Discover which window sizes show statistically significant patterns.
        No assumption about 729, 137, etc.
        """
        prices = data['Close'].values
        n = len(prices)

        # Test a wide range of windows
        min_window = max(10, n // 100)
        max_window = min(n // 2, 1000)
        test_windows = np.unique(np.logspace(np.log10(min_window),
                                            np.log10(max_window),
                                            50).astype(int))

        window_scores = {}
        window_pvalues = {}

        for window in test_windows:
            if window >= n:
                continue

            # Calculate pattern strength for this window
            densities = []
            for i in range(0, n - window, window // 4):
                segment = prices[i:i+window]
                if len(segment) > 1:
                    binary = (np.diff(segment) > 0).astype(int)
                    densities.append(np.mean(binary))

            if len(densities) < 3:
                continue

            # Test for non-randomness using multiple tests
            densities = np.array(densities)

            # 1. Runs test for randomness
            median_density = np.median(densities)
            runs = self._count_runs(densities > median_density)
            expected_runs = (2 * np.sum(densities > median_density) *
                           np.sum(densities <= median_density) / len(densities)) + 1

            if expected_runs > 0:
                z_score = (runs - expected_runs) / np.sqrt(expected_runs)
                p_value_runs = 2 * (1 - stats.norm.cdf(abs(z_score)))
            else:
                p_value_runs = 1.0

            # 2. Kolmogorov-Smirnov test against uniform
            _, p_value_ks = stats.kstest(densities, 'uniform', args=(0, 1))

            # Combine p-values using Fisher's method (excluding AD test)
            combined_stat = -2 * (np.log(p_value_runs + 1e-10) +
                                 np.log(p_value_ks + 1e-10))
            # Degrees of freedom reduced from 6 to 4 (removed AD test)
            combined_pvalue = 1 - stats.chi2.cdf(combined_stat, df=4)

            window_scores[window] = 1 - combined_pvalue  # Higher score = more significant
            window_pvalues[window] = combined_pvalue

        # Apply multiple testing correction
        from statsmodels.stats.multitest import multipletests
        windows = list(window_pvalues.keys())
        pvalues = list(window_pvalues.values())

        significant_windows = []
        corrected_pvalues = {}

        # Add check for sufficient p-values
        if len(pvalues) >= 2: # fdr_bh requires at least 2 p-values
            rejected, corrected_pvalues_array, _, _ = multipletests(pvalues,
                                                             alpha=self.alpha,
                                                             method='fdr_bh')
            significant_windows = [w for w, r in zip(windows, rejected) if r.any()]
            corrected_pvalues = dict(zip(windows, corrected_pvalues_array))


        # Special test for 729 if it's in range
        special_729_result = None
        if 729 < n:
            special_729_result = self._test_specific_window(data, 729)


        return {
            'significant_windows': significant_windows,
            'window_scores': window_scores,
            'corrected_pvalues': corrected_pvalues,
            'special_729': special_729_result,
            'best_window': max(window_scores.items(), key=lambda x: x[1])[0] if window_scores else None
        }

    def _discover_significant_lags(self, data: pd.DataFrame) -> Dict:
        """
        Discover autocorrelation structure without assuming specific lags.
        """
        returns = data['returns'].dropna().values

        if len(returns) < self.min_sample_size:
            return {'significant_lags': [], 'acf_values': {}}

        # Calculate autocorrelation for many lags
        max_lag = min(len(returns) // 4, 100)

        from statsmodels.tsa.stattools import acf, pacf
        acf_values, acf_confint = acf(returns, nlags=max_lag, alpha=self.alpha)
        pacf_values, pacf_confint = pacf(returns, nlags=max_lag, alpha=self.alpha)

        # Find significant lags (outside confidence intervals)
        significant_lags = []
        lag_strengths = {}

        for lag in range(1, len(acf_values)):
            # Check if significantly different from zero
            lower = acf_confint[lag, 0]
            upper = acf_confint[lag, 1]

            if acf_values[lag] < lower or acf_values[lag] > upper:
                significant_lags.append(lag)
                lag_strengths[lag] = abs(acf_values[lag])

        # Look for lag differences (like 45-42=3 from original)
        lag_differences = {}
        for i, lag1 in enumerate(significant_lags):
            for lag2 in significant_lags[i+1:]:
                diff = lag2 - lag1
                if diff not in lag_differences:
                    lag_differences[diff] = []
                lag_differences[diff].append((lag1, lag2))

        return {
            'significant_lags': significant_lags,
            'acf_values': dict(enumerate(acf_values)),
            'pacf_values': dict(enumerate(pacf_values)),
            'lag_strengths': lag_strengths,
            'lag_differences': lag_differences,
            'strongest_lag': max(lag_strengths.items(), key=lambda x: x[1])[0] if lag_strengths else None
        }

    def _discover_attractors(self, data: pd.DataFrame) -> Dict:
        """
        Discover natural clustering points in the data.
        """
        prices = data['Close'].values

        # Calculate various ratios
        ratios = []

        # Price ratios
        for shift in [1, 5, 20]:
            if len(prices) > shift:
                ratio = prices[shift:] / prices[:-shift]
                ratios.extend(ratio[np.isfinite(ratio)])

        # Convert to numpy array
        ratios = np.array(ratios)
        ratios = ratios[(ratios > 0.1) & (ratios < 10)]  # Remove extreme outliers

        if len(ratios) < self.min_sample_size:
            return {'attractors': {}, 'cluster_centers': []}

        # Use Gaussian Mixture Model to find natural clusters
        from sklearn.mixture import GaussianMixture

        # Determine optimal number of clusters using BIC
        n_components_range = range(1, min(10, len(ratios) // 20))
        bic_scores = []

        for n_components in n_components_range:
            gmm = GaussianMixture(n_components=n_components, random_state=42)
            gmm.fit(ratios.reshape(-1, 1))
            bic_scores.append(gmm.bic(ratios.reshape(-1, 1)))

        # Find optimal number of clusters
        optimal_n = n_components_range[np.argmin(bic_scores)]

        # Fit final model
        gmm = GaussianMixture(n_components=optimal_n, random_state=42)
        gmm.fit(ratios.reshape(-1, 1))

        cluster_centers = gmm.means_.flatten()

        # Check which mathematical constants are near cluster centers
        discovered_attractors = {}
        for center in cluster_centers:
            # Check proximity to known constants
            for name, value in self.MATHEMATICAL_CONSTANTS.items():
                if abs(center - value) < 0.1:  # Within 10%
                    discovered_attractors[f"near_{name}"] = center
                    break
            else:
                # Not near any known constant
                discovered_attractors[f"empirical_{center:.3f}"] = center

        return {
            'attractors': discovered_attractors,
            'cluster_centers': cluster_centers.tolist(),
            'n_clusters': optimal_n,
            'bic_score': min(bic_scores) if bic_scores else None
        }

    def _discover_thresholds(self, data: pd.DataFrame) -> Dict:
        """
        Discover empirical thresholds from data distribution.
        """
        returns = data['returns'].dropna().values

        if len(returns) < self.min_sample_size:
            return {}

        # Calculate percentiles for various confidence levels
        thresholds = {
            'extreme_positive': np.percentile(returns, 99),
            'high_positive': np.percentile(returns, 95),
            'moderate_positive': np.percentile(returns, 75),
            'neutral_high': np.percentile(returns, 55),
            'neutral_low': np.percentile(returns, 45),
            'moderate_negative': np.percentile(returns, 25),
            'high_negative': np.percentile(returns, 5),
            'extreme_negative': np.percentile(returns, 1),
            'volatility_threshold': np.std(returns),
            'mean_return': np.mean(returns)
        }

        return thresholds

    def _discover_constants(self, data: pd.DataFrame) -> Dict:
        """
        Test which mathematical constants appear more than random chance.
        """
        prices = data['Close'].values

        # Calculate various metrics
        metrics = []

        # Density at different windows
        for window in [20, 50, 100, 200]:
            if len(prices) > window:
                binary = (np.diff(prices[-window:]) > 0).astype(int)
                metrics.append(np.mean(binary))

        # Volatility ratios
        for period1, period2 in [(20, 50), (50, 100), (20, 100)]:
            if len(prices) > period2:
                vol1 = np.std(prices[-period1:])
                vol2 = np.std(prices[-period2:])
                if vol2 > 0:
                    metrics.append(vol1 / vol2)

        # Mean ratios
        if len(prices) > 100:
            metrics.append(np.mean(prices[-50:]) / np.mean(prices[-100:]))

        metrics = np.array(metrics)
        metrics = metrics[np.isfinite(metrics)]

        if len(metrics) < 5:
            return {}

        # Test proximity to mathematical constants
        constant_distances = {}
        for name, value in self.MATHEMATICAL_CONSTANTS.items():
            distances = np.abs(metrics - value)
            min_distance = np.min(distances)

            # Bootstrap test for significance
            bootstrap_distances = []
            for _ in range(self.bootstrap_iterations):
                random_metrics = np.random.uniform(np.min(metrics), np.max(metrics), len(metrics)) # Sample from metric range
                bootstrap_distances.append(np.min(np.abs(random_metrics - value)))

            # Calculate p-value
            p_value = np.mean(np.array(bootstrap_distances) <= min_distance)

            constant_distances[name] = {
                'min_distance': min_distance,
                'p_value': p_value,
                'is_significant': p_value < self.alpha
            }

        return constant_distances

    def _discover_phase_boundaries(self, data: pd.DataFrame) -> Dict:
        """
        Discover natural phase transitions in the data.
        """
        returns = data['returns'].dropna().values

        if len(returns) < self.min_sample_size:
            return {}

        # Use change point detection
        from scipy.signal import find_peaks

        # Calculate rolling statistics
        window = 20
        if len(returns) > window:
            rolling_mean = pd.Series(returns).rolling(window).mean().values
            rolling_std = pd.Series(returns).rolling(window).std().values
            rolling_skew = pd.Series(returns).rolling(window).skew().values

            # Combine into phase indicator
            phase_indicator = np.nanmean([
                np.abs(np.gradient(rolling_mean)),
                np.abs(np.gradient(rolling_std)),
                np.abs(np.gradient(rolling_skew))
            ], axis=0)

            # Find peaks in phase indicator (phase transitions)
            peaks, properties = find_peaks(phase_indicator[~np.isnan(phase_indicator)],
                                         prominence=np.nanstd(phase_indicator))

            # Classify phases between transitions
            n_phases = len(peaks) + 1
            phase_boundaries = peaks.tolist() if len(peaks) > 0 else []

        else:
            phase_boundaries = []
            n_phases = 1

        return {
            'phase_boundaries': phase_boundaries,
            'n_phases': n_phases,
            'phase_duration_mean': len(returns) / (n_phases + 1)
        }

    def _test_specific_window(self, data: pd.DataFrame, window: int) -> Dict:
        """
        Special test for specific window (like 729).
        """
        prices = data['Close'].values

        if len(prices) < window:
            return None

        # Calculate densities
        densities = []
        for i in range(0, len(prices) - window, window // 4):
            segment = prices[i:i+window]
            binary = (np.diff(segment) > 0).astype(int)
            densities.append(np.mean(binary))

        if len(densities) < 3:
            return None

        densities = np.array(densities)

        # Test convergence to specific values
        results = {}
        for name, value in self.MATHEMATICAL_CONSTANTS.items():
            if 0 <= value <= 1:  # Only test density-compatible values
                distances = np.abs(densities - value)

                # Bootstrap test
                bootstrap_means = []
                for _ in range(self.bootstrap_iterations):
                    random_densities = np.random.uniform(0, 1, len(densities))
                    bootstrap_means.append(np.mean(np.abs(random_densities - value)))

                actual_mean = np.mean(distances)
                p_value = np.mean(np.array(bootstrap_means) <= actual_mean)

                results[name] = {
                    'mean_distance': actual_mean,
                    'p_value': p_value,
                    'converges': p_value < self.alpha
                }

        return results

    def analyze_with_discoveries(self, data: pd.DataFrame,
                                discoveries: Optional[Dict] = None) -> Dict:
        """
        Analyze data using discovered parameters.
        """
        if discoveries is None:
            discoveries = self.discover_parameters(data)

        results = {}

        # 1. Window-based analysis
        if self.discovered_windows:
            results['window_analysis'] = self._analyze_discovered_windows(data)

        # 2. Lag-based predictions
        if self.discovered_lags:
            results['lag_predictions'] = self._generate_lag_predictions(data)

        # 3. Attractor dynamics
        if self.discovered_attractors:
            results['attractor_state'] = self._analyze_attractor_state(data)

        # 4. Phase analysis
        results['phase_analysis'] = self._analyze_phases(data)

        # 5. Statistical validation
        results['validation'] = self._validate_patterns(data)

        return results

    def _analyze_discovered_windows(self, data: pd.DataFrame) -> Dict:
        """
        Analyze patterns using discovered optimal windows.
        """
        prices = data['Close'].values
        window_results = {}

        for window in self.discovered_windows[:5]:  # Top 5 windows
            if window >= len(prices):
                continue

            # Calculate pattern strength
            segment = prices[-window:]
            binary = (np.diff(segment) > 0).astype(int)
            density = np.mean(binary)

            # Calculate trend
            x = np.arange(len(segment))
            slope, intercept = np.polyfit(x, segment, 1)

            window_results[window] = {
                'density': density,
                'trend': slope,
                'volatility': np.std(segment) / np.mean(segment)
            }

        return window_results

    def _generate_lag_predictions(self, data: pd.DataFrame) -> Dict:
        """
        Generate predictions using discovered lag structure.
        """
        returns = data['returns'].dropna().values

        if len(returns) < max(self.discovered_lags + [0]) + 10:
            return {}

        # Use Random Forest with discovered lags as features
        features = []
        for i in range(max(self.discovered_lags), len(returns)):
            feature_vector = [returns[i - lag] for lag in self.discovered_lags]
            features.append(feature_vector)

        features = np.array(features)
        targets = returns[max(self.discovered_lags):]

        # Time series cross-validation
        tscv = TimeSeriesSplit(n_splits=5)
        model = RandomForestRegressor(n_estimators=100, random_state=42)

        scores = cross_val_score(model, features[:-1], targets[1:],
                               cv=tscv, scoring='neg_mean_squared_error')

        # Fit on all data for final prediction
        model.fit(features[:-1], targets[1:])
        next_features = features[-1].reshape(1, -1)
        prediction = model.predict(next_features)[0]

        # Calculate confidence interval using random forest's predictions
        tree_predictions = np.array([tree.predict(next_features)[0]
                                    for tree in model.estimators_])
        confidence_interval = np.percentile(tree_predictions, [2.5, 97.5])

        return {
            'prediction': prediction,
            'confidence_interval': confidence_interval.tolist(),
            'cv_score': -np.mean(scores),
            'feature_importance': dict(zip(self.discovered_lags,
                                          model.feature_importances_))
        }

    def _analyze_attractor_state(self, data: pd.DataFrame) -> Dict:
        """
        Determine current position relative to discovered attractors.
        """
        prices = data['Close'].values

        if len(prices) < 20:
            return {}

        # Calculate current ratio
        current_ratio = prices[-1] / prices[-20] if prices[-20] != 0 else 1

        # Find nearest attractor
        distances = {}
        for name, value in self.discovered_attractors.items():
            distances[name] = abs(current_ratio - value)

        if distances:
            nearest = min(distances.items(), key=lambda x: x[1])

            # Calculate approach velocity
            if len(prices) > 40:
                prev_ratio = prices[-20] / prices[-40] if prices[-40] != 0 else 1
                velocity = (current_ratio - prev_ratio) / 20
            else:
                velocity = 0

            return {
                'nearest_attractor': nearest[0],
                'distance': nearest[1],
                'current_ratio': current_ratio,
                'approach_velocity': velocity,
                'converging': velocity * nearest[1] < 0  # Negative = approaching
            }

        return {}

    def _analyze_phases(self, data: pd.DataFrame) -> Dict:
        """
        Determine current phase using discovered boundaries.
        """
        returns = data['returns'].dropna().values

        if len(returns) < 50:
            return {'phase': 'insufficient_data'}

        # Calculate current volatility regime
        recent_vol = np.std(returns[-20:])
        historical_vol = np.std(returns)
        vol_ratio = recent_vol / historical_vol if historical_vol > 0 else 1

        # Classify phase based on volatility ratio
        if vol_ratio < 0.5:
            phase = 'low_volatility'
            confidence = 0.8
        elif vol_ratio < 0.8:
            phase = 'decreasing_volatility'
            confidence = 0.6
        elif vol_ratio < 1.2:
            phase = 'normal_volatility'
            confidence = 0.5
        elif vol_ratio < 1.5:
            phase = 'increasing_volatility'
            confidence = 0.6
        else:
            phase = 'high_volatility'
            confidence = 0.8

        return {
            'phase': phase,
            'volatility_ratio': vol_ratio,
            'confidence': confidence
        }

    def _validate_patterns(self, data: pd.DataFrame) -> Dict:
        """
        Validate discovered patterns using out-of-sample testing.
        """
        prices = data['Close'].values

        if len(prices) < 200:
            return {'validated': False, 'reason': 'insufficient_data'}

        # Split data
        split_point = len(prices) * 3 // 4
        train_data = prices[:split_point]
        test_data = prices[split_point:]

        # Test if patterns hold out-of-sample
        train_stats = {
            'mean': np.mean(train_data),
            'std': np.std(train_data),
            'skew': stats.skew(train_data),
            'kurtosis': stats.kurtosis(train_data)
        }

        test_stats = {
            'mean': np.mean(test_data),
            'std': np.std(test_data),
            'skew': stats.skew(test_data),
            'kurtosis': stats.kurtosis(test_data)
        }

        # Calculate similarity
        similarity = 1 - np.mean([
            abs(train_stats['mean'] - test_stats['mean']) / (abs(train_stats['mean']) + 1e-10),
            abs(train_stats['std'] - test_stats['std']) / (train_stats['std'] + 1e-10),
            abs(train_stats['skew'] - test_stats['skew']) / (abs(train_stats['skew']) + 1e-10),
            abs(train_stats['kurtosis'] - test_stats['kurtosis']) / (abs(train_stats['kurtosis']) + 1e-10)
        ])

        return {
            'validated': similarity > 0.7,
            'similarity_score': similarity,
            'train_stats': train_stats,
            'test_stats': test_stats
        }

    def generate_trading_signals(self, data: pd.DataFrame,
                                analysis: Optional[Dict] = None) -> Dict:
        """
        Generate trading signals based on analysis.
        """
        if analysis is None:
            discoveries = self.discover_parameters(data)
            analysis = self.analyze_with_discoveries(data, discoveries)

        signals = []
        confidences = []

        # Combine multiple signal sources
        signal_components = []

        # 1. Lag predictions
        if 'lag_predictions' in analysis and analysis['lag_predictions']:
            pred = analysis['lag_predictions']['prediction']
            if pred > self.empirical_thresholds.get('moderate_positive', 0.01):
                signal_components.append((1, 0.6))
            elif pred < self.empirical_thresholds.get('moderate_negative', -0.01):
                signal_components.append((-1, 0.6))
            else:
                signal_components.append((0, 0.3))

        # 2. Attractor state
        if 'attractor_state' in analysis and analysis['attractor_state']:
            state = analysis['attractor_state']
            if state.get('converging') and state.get('distance', 1) < 0.1:
                # Near attractor and converging
                if 'near_unity' in state.get('nearest_attractor', ''):
                    # Mean reversion
                    current = state.get('current_ratio', 1)
                    signal_components.append((np.sign(1 - current), 0.5))
                else:
                    signal_components.append((0, 0.2))

        # 3. Phase analysis
        if 'phase_analysis' in analysis and analysis['phase_analysis']:
            phase = analysis['phase_analysis']['phase']
            conf = analysis['phase_analysis']['confidence']

            if phase == 'low_volatility':
                signal_components.append((0, conf * 0.5))  # Reduce position in low vol
            elif phase == 'high_volatility':
                signal_components.append((0, conf * 0.3))  # Be cautious in high vol
            else:
                signal_components.append((0, conf * 0.4))

        # Combine signals weighted by confidence
        if signal_components:
            total_weight = sum(c for _, c in signal_components)
            if total_weight > 0:
                final_signal = sum(s * c for s, c in signal_components) / total_weight
                final_confidence = np.mean([c for _, c in signal_components])
            else:
                final_signal = 0
                final_confidence = 0
        else:
            final_signal = 0
            final_confidence = 0

        # Generate signal array (simplified for single prediction)
        n_signals = min(len(data), 10)
        signals = [final_signal] * n_signals
        confidences = [final_confidence] * n_signals

        return {
            'signals': signals,
            'confidences': confidences,
            'mean_confidence': final_confidence,
            'signal_components': signal_components,
            'recommendation': self._get_recommendation(final_signal, final_confidence)
        }

    def _get_recommendation(self, signal: float, confidence: float) -> str:
        """
        Generate human-readable recommendation.
        """
        if confidence < 0.3:
            return "No clear signal - stay out of market"
        elif confidence < 0.5:
            if abs(signal) < 0.5:
                return "Weak signal - consider small position"
            else:
                return f"Moderate {'buy' if signal > 0 else 'sell'} signal with low confidence"
        else:
            if abs(signal) < 0.3:
                return "Neutral market - no strong directional bias"
            elif signal > 0:
                return f"Buy signal with {confidence:.1%} confidence"
            else:
                return f"Sell signal with {confidence:.1%} confidence"

    def _count_runs(self, binary_sequence: np.ndarray) -> int:
        """
        Count runs in a binary sequence.
        """
        if len(binary_sequence) == 0:
            return 0
        runs = 1
        for i in range(1, len(binary_sequence)):
            if binary_sequence[i] != binary_sequence[i-1]:
                runs += 1
        return runs

    def backtest(self, data: pd.DataFrame) -> Dict:
        """
        Perform walk-forward backtesting with proper statistical validation.
        """
        prices = data['Close'].values
        returns = data['returns'].values

        if len(prices) < 200:
            return {'error': 'Insufficient data for backtesting'}

        # Walk-forward analysis
        window_size = 100
        step_size = 20

        results = []

        for i in range(window_size, len(prices) - step_size, step_size):
            # Train on data up to i
            train_data = data.iloc[:i]

            # Test on next step_size periods
            test_data = data.iloc[i:i+step_size]

            # Discover parameters on training data
            discoveries = self.discover_parameters(train_data)

            # Analyze and generate signals
            analysis = self.analyze_with_discoveries(train_data, discoveries)
            signals_dict = self.generate_trading_signals(train_data, analysis)

            # Apply to test data
            if signals_dict['signals']:
                signal = signals_dict['signals'][0]
                test_returns = test_data['returns'].values

                # Calculate strategy returns
                strategy_returns = signal * test_returns

                results.append({
                    'period_return': np.sum(strategy_returns),
                    'buy_hold_return': np.sum(test_returns),
                    'signal': signal,
                    'confidence': signals_dict['mean_confidence']
                })

        if not results:
            return {'error': 'No valid backtest periods'}

        # Calculate performance metrics
        period_returns = [r['period_return'] for r in results]
        buy_hold_returns = [r['buy_hold_return'] for r in results]

        strategy_cumulative = np.cumprod(1 + np.array(period_returns)) - 1
        buy_hold_cumulative = np.cumprod(1 + np.array(buy_hold_returns)) - 1

        # Sharpe ratio
        strategy_sharpe = np.mean(period_returns) / (np.std(period_returns) + 1e-10)
        buy_hold_sharpe = np.mean(buy_hold_returns) / (np.std(buy_hold_returns) + 1e-10)

        # Win rate
        win_rate = np.mean([r['period_return'] > 0 for r in results])

        # Statistical significance test
        t_stat, p_value = stats.ttest_rel(period_returns, buy_hold_returns)

        return {
            'strategy_return': strategy_cumulative[-1] if len(strategy_cumulative) > 0 else 0,
            'buy_hold_return': buy_hold_cumulative[-1] if len(buy_hold_cumulative) > 0 else 0,
            'strategy_sharpe': strategy_sharpe,
            'buy_hold_sharpe': buy_hold_sharpe,
            'win_rate': win_rate,
            'n_periods': len(results),
            'outperformance': strategy_cumulative[-1] - buy_hold_cumulative[-1] if len(strategy_cumulative) > 0 else 0,
            'statistical_significance': {
                't_statistic': t_stat,
                'p_value': p_value,
                'is_significant': p_value < self.alpha
            },
            'mean_confidence': np.mean([r['confidence'] for r in results])
        }


def run_unbiased_analysis(symbol: str = 'SPY',
                         start: str = '2020-01-01',
                         end: str = '2024-10-31',
                         show_details: bool = True) -> Dict:
    """
    Run complete unbiased analysis on market data.
    """
    print("\n" + "="*70)
    print("UNBIASED PATTERN DISCOVERY AND ANALYSIS")
    print("="*70)

    # Get data
    try:
        import yfinance as yf
        print(f"\nFetching data for {symbol}...")
        data = yf.download(symbol, start=start, end=end, progress=False)
        data['returns'] = data['Close'].pct_change()
        print(f"✓ Loaded {len(data)} days of data")
    except Exception as e:
        print(f"Error loading data: {e}")
        print("Generating synthetic data for demonstration...")
        dates = pd.date_range(start=start, end=end, freq='D')
        n = len(dates)

        # Generate more realistic synthetic data
        returns = np.random.normal(0.0005, 0.02, n)
        returns[::45] += np.random.normal(0, 0.01)  # Add autocorrelation at lag 45
        returns[::42] += np.random.normal(0, 0.01)  # Add autocorrelation at lag 42

        prices = 100 * np.cumprod(1 + returns)

        data = pd.DataFrame({
            'Close': prices,
            'returns': returns
        }, index=dates)

    # Initialize predictor
    predictor = UnbiasedPatternPredictor(confidence_level=0.95)

    # Discover parameters
    print("\n" + "-"*50)
    print("PHASE 1: DISCOVERING PARAMETERS FROM DATA")
    print("-"*50)

    discoveries = predictor.discover_parameters(data)

    if show_details:
        print("\n📊 Discovered Windows:")
        if discoveries['windows']['significant_windows']:
            for window in discoveries['windows']['significant_windows'][:5]:
                p_val = discoveries['windows']['corrected_pvalues'].get(window, 1)
                print(f"  Window {window}: p-value = {p_val:.6f}")
        else:
            print("  No statistically significant windows found")

        # Special check for 729
        if discoveries['windows']['special_729']:
            print("\n🎯 Window 729 Special Test:")
            for const, result in discoveries['windows']['special_729'].items():
                if result['converges']:
                    print(f"  Converges to {const}: p-value = {result['p_value']:.6f} ✓")

        print("\n📈 Discovered Lags:")
        if discoveries['lags']['significant_lags']:
            for lag in discoveries['lags']['significant_lags'][:5]:
                strength = discoveries['lags']['lag_strengths'].get(lag, 0)
                print(f"  Lag {lag}: strength = {strength:.6f}")
        else:
            print("  No significant autocorrelation found")

        print("\n🎯 Discovered Attractors:")
        if discoveries['attractors']['attractors']:
            for name, value in list(discoveries['attractors']['attractors'].items())[:5]:
                print(f"  {name}: {value:.6f}")
        else:
            print("  No clear attractors found")

        print("\n📐 Mathematical Constants Test:")
        significant_constants = []
        for name, result in discoveries['constants'].items():
            if result['is_significant']:
                significant_constants.append(name)
                print(f"  {name}: distance = {result['min_distance']:.6f}, p-value = {result['p_value']:.6f} ✓")

        if not significant_constants:
            print("  No mathematical constants significantly present")

    # Analyze with discoveries
    print("\n" + "-"*50)
    print("PHASE 2: ANALYZING PATTERNS")
    print("-"*50)

    analysis = predictor.analyze_with_discoveries(data, discoveries)

    if show_details:
        if 'window_analysis' in analysis and analysis['window_analysis']:
            print("\n📊 Window Analysis:")
            for window, results in list(analysis['window_analysis'].items())[:3]:
                print(f"  Window {window}:")
                print(f"    Density: {results['density']:.6f}")
                print(f"    Trend: {results['trend']:.6e}")
                print(f"    Volatility: {results['volatility']:.6f}")

        if 'lag_predictions' in analysis and analysis['lag_predictions']:
            print("\n🔮 Lag-Based Prediction:")
            pred = analysis['lag_predictions']
            print(f"  Next return prediction: {pred['prediction']:.6f}")
            print(f"  95% CI: [{pred['confidence_interval'][0]:.6f}, {pred['confidence_interval'][1]:.6f}]")
            print(f"  Cross-validation MSE: {pred['cv_score']:.6e}")

        if 'attractor_state' in analysis and analysis['attractor_state']:
            print("\n🎯 Attractor State:")
            state = analysis['attractor_state']
            print(f"  Nearest: {state['nearest_attractor']}")
            print(f"  Distance: {state['distance']:.6f}")
            print(f"  Converging: {state['converging']}")

    # Generate trading signals
    print("\n" + "-"*50)
    print("PHASE 3: GENERATING TRADING SIGNALS")
    print("-"*50)

    signals = predictor.generate_trading_signals(data, analysis)

    print(f"\n📈 Trading Signal:")
    print(f"  Signal: {signals['signals'][0]:.3f}")
    print(f"  Confidence: {signals['mean_confidence']:.1%}")
    print(f"  Recommendation: {signals['recommendation']}")

    # Backtest
    print("\n" + "-"*50)
    print("PHASE 4: BACKTESTING")
    print("-"*50)

    backtest_results = predictor.backtest(data)

    if 'error' not in backtest_results:
        print(f"\n📊 Backtest Results:")
        print(f"  Strategy Return: {backtest_results['strategy_return']:.2%}")
        print(f"  Buy & Hold Return: {backtest_results['buy_hold_return']:.2%}")
        print(f"  Strategy Sharpe: {backtest_results['strategy_sharpe']:.3f}")
        print(f"  Buy & Hold Sharpe: {backtest_results['buy_hold_sharpe']:.3f}")
        print(f"  Win Rate: {backtest_results['win_rate']:.1%}")
        # Check if statistical_significance key exists before accessing
        if 'statistical_significance' in backtest_results:
            print(f"  Statistical Significance: p = {backtest_results['statistical_significance']['p_value']:.6f}")

            if backtest_results['statistical_significance']['is_significant']:
                print("  ✓ Strategy is statistically different from buy & hold")
            else:
                print("  ✗ No statistical evidence of outperformance")
        else:
            print("  Statistical significance could not be calculated (insufficient backtest periods)")

    else:
        print(f"  Error: {backtest_results['error']}")

    # Summary
    print("\n" + "="*70)
    print("SUMMARY")
    print("="*70)

    # Count significant findings
    n_significant = sum([
        len(discoveries['windows']['significant_windows']) > 0,
        len(discoveries['lags']['significant_lags']) > 0,
        len(discoveries['attractors']['attractors']) > 0,
        any(r['is_significant'] for r in discoveries['constants'].values()),
        backtest_results.get('statistical_significance', {}).get('is_significant', False)
    ])

    print(f"\n🎯 Significant Findings: {n_significant}/5")

    if discoveries['windows']['special_729'] and any(r['converges'] for r in discoveries['windows']['special_729'].values()):
        print("✓ Window 729 shows special properties as hypothesized")

    if signals['mean_confidence'] > 0.5:
        print(f"✓ Confident trading signal generated ({signals['mean_confidence']:.1%})")
    else:
        print(f"⚠ Low confidence in trading signals ({signals['mean_confidence']:.1%})")

    if 'strategy_sharpe' in backtest_results and backtest_results['strategy_sharpe'] > 0:
        print(f"✓ Positive Sharpe ratio: {backtest_results['strategy_sharpe']:.3f}")

    print("\n" + "="*70)

    return {
        'discoveries': discoveries,
        'analysis': analysis,
        'signals': signals,
        'backtest': backtest_results,
        'summary': {
            'n_significant_findings': n_significant,
            'window_729_validated': discoveries['windows']['special_729'] is not None,
            'signal_confidence': signals['mean_confidence'],
            'backtest_success': backtest_results.get('strategy_sharpe', 0) > backtest_results.get('buy_hold_sharpe', 0)
        }
    }

# Run the analysis
if __name__ == "__main__":
    results = run_unbiased_analysis('SPY', '2020-01-01', '2024-10-31', show_details=True)


UNBIASED PATTERN DISCOVERY AND ANALYSIS

Fetching data for SPY...
✓ Loaded 1216 days of data

--------------------------------------------------
PHASE 1: DISCOVERING PARAMETERS FROM DATA
--------------------------------------------------
Discovering parameters from data...


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()