In [1]:
"""
v3.3 vs v3.4 Results Comparison Tool
Analyzes improvements and validates success metrics
"""

import pandas as pd
import numpy as np

# Baseline (v3.3) results, keyed by sector symbol. Every entry carries the
# same three metrics -- test_accuracy, gap and wf_std -- and the insertion
# order below is the row order of any DataFrame built from this mapping.
v33_results = {
    'XLK': dict(test_accuracy=0.569, gap=0.273, wf_std=0.082),
    'XLF': dict(test_accuracy=0.541, gap=0.264, wf_std=0.081),
    'XLE': dict(test_accuracy=0.624, gap=0.231, wf_std=0.076),
    'XLV': dict(test_accuracy=0.627, gap=0.209, wf_std=0.025),
    'XLI': dict(test_accuracy=0.462, gap=0.367, wf_std=0.072),
    'XLY': dict(test_accuracy=0.544, gap=0.324, wf_std=0.083),
    'XLP': dict(test_accuracy=0.676, gap=0.141, wf_std=0.108),
    'XLU': dict(test_accuracy=0.483, gap=0.376, wf_std=0.058),
    'XLRE': dict(test_accuracy=0.657, gap=0.178, wf_std=0.100),
    'XLC': dict(test_accuracy=0.495, gap=0.305, wf_std=0.063),
    'XLB': dict(test_accuracy=0.657, gap=0.147, wf_std=0.042),
}

def _print_banner(title):
    """Print a blank line, then *title* framed by two 80-character rules."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


def _classify_problem_child(new_gap, delta):
    """Return (status text, outcome tag) for a sector whose v3.3 gap was >0.30."""
    if delta < -0.05:
        return "‚úÖ Major improvement", "improved"
    if delta < 0:
        return "üü¢ Improved", "improved"
    if delta == 0:
        return "üü° No change", "unchanged"
    return "‚ùå Regressed", "regressed"


def _classify_winner(new_gap, delta):
    """Return (status text, outcome tag) for a sector whose v3.3 gap was <0.20."""
    if new_gap < 0.20:
        return "‚úÖ Still excellent", "preserved"
    if new_gap < 0.25:
        return "üü° Still good", "preserved"
    return "‚ùå Lost excellence", "lost"


def _classify_near_miss(new_gap, delta):
    """Return (status text, outcome tag) for a sector whose v3.3 gap was 0.25-0.30."""
    if new_gap < 0.25:
        return "‚úÖ CONVERTED!", "converted"
    if delta < 0:
        return "üü¢ Improved", "open"
    return "üü° No progress", "open"


def _print_gap_table(sectors, baseline, candidate, classify):
    """Print one per-sector gap table and return the outcome tag of every row.

    ``classify(new_gap, delta)`` supplies the status text and the tag.
    """
    print(f"\n{'Sector':<8} {'v3.3 Gap':>10} {'v3.4 Gap':>10} {'Œî Gap':>10} {'Status':<20}")
    print("-" * 70)

    outcomes = []
    for sector in sectors:
        old_gap = baseline[sector]['gap']
        new_gap = candidate[sector]['gap']
        delta = new_gap - old_gap
        status, outcome = classify(new_gap, delta)
        outcomes.append(outcome)
        print(f"{sector:<8} {old_gap:>10.3f} {new_gap:>10.3f} {delta:>10.3f} {status:<20}")
    return outcomes


def _print_delta_stats(heading, delta, higher_is_better):
    """Print mean/median and improved/regressed counts for one delta Series."""
    # Count only sectors present in both result sets; a sector missing from
    # either side yields NaN and must not inflate the denominator.
    compared = delta.count()
    rose = (delta > 0).sum()
    fell = (delta < 0).sum()
    improved, regressed = (rose, fell) if higher_is_better else (fell, rose)

    print(f"\n{heading}")
    print(f"   Mean Œî: {delta.mean():.3f}")
    print(f"   Median Œî: {delta.median():.3f}")
    print(f"   Improved: {improved}/{compared}")
    print(f"   Regressed: {regressed}/{compared}")


def analyze_results(v34_results: dict) -> pd.DataFrame:
    """
    Compare the v3.3 baseline to v3.4 results, print a report, return the table.

    The report has seven sections: primary goal (sectors with gap <0.25),
    problem children, winners, near-misses, the full comparison table, a
    statistical summary and a scored final verdict (maximum 5 points).

    NOTE(review): the symbol prefixes in the printed messages and the
    ``Œî_*`` column names look like mis-decoded emoji/Greek letters. They
    are kept byte-for-byte because they are runtime output and part of the
    returned table's column names.

    Args:
        v34_results: Dict with same structure as v33_results
            (sector -> {'test_accuracy', 'gap', 'wf_std'}).

    Returns:
        DataFrame indexed by sector with the v3.3 value, the v3.4 value and
        their difference (v3.4 - v3.3) for accuracy, gap and wf_std.

    Raises:
        KeyError: if v34_results lacks any sector used by the problem
            children, winners or near-misses sections.
    """
    # Sector groups are fixed by the v3.3 baseline gaps.
    problem_children = ['XLU', 'XLI', 'XLY', 'XLC']
    winners = ['XLP', 'XLRE', 'XLB']
    near_misses = ['XLK', 'XLF']

    # Fail before printing anything, naming every missing sector at once,
    # instead of a bare KeyError halfway through the report.
    missing = [
        sector
        for sector in (*problem_children, *winners, *near_misses)
        if sector not in v34_results
    ]
    if missing:
        raise KeyError(
            f"v34_results is missing required sector(s): {', '.join(missing)}"
        )

    _print_banner("üî¨ v3.3 vs v3.4 COMPREHENSIVE ANALYSIS")

    # Convert to DataFrames (one row per sector)
    df_v33 = pd.DataFrame(v33_results).T
    df_v34 = pd.DataFrame(v34_results).T

    # Calculate deltas (v3.4 minus v3.3)
    delta_acc = df_v34['test_accuracy'] - df_v33['test_accuracy']
    delta_gap = df_v34['gap'] - df_v33['gap']
    delta_std = df_v34['wf_std'] - df_v33['wf_std']

    # Create comparison DataFrame
    comparison = pd.DataFrame({
        'v3.3_Acc': df_v33['test_accuracy'],
        'v3.4_Acc': df_v34['test_accuracy'],
        'Œî_Acc': delta_acc,
        'v3.3_Gap': df_v33['gap'],
        'v3.4_Gap': df_v34['gap'],
        'Œî_Gap': delta_gap,
        'v3.3_Std': df_v33['wf_std'],
        'v3.4_Std': df_v34['wf_std'],
        'Œî_Std': delta_std,
    })

    # 1. PRIMARY GOAL: Gap Reduction Analysis
    _print_banner("üéØ PRIMARY GOAL: GAP REDUCTION TO <0.25")

    v33_under_25 = (df_v33['gap'] < 0.25).sum()
    v34_under_25 = (df_v34['gap'] < 0.25).sum()

    print("\nüìä Overall Progress:")
    print(f"   v3.3: {v33_under_25}/{len(df_v33)} sectors with gap <0.25")
    print(f"   v3.4: {v34_under_25}/{len(df_v34)} sectors with gap <0.25")
    print("   Goal: 7+ sectors")

    if v34_under_25 >= 7:
        print(f"   ‚úÖ SUCCESS! Achieved {v34_under_25} sectors (goal: 7+)")
    elif v34_under_25 > v33_under_25:
        print(f"   üü° PROGRESS! {v34_under_25 - v33_under_25} more sectors, but goal not met")
    elif v34_under_25 == v33_under_25:
        # An unchanged count is not a regression ("Lost 0 sectors").
        print(f"   üü° NO CHANGE: still {v34_under_25} sectors with gap <0.25")
    else:
        print(f"   ‚ùå REGRESSION: Lost {v33_under_25 - v34_under_25} sectors")

    # 2. PROBLEM CHILDREN ANALYSIS
    _print_banner("‚ö†Ô∏è  PROBLEM CHILDREN (v3.3 Gap >0.30)")

    outcomes = _print_gap_table(
        problem_children, v33_results, v34_results, _classify_problem_child
    )
    improvements = outcomes.count("improved")
    regressions = outcomes.count("regressed")

    print("\nüìä Summary:")
    print(f"   Improved: {improvements}/{len(problem_children)}")
    print(f"   Regressed: {regressions}/{len(problem_children)}")

    if improvements >= 3:
        print("   ‚úÖ Strong success on problem children")
    elif improvements >= 2:
        print("   üü° Partial success on problem children")
    else:
        print("   ‚ùå Strategy did not work for problem children")

    # 3. WINNERS PRESERVATION
    _print_banner("üèÜ WINNERS PRESERVATION (v3.3 Gap <0.20)")

    preserved = _print_gap_table(
        winners, v33_results, v34_results, _classify_winner
    ).count("preserved")

    print("\nüìä Summary:")
    print(f"   Preserved: {preserved}/{len(winners)}")

    if preserved == len(winners):
        print("   ‚úÖ All winners preserved")
    elif preserved >= 2:
        print("   üü° Mostly preserved")
    else:
        print("   ‚ùå Winners damaged")

    # 4. NEAR-MISSES ANALYSIS
    _print_banner("üéØ NEAR-MISSES (v3.3 Gap 0.25-0.30)")

    conversions = _print_gap_table(
        near_misses, v33_results, v34_results, _classify_near_miss
    ).count("converted")

    print("\nüìä Summary:")
    print(f"   Converted: {conversions}/{len(near_misses)}")

    # 5. FULL COMPARISON TABLE
    _print_banner("üìä FULL COMPARISON TABLE")
    print("\n" + comparison.to_string())

    # 6. STATISTICAL SUMMARY
    _print_banner("üìà STATISTICAL SUMMARY")

    # Accuracy improves when it rises; gap and wf_std improve when they fall.
    _print_delta_stats("Accuracy Changes:", delta_acc, higher_is_better=True)
    _print_delta_stats("Gap Changes:", delta_gap, higher_is_better=False)
    _print_delta_stats("Stability Changes:", delta_std, higher_is_better=False)

    # 7. FINAL VERDICT
    _print_banner("üèÅ FINAL VERDICT")

    score = 0
    max_score = 5

    # Scoring criteria: primary goal (2) + problem children (1) + winners (1)
    # + near-miss conversions (1) = max_score.
    if v34_under_25 >= 7:
        score += 2
        print("\n‚úÖ Primary goal achieved (7+ sectors <0.25): +2 points")
    elif v34_under_25 > v33_under_25:
        score += 1
        print("\nüü° Progress toward goal: +1 point")

    if improvements >= 3:
        score += 1
        print("‚úÖ Problem children improved (3+/4): +1 point")
    elif improvements >= 2:
        score += 0.5
        print("üü° Some problem children improved (2/4): +0.5 point")

    if preserved == len(winners):
        score += 1
        print("‚úÖ All winners preserved: +1 point")
    elif preserved >= 2:
        score += 0.5
        print("üü° Most winners preserved: +0.5 point")

    if conversions >= 1:
        score += 1
        print("‚úÖ Near-misses converted (1+/2): +1 point")

    print(f"\nüéØ Final Score: {score:.1f}/{max_score}")

    if score >= 4:
        verdict = "üèÜ OUTSTANDING SUCCESS - Deploy v3.4"
    elif score >= 3:
        verdict = "‚úÖ SOLID SUCCESS - v3.4 is an improvement"
    elif score >= 2:
        verdict = "üü° MIXED RESULTS - Some gains, some losses"
    else:
        verdict = "‚ùå REGRESSION - Stick with v3.3"

    print(f"\n{verdict}")

    return comparison


# Script entry point: demonstrates the report on placeholder numbers.
if __name__ == "__main__":
    # Placeholder v3.4 figures -- replace them with the real results; the
    # mapping must keep the same sector -> metrics layout as the baseline.
    v34_results_example = {
        'XLK': dict(test_accuracy=0.580, gap=0.250, wf_std=0.075),
        'XLF': dict(test_accuracy=0.555, gap=0.240, wf_std=0.070),
        'XLE': dict(test_accuracy=0.620, gap=0.230, wf_std=0.080),
        'XLV': dict(test_accuracy=0.625, gap=0.210, wf_std=0.030),
        'XLI': dict(test_accuracy=0.480, gap=0.300, wf_std=0.065),
        'XLY': dict(test_accuracy=0.560, gap=0.280, wf_std=0.075),
        'XLP': dict(test_accuracy=0.670, gap=0.150, wf_std=0.110),
        'XLU': dict(test_accuracy=0.500, gap=0.290, wf_std=0.055),
        'XLRE': dict(test_accuracy=0.650, gap=0.185, wf_std=0.095),
        'XLC': dict(test_accuracy=0.510, gap=0.285, wf_std=0.060),
        'XLB': dict(test_accuracy=0.650, gap=0.155, wf_std=0.045),
    }

    print("\n‚ö†Ô∏è  EXAMPLE ANALYSIS (Replace with your actual v3.4 results)")
    comparison = analyze_results(v34_results_example)


‚ö†Ô∏è  EXAMPLE ANALYSIS (Replace with your actual v3.4 results)

üî¨ v3.3 vs v3.4 COMPREHENSIVE ANALYSIS

üéØ PRIMARY GOAL: GAP REDUCTION TO <0.25

üìä Overall Progress:
   v3.3: 5/11 sectors with gap <0.25
   v3.4: 6/11 sectors with gap <0.25
   Goal: 7+ sectors
   üü° PROGRESS! 1 more sectors, but goal not met

‚ö†Ô∏è  PROBLEM CHILDREN (v3.3 Gap >0.30)

Sector     v3.3 Gap   v3.4 Gap      Œî Gap Status              
----------------------------------------------------------------------
XLU           0.376      0.290     -0.086 ‚úÖ Major improvement 
XLI           0.367      0.300     -0.067 ‚úÖ Major improvement 
XLY           0.324      0.280     -0.044 üü¢ Improved          
XLC           0.305      0.285     -0.020 üü¢ Improved          

üìä Summary:
   Improved: 4/4
   Regressed: 0/4
   ‚úÖ Strong success on problem children

üèÜ WINNERS PRESERVATION (v3.3 Gap <0.20)

Sector     v3.3 Gap   v3.4 Gap      Œî Gap Status              
--------------------------------------