In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# --- Configuration ---
# Input locations for the diagnostics below. Both paths are relative, so they
# resolve against the current working directory; adjust them if necessary.
# Raw baseline export; step 1 reads it as pipe-delimited text (sep='|').
RAW_BASELINE_FILE = Path("data/raw/Company20211201.dat")
# Final analysis dataset; step 3 reads it as a regular CSV.
ANALYSIS_DATA_FILE = Path("outputs/h2_analysis_dataset.csv")

# --- Helper Function (from features.py logic) ---
#
def compute_founder_credibility_simple(
    df: pd.DataFrame,
    contact_id_col: str = 'PrimaryContactPBId',
) -> pd.Series:
    """Flag rows whose contact ID occurs on at least two rows ("serial").

    Args:
        df: Frame with one row per company.
        contact_id_col: Name of the column holding the contact identifier.
            Defaults to 'PrimaryContactPBId'.

    Returns:
        An integer Series aligned to ``df.index``: 1 where the row's contact
        ID appears on two or more rows, 0 otherwise. Rows with a missing
        contact ID are always 0. If ``contact_id_col`` is absent, a warning
        is printed and an all-zero Series is returned.
    """
    if contact_id_col not in df.columns:
        print(f"  ⚠️ Warning: '{contact_id_col}' not found. Cannot calculate serial founders.")
        return pd.Series(0, index=df.index)

    contact_ids = df[contact_id_col]
    # duplicated(keep=False) marks every occurrence of a repeated value, but
    # it also treats missing values as equal to each other, so missing IDs
    # are masked out explicitly to keep them non-serial.
    is_serial = contact_ids.duplicated(keep=False) & contact_ids.notna()
    return is_serial.astype(int)

# --- Debug Step 1: Check Raw Baseline Data ---
# Loads the raw baseline file, flags serial founders on it, and reports how
# many rows are flagged. Leaves two names for the later steps:
#   raw_df            - the loaded frame, or None if anything failed
#   raw_serial_count  - number of flagged rows, or 0 if anything failed
print("--- 1. Checking Raw Baseline Data ---")

# Define both names before the try so later steps can always read them; a
# failed load would otherwise leave raw_serial_count undefined.
raw_df = None
raw_serial_count = 0

try:
    # Load only the columns the diagnostics need from the raw file
    raw_df = pd.read_csv(
        RAW_BASELINE_FILE,
        sep='|',
        low_memory=False,
        usecols=['CompanyID', 'PrimaryContactPBId', 'FirstFinancingDealType', 'CompanyFinancingStatus'],
    )
    print(f"✓ Raw baseline file loaded ({len(raw_df)} rows).")

    # Calculate 'founder_serial' directly on raw data
    raw_df['founder_serial_raw'] = compute_founder_credibility_simple(raw_df)

    raw_serial_count = raw_df['founder_serial_raw'].sum()
    print(f"  FOUNDER_SERIAL == 1 in RAW data: {raw_serial_count} companies")

    if raw_serial_count == 0:
        print("  🚨 DIAGNOSIS: No serial founders found in the original baseline data.")

except FileNotFoundError:
    print(f"  ❌ ERROR: Raw data file not found at {RAW_BASELINE_FILE}")
    # Reset both outputs so a partial result is never passed downstream
    raw_df, raw_serial_count = None, 0
except Exception as e:
    # Deliberately broad: this is a best-effort diagnostic, so any failure is
    # reported and the later steps are skipped rather than crashing here.
    print(f"  ❌ ERROR loading or processing raw data: {e}")
    raw_df, raw_serial_count = None, 0

# --- Debug Step 2: Check Filtered H3 Data ---
# Applies the H1/H3 cohort filter to the raw frame from step 1 and reports how
# many flagged serial founders survive it. Runs only when step 1 loaded the
# raw data and found at least one serial founder.
print("\n--- 2. Checking Filtered H3 Data ('Early Stage VC') ---")

# Defined before branching so it exists even when the check is skipped.
h3_serial_count = 0

if raw_df is not None and raw_serial_count > 0:
    # The filter has two conditions: the first financing deal type contains
    # 'Early Stage VC' (case-insensitive, missing values excluded) AND the
    # financing status is exactly 'Venture Capital-Backed'.
    is_early_stage_vc = raw_df['FirstFinancingDealType'].str.contains(
        'Early Stage VC', case=False, na=False
    )
    is_vc_backed = raw_df['CompanyFinancingStatus'] == 'Venture Capital-Backed'
    h3_filtered_df = raw_df[is_early_stage_vc & is_vc_backed].copy()

    h3_serial_count = h3_filtered_df['founder_serial_raw'].sum()
    print(f"  Total companies matching H3 filter ('Early Stage VC'): {len(h3_filtered_df)}")
    print(f"  FOUNDER_SERIAL == 1 AFTER H3 filter: {h3_serial_count} companies")

    if h3_serial_count == 0:
        # Name both filter conditions: either one can be what removed them.
        print(
            "  🚨 DIAGNOSIS: Serial founders exist in raw data, but NONE passed the H3 filter "
            "('Early Stage VC' as FirstFinancingDealType and 'Venture Capital-Backed' as CompanyFinancingStatus)."
        )
elif raw_df is not None:
    print("  Skipping H3 check because raw data had no serial founders.")
else:
    print("  Skipping H3 check due to errors loading raw data.")

# --- Debug Step 3: Check Final Analysis Data (used for H4) ---
# Loads the final analysis dataset and reports how many rows are flagged as
# serial founders, using 'founder_serial' when present and otherwise a flag
# recreated from 'founder_credibility' (> 0).
print("\n--- 3. Checking Final Analysis Data (used for H4) ---")

# Counts from the earlier steps may be undefined if those steps failed or
# were skipped. Look them up defensively (missing means 0) so an undefined
# name is not reported below as a problem with the analysis file.
upstream_raw_count = globals().get('raw_serial_count', 0)
upstream_h3_count = globals().get('h3_serial_count', 0)

try:
    analysis_df = pd.read_csv(ANALYSIS_DATA_FILE)
    print(f"✓ Final analysis file loaded ({len(analysis_df)} rows).")

    # Check 'founder_serial' column in the final dataset
    # This column should have been created by run_analysis.py / preprocess_for_w1
    if 'founder_serial' in analysis_df.columns:
        analysis_serial_count = analysis_df['founder_serial'].sum()
        print(f"  FOUNDER_SERIAL == 1 in FINAL analysis data: {analysis_serial_count} companies")

        if analysis_serial_count == 0 and upstream_raw_count > 0 and upstream_h3_count > 0:
            # Present upstream, absent here: they were dropped in between.
            print("  🚨 DIAGNOSIS: Serial founders exist and had Early Stage VC deals, but were lost during H2/H4 cohort definition or preprocessing steps.")
        elif analysis_serial_count == 0:
            print("  Confirming: No serial founders present in the final dataset used for H4 plot.")

    elif 'founder_credibility' in analysis_df.columns:
        # 'founder_serial' is missing: recreate it as "credibility above zero"
        analysis_df['founder_serial_recreated'] = (analysis_df['founder_credibility'] > 0).astype(int)
        analysis_serial_count_recreated = analysis_df['founder_serial_recreated'].sum()
        print(f"  'founder_serial' column missing. Recreated from 'founder_credibility'.")
        print(f"  FOUNDER_SERIAL == 1 (recreated) in FINAL analysis data: {analysis_serial_count_recreated} companies")
        if analysis_serial_count_recreated == 0:
            print("  Confirming: No serial founders present based on 'founder_credibility'.")
    else:
        print("  ⚠️ ERROR: Neither 'founder_serial' nor 'founder_credibility' found in the final analysis data.")

except FileNotFoundError:
    print(f"  ❌ ERROR: Final analysis data file not found at {ANALYSIS_DATA_FILE}")
except Exception as e:
    # Deliberately broad: best-effort diagnostic that reports instead of crashing.
    print(f"  ❌ ERROR loading or processing final analysis data: {e}")

--- Patching 'df' with missing 'founder_serial' column ---
  ❌ ERROR during patch: name 'df' is not defined
