# Test & Debug Notebook

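Run this notebook first, top to bottom on a fresh kernel (later cells reuse the libraries and the `df` loaded by earlier cells), to: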
1. Verify all dependencies are installed
2. Test data loading
3. Check for any errors
4. Validate the analysis pipeline

## 1. Test Library Imports

In [None]:
print("Testing library imports...\n")

# Collects one (library label, exception) pair per dependency that failed.
errors = []

# Every dependency is probed in its own try/except so a single broken package
# cannot mask the status of the remaining ones. The version lookup lives inside
# the same try, so a failure there is recorded against that library as well.
# The aliases bound here (pd, np, plt, sns, stats, LinearRegression) are the
# ones the later cells of this notebook use.
try:
    import pandas as pd
    print(f"✓ pandas {pd.__version__}")
except Exception as exc:
    print(f"✗ pandas FAILED: {exc}")
    errors.append(('pandas', exc))

try:
    import numpy as np
    print(f"✓ numpy {np.__version__}")
except Exception as exc:
    print(f"✗ numpy FAILED: {exc}")
    errors.append(('numpy', exc))

try:
    # pyplot first, then the top-level package for its version string.
    import matplotlib.pyplot as plt
    import matplotlib
    print(f"✓ matplotlib {matplotlib.__version__}")
except Exception as exc:
    print(f"✗ matplotlib FAILED: {exc}")
    errors.append(('matplotlib', exc))

try:
    import seaborn as sns
    print(f"✓ seaborn {sns.__version__}")
except Exception as exc:
    print(f"✗ seaborn FAILED: {exc}")
    errors.append(('seaborn', exc))

try:
    # The stats submodule is what later cells call; scipy itself is only
    # imported here to read the version.
    from scipy import stats
    import scipy
    print(f"✓ scipy {scipy.__version__}")
except Exception as exc:
    print(f"✗ scipy FAILED: {exc}")
    errors.append(('scipy', exc))

try:
    from sklearn.linear_model import LinearRegression
    import sklearn
    print(f"✓ scikit-learn {sklearn.__version__}")
except Exception as exc:
    print(f"✗ scikit-learn FAILED: {exc}")
    errors.append(('scikit-learn', exc))

# Final verdict: success only when nothing was recorded above.
if not errors:
    print("\n✓ All libraries imported successfully!")
else:
    print(f"\n\n⚠ {len(errors)} LIBRARIES FAILED TO IMPORT")
    print("\nInstall missing libraries with:")
    print("pip install -r requirements.txt")

## 2. Test Data Loading

In [None]:
print("Testing data loading...\n")

# Reset before loading: if the read below fails, a `df` left over from an
# earlier run of this cell must not survive, otherwise the later test cells
# would run (and possibly "pass") against stale data.
df = None

try:
    # Test loading expanded dataset
    df = pd.read_csv('../data/raw/expanded_balkan_wage_data.csv')
    print("✓ Expanded dataset loaded")
    print(f"  Records: {len(df)}")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Countries: {df['country'].nunique()}")
    print(f"  Years: {df['year'].min()} - {df['year'].max()}")

    # Check for issues: total NaN cells across the frame and fully
    # duplicated rows. Both are reported as warnings, not failures.
    missing = df.isnull().sum().sum()
    duplicates = df.duplicated().sum()

    if missing > 0:
        print(f"\n  ⚠ Warning: {missing} missing values")
    else:
        print("\n  ✓ No missing values")

    if duplicates > 0:
        print(f"  ⚠ Warning: {duplicates} duplicate rows")
    else:
        print("  ✓ No duplicates")

except FileNotFoundError:
    print("✗ ERROR: expanded_balkan_wage_data.csv not found")
    print("  Check that the file exists in data/raw/")
except KeyError as e:
    # The file was read, but a column this cell summarises ('country' or
    # 'year') is absent. Report that explicitly instead of as a load error.
    print(f"✗ ERROR: expected column {e} not found in expanded_balkan_wage_data.csv")
except Exception as e:
    print(f"✗ ERROR loading data: {e}")

In [None]:
# Optional check: try to read the original sample dataset. A missing file is
# reported as a warning only, since the expanded dataset can be used instead.
sample_csv = '../data/raw/macedonia_wage_sample.csv'
try:
    df_sample = pd.read_csv(sample_csv)
    print(f"✓ Original sample dataset loaded")
    print(f"  Records: {len(df_sample)}")
except FileNotFoundError:
    print("⚠ Original sample file not found (this is OK if using expanded data)")
except Exception as exc:
    # Any other failure (parse error, pandas not imported, ...) is also
    # surfaced as a warning rather than aborting the notebook.
    print(f"⚠ Error loading sample: {exc}")

## 3. Test Basic Operations

In [None]:
print("Testing basic data operations...\n")

try:
    # Filtering: boolean mask selecting the rows for one country.
    is_north_macedonia = df['country'] == 'North Macedonia'
    nm_data = df[is_north_macedonia]
    print(f"✓ Filtering: {len(nm_data)} North Macedonia records")

    # Grouping: mean monthly wage per gender label, then pull out the two
    # labels this notebook expects ('Female' and 'Male').
    gender_avg = df.groupby('gender')['avg_monthly_wage'].mean()
    female_avg = gender_avg['Female']
    male_avg = gender_avg['Male']
    print(f"✓ Grouping: Female avg = {female_avg:.2f}, Male avg = {male_avg:.2f}")

    # Calculations: gap expressed as a percentage of the male average.
    gap = (male_avg - female_avg) / male_avg * 100
    print(f"✓ Calculations: Overall gap = {gap:.2f}%")

    print("\n✓ All basic operations working correctly")

except Exception as exc:
    # Report the failure and show where it happened, without stopping the
    # rest of the notebook.
    print(f"\n✗ ERROR in basic operations: {exc}")
    import traceback
    traceback.print_exc()

## 4. Test Statistical Functions

In [None]:
print("Testing statistical functions...\n")

try:
    # Drop missing wages up front. The data-loading cell only warns about
    # NaNs, and scipy's default nan_policy='propagate' would otherwise return
    # nan statistics that this cell would still report with a ✓.
    female_wages = df.loc[df['gender'] == 'Female', 'avg_monthly_wage'].dropna()
    male_wages = df.loc[df['gender'] == 'Male', 'avg_monthly_wage'].dropna()
    if female_wages.empty or male_wages.empty:
        raise ValueError(
            "no non-missing 'avg_monthly_wage' values for 'Female' and/or 'Male'"
        )

    # T-test
    t_stat, p_value = stats.ttest_ind(male_wages, female_wages)
    print(f"✓ T-test: t={t_stat:.4f}, p={p_value:.6f}")

    # Mann-Whitney U. The alternative is spelled out so the p-value does not
    # depend on the installed scipy version's default.
    u_stat, u_pvalue = stats.mannwhitneyu(
        male_wages, female_wages, alternative='two-sided'
    )
    print(f"✓ Mann-Whitney U: U={u_stat:.4f}, p={u_pvalue:.6f}")

    # Linear regression of the yearly gender gap (%) on the year.
    from sklearn.linear_model import LinearRegression
    yearly = df.groupby(['year', 'gender'])['avg_monthly_wage'].mean().unstack()
    # Both gender labels are guaranteed to be columns here because the guard
    # above already required rows for each of them.
    yearly['gap_%'] = ((yearly['Male'] - yearly['Female']) / yearly['Male'] * 100)
    # Years lacking a mean for either gender yield NaN gaps, which
    # LinearRegression rejects; fit only on the years that have both.
    trend = yearly['gap_%'].dropna()
    if len(trend) < 2:
        raise ValueError(
            "need at least two years with both 'Female' and 'Male' wages "
            "to fit a trend"
        )
    X = trend.index.values.reshape(-1, 1)
    y = trend.values
    model = LinearRegression()
    model.fit(X, y)
    print(f"✓ Linear Regression: slope={model.coef_[0]:.4f}")

    print("\n✓ All statistical functions working correctly")

except Exception as e:
    print(f"\n✗ ERROR in statistical functions: {e}")
    import traceback
    traceback.print_exc()

## 5. Test Visualization

In [None]:
print("Testing visualization...\n")

try:
    # Single-axes figure (subplots defaults to a 1x1 grid).
    fig, ax = plt.subplots(figsize=(8, 4))

    # One bar per gender label; height is the mean monthly wage.
    gender_avg = df.groupby('gender')['avg_monthly_wage'].mean()
    bar_colors = ['#FF6B6B', '#4ECDC4']
    ax.bar(gender_avg.index, gender_avg.values, color=bar_colors)
    ax.set(
        title='Test Plot: Average Wages by Gender',
        ylabel='Average Monthly Wage',
    )
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    print("✓ Matplotlib visualization working")

except Exception as exc:
    # Report the failure together with its traceback; do not abort the
    # remaining cells.
    print(f"✗ ERROR in visualization: {exc}")
    import traceback
    traceback.print_exc()

## 6. Test Seaborn Visualization

In [None]:
print("Testing seaborn...\n")

try:
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    # Create a simple boxplot: wage distribution per country, split by gender.
    sns.boxplot(data=df, x='country', y='avg_monthly_wage', hue='gender', ax=ax)
    ax.set_title('Test Plot: Wage Distribution by Country and Gender')
    ax.set_xlabel('Country')
    ax.set_ylabel('Average Monthly Wage')
    # Rotate the country labels so long names do not overlap.
    plt.xticks(rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

    print("✓ Seaborn visualization working")

except Exception as e:
    # Any exception lands here, including a missing `df`, a missing column or
    # a failed seaborn import, so report it as an error with its traceback
    # (same handling as the other test cells) instead of describing it as a
    # harmless style issue.
    print(f"✗ ERROR in seaborn visualization: {e}")
    import traceback
    traceback.print_exc()

## 7. Summary & Recommendations

In [None]:
# Static closing banner: next steps on success and a troubleshooting checklist
# on failure. It does not inspect the results of the cells above.
rule = "=" * 80
summary_lines = [
    rule,
    "TEST SUMMARY",
    rule,
    "\nIf all tests above passed, you're ready to run the main analysis!",
    "\nNext steps:",
    "1. Open notebooks/05_comprehensive_analysis.ipynb",
    "2. Run all cells (Cell > Run All)",
    "3. Review outputs and visualizations",
    "\nIf you encountered errors:",
    "1. Check that all libraries are installed: pip install -r requirements.txt",
    "2. Verify data files exist in data/raw/",
    "3. Check Python version (requires 3.8+)",
    rule,
]
# One print with newline-joined lines emits the same text as one print per line.
print("\n".join(summary_lines))