# Advanced Gender Wage Gap Analysis

This notebook performs advanced statistical analysis on the cleaned wage data.

## Analysis Objectives
1. Statistical significance testing
2. Regional comparisons
3. Time series analysis
4. Factor analysis (education, sector, age)
5. Key insights summary and export of results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Notebook-wide display defaults: seaborn grid theme, default figure size,
# and no cap on the number of DataFrame columns shown.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pd.set_option('display.max_columns', None)

## 1. Load Cleaned Data

In [None]:
# Read the cleaned wage dataset, then report its size, year span and countries
df = pd.read_csv('../data/cleaned/macedonia_wage_cleaned.csv')

for summary_line in (
    f"Loaded {len(df)} records",
    f"Date range: {df['year'].min()} - {df['year'].max()}",
    f"Countries: {df['country'].unique()}",
):
    print(summary_line)

# Last expression of the cell: preview of the first rows
df.head()

## 2. Statistical Significance Testing

In [None]:
# T-test: Is the wage difference between genders statistically significant?
#
# Reads `df` (columns 'gender' and 'avg_monthly_wage') and defines
# `male_wages`, `female_wages`, `t_stat` and `p_value`.

# Drop missing wages up front: a single NaN would otherwise propagate into a
# NaN p-value, which the `p_value < 0.05` check would silently report as
# "not significant".
male_wages = df.loc[df['gender'] == 'Male', 'avg_monthly_wage'].dropna()
female_wages = df.loc[df['gender'] == 'Female', 'avg_monthly_wage'].dropna()

male_mean = male_wages.mean()
female_mean = female_wages.mean()
wage_difference = male_mean - female_mean

# Welch's t-test (equal_var=False): does not assume that male and female
# wages share the same variance.
t_stat, p_value = stats.ttest_ind(male_wages, female_wages, equal_var=False)

print("=" * 60)
print("GENDER WAGE GAP STATISTICAL SIGNIFICANCE TEST")
print("=" * 60)
print(f"\nMale wages:")
print(f"  Mean: {male_mean:.2f} MKD")
print(f"  Std Dev: {male_wages.std():.2f}")
print(f"  Sample size: {len(male_wages)}")

print(f"\nFemale wages:")
print(f"  Mean: {female_mean:.2f} MKD")
print(f"  Std Dev: {female_wages.std():.2f}")
print(f"  Sample size: {len(female_wages)}")

print(f"\nWage difference: {wage_difference:.2f} MKD")
print(f"Percentage gap: {(wage_difference / male_mean * 100):.2f}%")

print(f"\nT-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.6f}")

if np.isnan(p_value):
    # The test is undefined (too few observations or no variation), so do not
    # draw a conclusion in either direction.
    print(f"\n✗ Result: INCONCLUSIVE (test could not be computed)")
    print(f"  Check that each gender has at least two non-missing, non-constant wage observations.")
elif p_value < 0.05:
    print(f"\n✓ Result: STATISTICALLY SIGNIFICANT (p < 0.05)")
    print(f"  The wage gap between genders is statistically significant.")
else:
    print(f"\n✗ Result: NOT STATISTICALLY SIGNIFICANT (p >= 0.05)")
    print(f"  Cannot conclude a significant wage gap exists.")

## 3. Country-by-Country Analysis

In [None]:
# Statistical tests for each country
#
# Builds `results_df` with one row per country that has at least two
# non-missing wage observations for each gender, sorted by percentage gap
# (largest first).
print("\n" + "=" * 60)
print("COUNTRY-SPECIFIC WAGE GAP ANALYSIS")
print("=" * 60)

# Fixed column list so the frame keeps its schema (and can still be sorted)
# even when no country qualifies and `results` stays empty.
result_columns = ['Country', 'Male Avg', 'Female Avg', 'Gap (MKD)',
                  'Gap (%)', 'P-value', 'Significant']
results = []

for country in df['country'].dropna().unique():
    country_data = df[df['country'] == country]
    # Missing wages are dropped so one NaN cannot turn the whole test into NaN
    male = country_data.loc[country_data['gender'] == 'Male', 'avg_monthly_wage'].dropna()
    female = country_data.loc[country_data['gender'] == 'Female', 'avg_monthly_wage'].dropna()

    if len(male) > 1 and len(female) > 1:
        # Welch's t-test: no equal-variance assumption between the groups.
        # The statistic is discarded so the overall `t_stat` computed by the
        # global test is not overwritten by the last country in the loop.
        _, p_val = stats.ttest_ind(male, female, equal_var=False)
        male_avg = male.mean()
        female_avg = female.mean()

        results.append({
            'Country': country,
            'Male Avg': male_avg,
            'Female Avg': female_avg,
            'Gap (MKD)': male_avg - female_avg,
            'Gap (%)': (male_avg - female_avg) / male_avg * 100,
            'P-value': p_val,
            # A NaN p-value (e.g. zero variance) compares False and is
            # therefore reported as 'No'.
            'Significant': 'Yes' if p_val < 0.05 else 'No'
        })

results_df = (
    pd.DataFrame(results, columns=result_columns)
    .sort_values('Gap (%)', ascending=False)
)
print("\n", results_df.to_string(index=False))

In [None]:
# Visualize country comparison: grouped wage bars (left) and gap bars (right)
title_kw = {'fontsize': 14, 'fontweight': 'bold'}
label_kw = {'fontsize': 12}

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
wage_ax, gap_ax = axes

# Left panel: mean wage per country, one bar per gender
country_gender = (
    df.groupby(['country', 'gender'])['avg_monthly_wage']
    .mean()
    .unstack()
)
country_gender.plot(kind='bar', ax=wage_ax, color=['#e74c3c', '#3498db'])
wage_ax.set_title('Average Wages by Country and Gender', **title_kw)
wage_ax.set_xlabel('Country', **label_kw)
wage_ax.set_ylabel('Average Monthly Wage (MKD)', **label_kw)
wage_ax.legend(title='Gender')
wage_ax.tick_params(axis='x', rotation=45)
wage_ax.grid(axis='y', alpha=0.3)

# Right panel: percentage gap per country; red marks a significant gap,
# orange everything else
bar_colors = np.where(results_df['Significant'] == 'Yes', 'red', 'orange').tolist()
mean_gap = results_df['Gap (%)'].mean()

gap_ax.barh(results_df['Country'], results_df['Gap (%)'], color=bar_colors)
gap_ax.set_title('Gender Wage Gap by Country', **title_kw)
gap_ax.set_xlabel('Wage Gap (%)', **label_kw)
gap_ax.set_ylabel('Country', **label_kw)
gap_ax.grid(axis='x', alpha=0.3)
gap_ax.axvline(x=mean_gap, color='black', linestyle='--',
               label=f"Average: {mean_gap:.1f}%")
gap_ax.legend()

plt.tight_layout()
plt.show()

## 4. Education Level Impact

In [None]:
# Analyze wage gap by education level
#
# Builds `edu_df` with one row per education level that has non-missing wage
# data for both genders, sorted by percentage gap (largest first).
print("\n" + "=" * 60)
print("EDUCATION LEVEL IMPACT ON WAGE GAP")
print("=" * 60)

# Fixed column list so an empty result still has a 'Gap (%)' column to sort on
edu_columns = ['Education Level', 'Male Avg', 'Female Avg', 'Gap (%)', 'Sample Size']
edu_results = []

for edu in df['education_level'].dropna().unique():
    edu_data = df[df['education_level'] == edu]
    # Drop missing wages so an all-NaN group is skipped by the guard below
    # instead of contributing a NaN gap row.
    male = edu_data.loc[edu_data['gender'] == 'Male', 'avg_monthly_wage'].dropna()
    female = edu_data.loc[edu_data['gender'] == 'Female', 'avg_monthly_wage'].dropna()

    if len(male) > 0 and len(female) > 0:
        male_avg = male.mean()
        female_avg = female.mean()

        edu_results.append({
            'Education Level': edu,
            'Male Avg': male_avg,
            'Female Avg': female_avg,
            'Gap (%)': (male_avg - female_avg) / male_avg * 100,
            # Counts every row at this education level, not only the rows
            # that entered the two means.
            'Sample Size': len(edu_data)
        })

edu_df = (
    pd.DataFrame(edu_results, columns=edu_columns)
    .sort_values('Gap (%)', ascending=False)
)
print("\n", edu_df.to_string(index=False))

In [None]:
# Visualize education impact: grouped wage bars (left) and gap bars (right)
title_kw = {'fontsize': 14, 'fontweight': 'bold'}
label_kw = {'fontsize': 12}

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
wage_ax, gap_ax = axes

# Left panel: mean wage per education level, one bar per gender
edu_gender = (
    df.groupby(['education_level', 'gender'])['avg_monthly_wage']
    .mean()
    .unstack()
)
edu_gender.plot(kind='bar', ax=wage_ax, color=['#e74c3c', '#3498db'], width=0.7)
wage_ax.set_title('Average Wages by Education Level and Gender', **title_kw)
wage_ax.set_xlabel('Education Level', **label_kw)
wage_ax.set_ylabel('Average Monthly Wage (MKD)', **label_kw)
wage_ax.legend(title='Gender')

# Right panel: percentage gap per education level, in edu_df's sort order
gap_ax.bar(edu_df['Education Level'], edu_df['Gap (%)'], color='coral')
gap_ax.set_title('Wage Gap by Education Level', **title_kw)
gap_ax.set_xlabel('Education Level', **label_kw)
gap_ax.set_ylabel('Wage Gap (%)', **label_kw)

# Tick rotation and horizontal grid lines are the same on both panels
for panel in (wage_ax, gap_ax):
    panel.tick_params(axis='x', rotation=45)
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Sector Analysis (Public vs Private)

In [None]:
# Compare public vs private sector
#
# Prints, for every sector present in `df`, the mean wage per gender, the
# percentage gap relative to the male mean, and the number of rows.
print("\n" + "=" * 60)
print("SECTOR ANALYSIS: PUBLIC vs PRIVATE")
print("=" * 60)

# dropna(): a missing sector is a float NaN, which has no .upper() and would
# match no rows anyway.
for sector in df['sector'].dropna().unique():
    print(f"\n{str(sector).upper()} SECTOR:")
    sector_data = df[df['sector'] == sector]
    male = sector_data.loc[sector_data['gender'] == 'Male', 'avg_monthly_wage'].dropna()
    female = sector_data.loc[sector_data['gender'] == 'Female', 'avg_monthly_wage'].dropna()

    # Same guard as the education analysis: a gap needs wages for both genders
    if male.empty or female.empty:
        print(f"  Wage gap: n/a (no wage data for one of the genders)")
        print(f"  Sample size: {len(sector_data)}")
        continue

    male_avg = male.mean()
    female_avg = female.mean()
    gap_percent = (male_avg - female_avg) / male_avg * 100

    print(f"  Male average: {male_avg:.2f} MKD")
    print(f"  Female average: {female_avg:.2f} MKD")
    print(f"  Wage gap: {gap_percent:.2f}%")
    print(f"  Sample size: {len(sector_data)}")

In [None]:
# Sector comparison visualization: grouped wage bars (left) and wage
# histograms per sector/gender (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot 1: Sector wages by gender
sector_gender = df.groupby(['sector', 'gender'])['avg_monthly_wage'].mean().unstack()
sector_gender.plot(kind='bar', ax=axes[0], color=['#e74c3c', '#3498db'], width=0.6)
axes[0].set_title('Average Wages by Sector and Gender', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Sector', fontsize=12)
axes[0].set_ylabel('Average Monthly Wage (MKD)', fontsize=12)
axes[0].legend(title='Gender')
axes[0].tick_params(axis='x', rotation=0)
axes[0].grid(axis='y', alpha=0.3)

# Plot 2: Distribution of wages by sector
# Collect every sector/gender series first and draw them with ONE hist call,
# so all groups share the same bin edges and are drawn side by side. One call
# per sector would compute separate bin edges and stack partly overlapping
# bars that cannot be compared across sectors.
wage_series = []
wage_labels = []
for sector in df['sector'].dropna().unique():
    sector_data = df[df['sector'] == sector]
    for gender in ('Male', 'Female'):
        wages = sector_data.loc[sector_data['gender'] == gender, 'avg_monthly_wage'].dropna()
        if not wages.empty:
            wage_series.append(wages)
            wage_labels.append(f'{sector} - {gender}')

if wage_series:
    axes[1].hist(wage_series, label=wage_labels, alpha=0.6, bins=10)
    axes[1].legend()

axes[1].set_title('Wage Distribution by Sector and Gender', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Average Monthly Wage (MKD)', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Time Series Analysis

In [None]:
# Analyze trends over time: one row per year with the mean wage of each
# gender and the gap as a percentage of the male mean
is_male = df['gender'] == 'Male'
is_female = df['gender'] == 'Female'

yearly_gap = []
for year in sorted(df['year'].unique()):
    in_year = df['year'] == year
    male = df.loc[in_year & is_male, 'avg_monthly_wage'].mean()
    female = df.loc[in_year & is_female, 'avg_monthly_wage'].mean()

    yearly_gap.append({
        'Year': year,
        'Male Avg': male,
        'Female Avg': female,
        'Gap (%)': (male - female) / male * 100,
    })

yearly_df = pd.DataFrame(yearly_gap)
print("\nYearly Wage Gap Trends:")
print(yearly_df.to_string(index=False))

In [None]:
# Time series visualization: wage levels per gender (top) and percentage gap
# (bottom), both read from `yearly_df`
fig, axes = plt.subplots(2, 1, figsize=(12, 10))

# Take the year span for the titles from the data instead of hard-coding it,
# so the titles stay correct when the dataset covers different years.
year_range = f"{yearly_df['Year'].min()}-{yearly_df['Year'].max()}"
mean_gap = yearly_df['Gap (%)'].mean()

# Plot 1: Wage trends over time
axes[0].plot(yearly_df['Year'], yearly_df['Male Avg'], marker='o', label='Male', linewidth=2, markersize=8)
axes[0].plot(yearly_df['Year'], yearly_df['Female Avg'], marker='s', label='Female', linewidth=2, markersize=8)
axes[0].set_title(f'Average Wage Trends by Gender ({year_range})', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Year', fontsize=12)
axes[0].set_ylabel('Average Monthly Wage (MKD)', fontsize=12)
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)
# One tick per observed year
axes[0].set_xticks(yearly_df['Year'])

# Plot 2: Gap percentage trend
axes[1].plot(yearly_df['Year'], yearly_df['Gap (%)'], marker='D', color='coral',
             linewidth=2, markersize=8)
axes[1].set_title(f'Gender Wage Gap Trend ({year_range})', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Year', fontsize=12)
axes[1].set_ylabel('Wage Gap (%)', fontsize=12)
axes[1].grid(True, alpha=0.3)
axes[1].set_xticks(yearly_df['Year'])
# Dashed reference line at the mean of the yearly gaps
axes[1].axhline(y=mean_gap, color='red', linestyle='--',
                label=f"Average: {mean_gap:.2f}%")
axes[1].legend()

plt.tight_layout()
plt.show()

## 7. North Macedonia Deep Dive

In [None]:
# Focus on North Macedonia: overall gap, then the gap per sector and per
# education level, all computed on the country's rows only
macedonia = df.loc[df['country'] == 'North Macedonia'].copy()

print("\n" + "=" * 60)
print("NORTH MACEDONIA DETAILED ANALYSIS")
print("=" * 60)
print(f"\nTotal records: {len(macedonia)}")
print(f"Years covered: {macedonia['year'].min()} - {macedonia['year'].max()}")

# Overall statistics
male_mk = macedonia.loc[macedonia['gender'] == 'Male', 'avg_monthly_wage']
female_mk = macedonia.loc[macedonia['gender'] == 'Female', 'avg_monthly_wage']

print(f"\nOverall Statistics:")
print(f"  Male average: {male_mk.mean():.2f} MKD")
print(f"  Female average: {female_mk.mean():.2f} MKD")
print(f"  Absolute gap: {(male_mk.mean() - female_mk.mean()):.2f} MKD")
print(f"  Percentage gap: {((male_mk.mean() - female_mk.mean()) / male_mk.mean() * 100):.2f}%")

# The sector and education breakdowns differ only in their heading and the
# column they split on, so both run through the same loop.
breakdowns = (
    ("\nBy Sector:", 'sector'),
    ("\nBy Education:", 'education_level'),
)
for heading, column in breakdowns:
    print(heading)
    for level in macedonia[column].unique():
        subset = macedonia[macedonia[column] == level]
        male = subset.loc[subset['gender'] == 'Male', 'avg_monthly_wage'].mean()
        female = subset.loc[subset['gender'] == 'Female', 'avg_monthly_wage'].mean()
        gap = (male - female) / male * 100
        print(f"  {level}: {gap:.2f}% gap")

In [None]:
# North Macedonia heatmap: percentage gap for every sector / education pair
macedonia_pivot = macedonia.pivot_table(
    index=['sector', 'education_level'],
    columns='gender',
    values='avg_monthly_wage',
    aggfunc='mean',
)

# Gap as a percentage of the male mean wage in each cell
male_mean = macedonia_pivot['Male']
female_mean = macedonia_pivot['Female']
macedonia_pivot['Gap (%)'] = (male_mean - female_mean) / male_mean * 100

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    macedonia_pivot[['Gap (%)']],
    annot=True,
    fmt='.1f',
    cmap='RdYlGn_r',
    cbar_kws={'label': 'Wage Gap (%)'},
    ax=ax,
)
ax.set_title('North Macedonia: Wage Gap by Sector and Education', fontsize=14, fontweight='bold')
ax.set_ylabel('Sector / Education Level', fontsize=12)
plt.tight_layout()
plt.show()

## 8. Key Insights Summary

In [None]:
# Summary of the findings computed in the earlier cells. Reads `p_value`,
# `results_df`, `edu_df`, `yearly_df` and `df`; every statement is derived
# from those values rather than hard-coded.
print("\n" + "=" * 70)
print("KEY INSIGHTS FROM ANALYSIS")
print("=" * 70)

print("\n1. STATISTICAL SIGNIFICANCE:")
# Report what the overall t-test actually found instead of asserting
# significance unconditionally.
if pd.isna(p_value):
    print(f"   - Significance of the gender wage gap could not be determined")
elif p_value < 0.05:
    print(f"   - Gender wage gap is statistically significant (p < 0.05)")
else:
    print(f"   - Gender wage gap is NOT statistically significant (p >= 0.05)")
print(f"   - Average gap across all countries: {results_df['Gap (%)'].mean():.2f}%")

print("\n2. COUNTRY VARIATIONS:")
# results_df is sorted by gap, largest first, so the first and last rows are
# the extremes; guard against an empty table.
if results_df.empty:
    print(f"   - No country had enough data for a comparison")
else:
    highest = results_df.iloc[0]
    lowest = results_df.iloc[-1]
    print(f"   - Highest gap: {highest['Country']} ({highest['Gap (%)']:.2f}%)")
    print(f"   - Lowest gap: {lowest['Country']} ({lowest['Gap (%)']:.2f}%)")

print("\n3. EDUCATION IMPACT:")
for label, level in (("University educated gap", 'University'),
                     ("High school gap", 'High School')):
    level_gap = edu_df.loc[edu_df['Education Level'] == level, 'Gap (%)']
    if level_gap.empty:
        # The level is absent from the data; report that instead of failing
        print(f"   - {label}: n/a (no '{level}' records)")
    else:
        print(f"   - {label}: {level_gap.iloc[0]:.2f}%")

print("\n4. SECTOR DIFFERENCES:")
# Mean wage per sector and gender, then the gap relative to the male mean.
# reindex() guarantees both gender columns exist (NaN if one is missing).
sector_means = (
    df.groupby(['sector', 'gender'])['avg_monthly_wage']
    .mean()
    .unstack()
    .reindex(columns=['Male', 'Female'])
)
sector_gaps = (sector_means['Male'] - sector_means['Female']) / sector_means['Male'] * 100
for sector, gap in sector_gaps.items():
    print(f"   - {sector}: {gap:.2f}% gap")

print("\n5. TEMPORAL TRENDS:")
# Change between the first and last year; needs two years with a defined gap
if len(yearly_df) >= 2:
    trend_change = yearly_df.iloc[-1]['Gap (%)'] - yearly_df.iloc[0]['Gap (%)']
else:
    trend_change = float('nan')

if pd.isna(trend_change):
    print(f"   - Not enough yearly data to assess a trend")
elif trend_change < 0:
    print(f"   - Gap is DECREASING: {abs(trend_change):.2f} percentage points over {len(yearly_df)} years")
elif trend_change > 0:
    print(f"   - Gap is INCREASING: {trend_change:.2f} percentage points over {len(yearly_df)} years")
else:
    print(f"   - Gap is UNCHANGED over {len(yearly_df)} years")

print("\n" + "=" * 70)

## 9. Export Analysis Results

In [None]:
# Save analysis results: each summary table goes to its own CSV file,
# written without the index column
exports = {
    '../data/cleaned/country_analysis.csv': results_df,
    '../data/cleaned/education_analysis.csv': edu_df,
    '../data/cleaned/yearly_trends.csv': yearly_df,
}
for csv_path, table in exports.items():
    table.to_csv(csv_path, index=False)

print("✓ All analysis results exported to data/cleaned/")
print("✓ Analysis complete!")