# Labor Dynamics Analysis: Data Exploration

This notebook provides an initial exploration of college enrollment and employment data to understand the relationship between higher education participation and labor market dynamics.

## Objectives
1. Load and examine college enrollment data
2. Load and examine employment/unemployment data
3. Perform basic data quality assessment
4. Explore temporal trends in both datasets
5. Identify potential relationships for deeper analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
from pathlib import Path

# Add src to path for imports
sys.path.append('../src')

# Import our custom modules
from data_collection import fetch_enrollment_data, fetch_employment_data, process_and_merge

# Plot styling. Order matters here: the style sheet is applied first, then the
# palette, then the explicit rcParams overrides.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'figure.dpi': 100,
})

# Silence every warning category so cell output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the plotting/data libraries - narrow it if those should stay visible.
warnings.filterwarnings('ignore')

for status_line in ("Libraries imported successfully!", "Analysis Period: 2000-2024"):
    print(status_line)

## 1. Data Collection

Let's start by collecting enrollment and employment data for our analysis period.

In [None]:
# Define analysis period
ANALYSIS_YEARS = range(2000, 2025)

# Fixed seed for the synthetic employment fallback below, so that a
# Restart & Run All reproduces the same numbers, plots and correlations.
SYNTHETIC_SEED = 42

print(f"Collecting data for years {ANALYSIS_YEARS.start} to {ANALYSIS_YEARS.stop-1}...")

# Fetch enrollment data (using synthetic data for development)
# NOTE(review): fetch_enrollment_data(use_synthetic=True) may draw random
# numbers of its own - confirm that it is seeded as well.
enrollment_data = fetch_enrollment_data(ANALYSIS_YEARS, use_synthetic=True)
print(f"✓ Enrollment data collected: {len(enrollment_data)} records")

# Fetch employment data (note: will need BLS API key for real data)
try:
    employment_data = fetch_employment_data(ANALYSIS_YEARS)
    print(f"✓ Employment data collected: {len(employment_data)} records")
except Exception as e:
    # Deliberate best-effort: any fetch failure falls back to synthetic data
    # so the rest of the notebook can still run. The cause is printed, not hidden.
    print(f"⚠ Employment data collection failed: {e}")
    print("Using synthetic employment data for demonstration...")

    # Create synthetic employment data for demo, using a local seeded generator
    # rather than the global NumPy random state.
    rng = np.random.default_rng(SYNTHETIC_SEED)
    years = list(ANALYSIS_YEARS)
    employment_data = pd.DataFrame({
        'year': years,
        'civilian_labor_force': rng.normal(160000, 5000, len(years)),
        'employment_level': rng.normal(150000, 5000, len(years)),
        # A normal draw can dip below zero; a negative unemployment rate is
        # meaningless, so floor it at 0.
        'unemployment_rate': rng.normal(6.0, 2.0, len(years)).clip(min=0),
    })
    print(f"✓ Synthetic employment data created: {len(employment_data)} records")

print("\nData collection completed!")

## 2. Data Overview

Let's examine the structure and basic statistics of our datasets.

In [None]:
# Enrollment data overview: structure first, then a sample and summary statistics
section_rule = "=" * 50
print(section_rule)
print("ENROLLMENT DATA OVERVIEW")
print(section_rule)

print(f"Shape: {enrollment_data.shape}")
enrollment_column_names = list(enrollment_data.columns)
print(f"Columns: {enrollment_column_names}")
print(f"Data types:\n{enrollment_data.dtypes}")

enrollment_years = enrollment_data['year']
print(f"\nYear range: {enrollment_years.min()} - {enrollment_years.max()}")

# Rich-display a small sample of rows
print("\nFirst 5 rows:")
display(enrollment_data.head(5))

# Rich-display descriptive statistics
print("\nBasic Statistics:")
display(enrollment_data.describe())

In [None]:
# Employment data overview: structure first, then a sample and summary statistics
section_rule = "=" * 50
print(section_rule)
print("EMPLOYMENT DATA OVERVIEW")
print(section_rule)

print(f"Shape: {employment_data.shape}")
employment_column_names = list(employment_data.columns)
print(f"Columns: {employment_column_names}")
print(f"Data types:\n{employment_data.dtypes}")

employment_years = employment_data['year']
print(f"\nYear range: {employment_years.min()} - {employment_years.max()}")

# Rich-display a small sample of rows
print("\nFirst 5 rows:")
display(employment_data.head(5))

# Rich-display descriptive statistics
print("\nBasic Statistics:")
display(employment_data.describe())

## 3. Data Quality Assessment

Check for missing values and duplicate rows. (Outlier screening is not yet implemented in this notebook.)

In [None]:
# Check for missing values
def _report_missing(frame, label):
    """Print the per-column null counts of `frame` under a `label` heading.

    Only columns with at least one null are listed. When there are none, an
    explicit all-clear line is printed instead of an empty Series repr.
    Returns the full per-column null-count Series.
    """
    null_counts = frame.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    print(f"\n{label} Data Missing Values:")
    if columns_with_nulls.empty:
        print("  None - no missing values in any column")
    else:
        print(columns_with_nulls)
    return null_counts


print("MISSING VALUES ANALYSIS")
print("=" * 30)

enrollment_missing = _report_missing(enrollment_data, "Enrollment")
employment_missing = _report_missing(employment_data, "Employment")

# Check for duplicates (rows that are identical across every column)
print(f"\nDuplicate rows in enrollment data: {enrollment_data.duplicated().sum()}")
print(f"Duplicate rows in employment data: {employment_data.duplicated().sum()}")

## 4. Temporal Trends Analysis

Visualize trends in enrollment and employment over time.

In [None]:
# Create comprehensive trend plots
def _mark_unavailable(ax, title, message):
    """Put a centred notice on `ax` so a panel without data is not left blank."""
    ax.text(0.5, 0.5, message, ha='center', va='center',
            transform=ax.transAxes, fontsize=12)
    ax.set_title(title, fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Labor Dynamics: Enrollment vs Employment Trends (2000-2024)', fontsize=16, fontweight='bold')

# Plot 1: Total Enrollment Trend
ax_total = axes[0, 0]
if 'total_enrollment' in enrollment_data.columns:
    enrollment_millions = enrollment_data['total_enrollment'] / 1_000_000
    ax_total.plot(enrollment_data['year'], enrollment_millions,
                  marker='o', linewidth=2, markersize=4, color='blue')
    ax_total.set_title('Total College Enrollment', fontweight='bold')
    ax_total.set_xlabel('Year')
    ax_total.set_ylabel('Enrollment (Millions)')
    ax_total.grid(True, alpha=0.3)

    # Add trend line: degree-1 least-squares fit of enrollment against year
    trend_coeffs = np.polyfit(enrollment_data['year'], enrollment_millions, 1)
    trend_line = np.poly1d(trend_coeffs)
    ax_total.plot(enrollment_data['year'], trend_line(enrollment_data['year']),
                  "--", alpha=0.7, color='red')
else:
    _mark_unavailable(ax_total, 'Total College Enrollment',
                      'Total enrollment\ndata not available')

# Plot 2: Employment Level Trend
ax_employment = axes[0, 1]
if 'employment_level' in employment_data.columns:
    # NOTE(review): the axis label assumes employment_level is a raw head count.
    # If the source already reports thousands, this axis is really in millions - confirm.
    ax_employment.plot(employment_data['year'], employment_data['employment_level'] / 1_000,
                       marker='s', linewidth=2, markersize=4, color='green')
    ax_employment.set_title('Employment Level', fontweight='bold')
    ax_employment.set_xlabel('Year')
    ax_employment.set_ylabel('Employment (Thousands)')
    ax_employment.grid(True, alpha=0.3)
else:
    _mark_unavailable(ax_employment, 'Employment Level',
                      'Employment level\ndata not available')

# Plot 3: Unemployment Rate
ax_unemployment = axes[1, 0]
if 'unemployment_rate' in employment_data.columns:
    ax_unemployment.plot(employment_data['year'], employment_data['unemployment_rate'],
                         marker='^', linewidth=2, markersize=4, color='red')
    ax_unemployment.set_title('Unemployment Rate', fontweight='bold')
    ax_unemployment.set_xlabel('Year')
    ax_unemployment.set_ylabel('Unemployment Rate (%)')
    ax_unemployment.grid(True, alpha=0.3)
else:
    _mark_unavailable(ax_unemployment, 'Unemployment Rate',
                      'Unemployment rate\ndata not available')

# Plot 4: Enrollment Breakdown (if available)
ax_breakdown = axes[1, 1]
if all(col in enrollment_data.columns for col in ['undergraduate', 'graduate']):
    ax_breakdown.plot(enrollment_data['year'], enrollment_data['undergraduate'] / 1_000_000,
                      marker='o', linewidth=2, label='Undergraduate', alpha=0.8)
    ax_breakdown.plot(enrollment_data['year'], enrollment_data['graduate'] / 1_000_000,
                      marker='s', linewidth=2, label='Graduate', alpha=0.8)
    ax_breakdown.set_title('Enrollment by Level', fontweight='bold')
    ax_breakdown.set_xlabel('Year')
    ax_breakdown.set_ylabel('Enrollment (Millions)')
    ax_breakdown.legend()
    ax_breakdown.grid(True, alpha=0.3)
else:
    _mark_unavailable(ax_breakdown, 'Enrollment Breakdown',
                      'Enrollment breakdown\ndata not available')

plt.tight_layout()
plt.show()

## 5. Initial Correlation Analysis

Merge the datasets and look for potential relationships.

In [None]:
# Process and merge datasets
print("Processing and merging datasets...")
# save_cache=True is passed through to the project helper.
# NOTE(review): presumably this writes a cache file as a side effect - confirm in data_collection.
merged_data = process_and_merge(enrollment_data, employment_data, save_cache=True)

if not merged_data.empty:
    print(f"✓ Merged dataset created: {merged_data.shape}")
    print(f"Columns: {list(merged_data.columns)}")

    # Display merged data sample
    print("\nMerged Data Sample:")
    display(merged_data.head(10))
else:
    # An empty frame is the only failure signal checked here; later cells
    # guard on the same condition.
    print("⚠ Failed to merge datasets")

In [None]:
# Correlation analysis
# Minimum |r| for an enrollment/employment pair to be listed below.
KEY_CORRELATION_THRESHOLD = 0.3

if not merged_data.empty:
    # Select numeric columns for correlation
    numeric_cols = merged_data.select_dtypes(include=[np.number]).columns
    correlation_matrix = merged_data[numeric_cols].corr()

    # Create correlation heatmap; the mask hides the upper triangle and the
    # diagonal so each pair is shown once.
    fig, ax = plt.subplots(figsize=(12, 10))
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
    sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
                square=True, fmt='.3f', cbar_kws={"shrink": .8}, ax=ax)
    ax.set_title('Correlation Matrix: Enrollment vs Employment Metrics', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Highlight key correlations
    print(f"\nKey Correlations (|r| > {KEY_CORRELATION_THRESHOLD}):")
    print("=" * 40)

    # Find correlations between enrollment and employment variables.
    # 'employment' is a substring of 'unemployment', so a single test matches both.
    enrollment_cols = [col for col in numeric_cols if 'enrollment' in col]
    employment_cols = [col for col in numeric_cols if 'employment' in col]

    for enroll_col in enrollment_cols:
        for employ_col in employment_cols:
            # A column whose name contains both keywords lands in both lists;
            # skip pairing it with itself.
            if enroll_col == employ_col:
                continue
            corr_val = correlation_matrix.loc[enroll_col, employ_col]
            # NaN correlations (e.g. from a constant column) fail this test and are skipped.
            if abs(corr_val) > KEY_CORRELATION_THRESHOLD:
                print(f"{enroll_col} ↔ {employ_col}: {corr_val:.3f}")
else:
    print("Cannot perform correlation analysis - merged dataset is empty")

## 6. Key Insights and Next Steps

Summarize initial findings and outline areas for deeper analysis.

In [None]:
# Generate summary insights
print("="*60)
print("INITIAL DATA EXPLORATION SUMMARY")
print("="*60)

if not enrollment_data.empty:
    print("\n📊 ENROLLMENT DATA INSIGHTS:")
    print(f"  • Time period: {enrollment_data['year'].min()} - {enrollment_data['year'].max()}")
    print(f"  • Data points: {len(enrollment_data)} years")

    if 'total_enrollment' in enrollment_data.columns:
        # Order by year so the first/last rows really are the period endpoints,
        # whatever order the loader returned them in.
        enrollment_by_year = enrollment_data.sort_values('year')['total_enrollment']
        first_total = enrollment_by_year.iloc[0]
        last_total = enrollment_by_year.iloc[-1]

        total_change = last_total - first_total
        pct_change = (total_change / first_total) * 100
        print(f"  • Total enrollment change: {total_change:,.0f} ({pct_change:+.1f}%)")

        # Compound annual growth rate over (rows - 1) periods. With a single
        # row there is no period to grow over, and the exponent 1/0 would raise.
        n_periods = len(enrollment_by_year) - 1
        if n_periods > 0:
            avg_annual_growth = ((last_total / first_total) ** (1 / n_periods) - 1) * 100
            print(f"  • Average annual growth: {avg_annual_growth:+.2f}%")
        else:
            print("  • Average annual growth: n/a (need at least two years of data)")

if not employment_data.empty:
    print("\n💼 EMPLOYMENT DATA INSIGHTS:")
    print(f"  • Time period: {employment_data['year'].min()} - {employment_data['year'].max()}")
    print(f"  • Data points: {len(employment_data)} years")

    if 'unemployment_rate' in employment_data.columns:
        avg_unemployment = employment_data['unemployment_rate'].mean()
        min_unemployment = employment_data['unemployment_rate'].min()
        max_unemployment = employment_data['unemployment_rate'].max()
        print(f"  • Average unemployment rate: {avg_unemployment:.1f}%")
        print(f"  • Unemployment range: {min_unemployment:.1f}% - {max_unemployment:.1f}%")

if not merged_data.empty:
    # correlation_matrix is created by the correlation cell under this same
    # non-empty condition.
    print("\n🔗 RELATIONSHIP INSIGHTS:")
    print(f"  • Successfully merged {len(merged_data)} years of data")
    print(f"  • {len(correlation_matrix.columns)} metrics available for analysis")

    # Count significant correlations over the strict upper triangle only:
    # each unordered pair is counted once and the diagonal is never included,
    # so a NaN diagonal (constant column) cannot skew the totals.
    abs_corr = np.abs(correlation_matrix.values)
    upper_pairs = np.triu(np.ones_like(abs_corr, dtype=bool), k=1)
    pair_strengths = abs_corr[upper_pairs]
    strong_correlations = int(np.sum(pair_strengths > 0.5))
    moderate_correlations = int(np.sum((pair_strengths > 0.3) & (pair_strengths <= 0.5)))
    print(f"  • Strong correlations (|r| > 0.5): {strong_correlations}")
    print(f"  • Moderate correlations (0.3 < |r| ≤ 0.5): {moderate_correlations}")

print("\n📋 NEXT STEPS FOR ANALYSIS:")
print("  1. Detailed enrollment trend analysis by demographics and institution type")
print("  2. Employment pattern analysis across different economic cycles")
print("  3. Regional/geographic analysis of enrollment-employment relationships")
print("  4. Time-lagged correlation analysis (enrollment → employment outcomes)")
print("  5. Economic event impact analysis (recessions, policy changes)")
print("  6. Predictive modeling for future trends")

print("\n✅ Data exploration completed successfully!")
print("   Proceed to notebook 02_enrollment_analysis.ipynb for detailed enrollment analysis.")

---

## Data Export

Save cleaned and merged data for use in subsequent analyses.

In [None]:
# Save data for next notebooks
# The path is relative to the notebook's working directory; it is created if missing.
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# Save individual datasets exactly as loaded above (no row index column)
enrollment_data.to_csv(output_dir / 'clean_enrollment_data.csv', index=False)
employment_data.to_csv(output_dir / 'clean_employment_data.csv', index=False)

# The merged file is written only when the merge step produced rows
if not merged_data.empty:
    merged_data.to_csv(output_dir / 'merged_labor_education_data.csv', index=False)
    print(f"✓ Saved merged dataset: {len(merged_data)} records")

print(f"✓ Data saved to {output_dir}")
print("✅ Analysis complete - ready for next phase!")