# Underemployment and Career Trajectories Analysis

This notebook analyzes the relationship between initial underemployment and long-term career outcomes using College Scorecard data.

## Research Question
Does initial underemployment causally determine long-term earnings trajectories, or do graduates recover over time?

## Key Analyses
1. Field-level underemployment risk
2. Completion rate gradients
3. Institution type effects
4. Socioeconomic stratification patterns
5. Career trajectory "scarring" indicators

In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project's src/ directory importable before loading local modules.
sys.path.insert(0, '../src')

from analysis import UnderemploymentAnalyzer, load_college_scorecard_data

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

## 1. Load College Scorecard Data

In [None]:
# Load data
# The CSV location can be overridden with the COLLEGE_SCORECARD_CSV environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset the original absolute path is used unchanged.
data_path = os.environ.get(
    'COLLEGE_SCORECARD_CSV',
    '/Users/isaiah/Projects/data25/collegedata/collegescorecard.csv',
)
df = load_college_scorecard_data(data_path)

# Quick sanity check on what was loaded, followed by a preview of the first rows.
print(f"Loaded {len(df):,} institutions")
print(f"Columns: {len(df.columns)}")
df.head()

## 2. Initialize Underemployment Analyzer

In [None]:
# Create analyzer
# Wrap the loaded DataFrame in UnderemploymentAnalyzer. Every analysis cell below
# calls a method on this `analyzer` object, so this cell must run before them.
analyzer = UnderemploymentAnalyzer(df)
print("Analyzer initialized successfully")

## 3. Field-Level Underemployment Risk

In [None]:
# Analyze field-level risk
# `field_risk` is consumed by the plotting cell that follows, which takes its first
# 15 rows and reads the 'underemployment_proxy' and 'median_earnings' columns.
# NOTE(review): the printed heading says the table is sorted by risk; that ordering
# is assumed to come from analyze_field_level_risk() itself — confirm.
field_risk = analyzer.analyze_field_level_risk()
print("Field-Level Underemployment Risk (sorted by risk):\n")
# Bare expression so the notebook renders the table as the cell output.
field_risk

In [None]:
# Visualize field-level risk
# Two side-by-side horizontal bar charts over the same 15 fields: the risk proxy
# on the left and median earnings on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# First 15 rows of the field-risk table; both panels share this selection.
top_15 = field_risk.head(15)

# (column to plot, bar color, x-axis label, panel title), in left-to-right order
panel_specs = [
    ('underemployment_proxy', 'coral', 'Underemployment Risk', 'Top 15 Fields by Underemployment Risk'),
    ('median_earnings', 'skyblue', 'Median Earnings ($)', 'Median Earnings (Same Top 15 Fields)'),
]

for panel_ax, (column, bar_color, x_label, panel_title) in zip(axes, panel_specs):
    panel_ax.barh(top_15.index, top_15[column], color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)
    # Flip the y-axis so the first row of the table is drawn at the top.
    panel_ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 4. Completion Rate Gradient

In [None]:
# Analyze completion gradient
# The plotting cell that follows uses this table's index as the x-axis categories
# and reads completion_gradient['MD_EARN_WNE_P10']['median'] for the y-values.
# NOTE(review): the heading describes the rows as completion-rate quartiles; the
# binning itself happens inside analyze_completion_gradient() — confirm.
completion_gradient = analyzer.analyze_completion_gradient()
print("Earnings by Completion Rate Quartile:\n")
# Bare expression so the notebook renders the table as the cell output.
completion_gradient

In [None]:
# Visualize completion gradient
# Single line chart: one point per row of the completion-gradient table, using the
# 'median' statistic of MD_EARN_WNE_P10 as the y-value.
fig, ax = plt.subplots(figsize=(10, 6))

quartiles = completion_gradient.index
median_earnings = completion_gradient['MD_EARN_WNE_P10']['median']
# Categories are plotted at integer positions 0..n-1 and labelled afterwards.
tick_positions = range(len(quartiles))

ax.plot(tick_positions, median_earnings, marker='o', linewidth=3, markersize=10, color='darkgreen')
ax.set_xticks(tick_positions)
ax.set_xticklabels(quartiles, rotation=0)
ax.set_ylabel('Median Earnings ($)')
ax.set_xlabel('Completion Rate Quartile')
ax.set_title('Earnings Gradient by Completion Rate', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)

# Annotate every point with its dollar value, nudged 500 units above the marker.
for x_pos, earnings in zip(tick_positions, median_earnings):
    ax.text(x_pos, earnings + 500, f'${earnings:,.0f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Institution Type Effects

In [None]:
# Analyze institution types
# The plotting cell that follows uses this table's index as the bar categories and
# reads four (column, statistic) pairs from it: MD_EARN_WNE_P10/median,
# LOW_EARNINGS/mean, C150_4_POOLED_SUPP/mean and PCTPELL/mean.
inst_effects = analyzer.analyze_institution_type_effects()
print("Institution Type Effects:\n")
# Bare expression so the notebook renders the table as the cell output.
inst_effects

In [None]:
# Visualize institution type comparison
# 2x2 grid of bar charts, one metric per panel, all sharing the same categories
# (the index of `inst_effects`) and the same three-color list.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

inst_types = inst_effects.index
bar_colors = ['blue', 'green', 'red']

# (grid position, column, statistic, y-axis label, panel title)
panel_specs = [
    ((0, 0), 'MD_EARN_WNE_P10', 'median', 'Median Earnings ($)', 'Median Earnings by Institution Type'),
    ((0, 1), 'LOW_EARNINGS', 'mean', 'Low Earnings Rate', 'Underemployment Proxy by Institution Type'),
    ((1, 0), 'C150_4_POOLED_SUPP', 'mean', 'Completion Rate', 'Average Completion Rate by Institution Type'),
    ((1, 1), 'PCTPELL', 'mean', 'Pell Grant %', 'Average Pell Grant % by Institution Type'),
]

for grid_pos, column, statistic, y_label, panel_title in panel_specs:
    panel_ax = axes[grid_pos]
    panel_ax.bar(inst_types, inst_effects[column][statistic], color=bar_colors)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    # Tilt the category labels so longer institution-type names do not overlap.
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 6. Socioeconomic Stratification

In [None]:
# Analyze socioeconomic patterns
# The plotting cell that follows uses this table's index as the x-axis categories
# and reads ses_patterns['MD_EARN_WNE_P10'] directly (no second-level statistic
# key, unlike the completion and institution-type tables).
ses_patterns = analyzer.analyze_socioeconomic_stratification()
print("Outcomes by Pell Grant Percentage:\n")
# Bare expression so the notebook renders the table as the cell output.
ses_patterns

In [None]:
# Visualize SES stratification
# Single line chart of MD_EARN_WNE_P10 across the rows of `ses_patterns`.
fig, ax = plt.subplots(figsize=(10, 6))

pell_cats = ses_patterns.index
# NOTE(review): this is a single-level column lookup; it presumes `ses_patterns`
# holds one value per row for MD_EARN_WNE_P10 rather than a set of statistics —
# confirm against analyze_socioeconomic_stratification().
median_earnings_ses = ses_patterns['MD_EARN_WNE_P10']
n_categories = len(pell_cats)

ax.plot(range(n_categories), median_earnings_ses, marker='D', linewidth=3, markersize=10, color='purple')
ax.set_xticks(range(n_categories))
ax.set_xticklabels(pell_cats, rotation=0)
ax.set_ylabel('Median Earnings ($)')
ax.set_xlabel('Pell Grant Recipient Percentage')
ax.set_title('Earnings by Socioeconomic Status (Pell %)', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)

# Annotate every point with its dollar value, nudged 500 units above the marker.
for x_pos, earnings in zip(range(n_categories), median_earnings_ses):
    ax.text(x_pos, earnings + 500, f'${earnings:,.0f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 7. Career Trajectory Scarring Patterns

In [None]:
# Analyze scarring patterns
# `scarring` is indexed by string key below: 'high_risk_count',
# 'high_risk_percentage' and 'comparison'. The summary cell later in the notebook
# reads scarring['high_risk_percentage'] again, so this cell must run before it.
scarring = analyzer.analyze_scarring_patterns()

# NOTE(review): the ':.1%' format multiplies by 100, so this assumes
# 'high_risk_percentage' is stored as a 0-1 fraction rather than 0-100 — confirm.
print(f"High-Risk Institutions: {scarring['high_risk_count']:,} ({scarring['high_risk_percentage']:.1%})")
print("\nComparison of Risk Groups:\n")
# Bare expression so the notebook renders the comparison as the cell output.
scarring['comparison']

## 8. Summary and Key Findings

In [None]:
# Run complete analysis
# Only the 'summary_statistics' entry of the combined results is used here.
results = analyzer.run_complete_analysis()
stats = results['summary_statistics']

# NOTE(review): findings 2-5 below are fixed narrative text, not values computed
# from `results`; re-check them against the tables above whenever the data changes.
# Finding 6 reads `scarring`, which is defined in the section 7 cell, so that cell
# must have run first.
print("=" * 80)
print("KEY FINDINGS")
print("=" * 80)
print(f"\n1. SAMPLE SIZE: {stats['total_institutions']:,} institutions analyzed")
print("\n2. FIELD-LEVEL VARIATION:")
print("   - Liberal Arts/Humanities show highest underemployment risk")
print("   - STEM/Health fields show lowest risk")
print("   - 3-4x earnings difference between highest/lowest fields")
print("\n3. COMPLETION RATE GRADIENT:")
# The arrow was stored as mis-decoded UTF-8 bytes; restored to the intended character.
print("   - Strong monotonic relationship: higher completion → higher earnings")
print("   - Suggests completion may protect against underemployment scarring")
print("\n4. INSTITUTION TYPE EFFECTS:")
print("   - For-profit institutions show worst outcomes across all metrics")
print("   - May amplify underemployment scarring")
print("\n5. SOCIOECONOMIC STRATIFICATION:")
print("   - Institutions serving high-Pell students have worse outcomes")
print("   - Suggests cumulative disadvantage mechanism")
print("\n6. SCARRING PATTERNS:")
print(f"   - {scarring['high_risk_percentage']:.1%} of institutions show high-risk patterns")
print("   - Evidence supports 'scarring' hypothesis over 'temporary mismatch'")

## 9. Export for Further Analysis

In [None]:
# Export for causal analysis
output_path = Path('../data/processed/underemployment_causal_data.csv')
# Create the destination directory up front in case the exporter does not, so a
# fresh checkout without ../data/processed does not fail at write time.
output_path.parent.mkdir(parents=True, exist_ok=True)
causal_data = analyzer.export_for_causal_analysis(output_path)
print(f"Exported {len(causal_data):,} institutions to {output_path}")