In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
import polars as pl
from itertools import combinations

In [17]:
data = pl.read_parquet("../data/processed/wioa_data_tier2.parquet")

In [20]:
# Prepare data for modeling (encode categorical variables)
tier2_data = data.filter(pl.col("outcome_tier") == "Tier 2").to_pandas()
tier2_data = data.filter(pl.col("outcome_tier") == "Tier 2")

In [7]:
dimensions = [
    'low_income_x', 'employment_status_x', 'received_training_x',
    'race_ethnicity_x', 'sex_x', 'age_x', 'highest_education_level_x',
    'training_service_type_1_x', 'industry_title_x'
]

In [8]:
# Encode categorical variables
encoders = {}
X = tier2_data[dimensions].copy()
for col in dimensions:
    if X[col].dtype == 'object':
        encoders[col] = LabelEncoder()
        X[col] = encoders[col].fit_transform(X[col].astype(str))

In [9]:
# For each outcome variable, get feature importance
outcomes = ['bin_r_cog_industry_y', 'bin_r_man_industry_y', 'bin_offshor_industry_y', 'bin_wages_mean_y']

importance_scores = {}
for outcome in outcomes:
    y = tier2_data[outcome].dropna()
    X_clean = X.loc[y.index]
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X_clean, y)
    
    importance_scores[outcome] = dict(zip(dimensions, rf.feature_importances_))

In [10]:
# Average importance across all outcomes
avg_importance = {dim: np.mean([importance_scores[outcome][dim] for outcome in outcomes]) 
                  for dim in dimensions}

# Sort by importance
sorted_dims = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)
print("Dimension importance rankings:")
for dim, score in sorted_dims:
    print(f"{dim}: {score:.4f}")

Dimension importance rankings:
industry_title_x: 0.5096
highest_education_level_x: 0.1304
race_ethnicity_x: 0.1076
age_x: 0.0869
employment_status_x: 0.0531
sex_x: 0.0400
training_service_type_1_x: 0.0292
received_training_x: 0.0223
low_income_x: 0.0209


In [13]:
correlation_matrix = X.corr()
high_corr_pairs = []

for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        if abs(correlation_matrix.iloc[i, j]) > 0.5:  # threshold for high correlation
            high_corr_pairs.append((correlation_matrix.columns[i], correlation_matrix.columns[j]))

print("Highly correlated dimension pairs:")
for pair in high_corr_pairs:
    print(f"{pair[0]} <-> {pair[1]}: {correlation_matrix.loc[pair[0], pair[1]]:.3f}")

Highly correlated dimension pairs:
received_training_x <-> training_service_type_1_x: 0.666


In [12]:
high_corr_pairs

[]

In [14]:
from scipy.stats import chi2_contingency
import pandas as pd

# Test association between each dimension and outcomes
significance_scores = {}

for dim in dimensions:
    dim_scores = []
    for outcome in outcomes:
        # Create contingency table
        contingency = pd.crosstab(tier2_data[dim], tier2_data[outcome])
        
        # Chi-square test
        chi2, p_value, dof, expected = chi2_contingency(contingency)
        
        # Use negative log p-value as importance score (higher = more significant)
        importance = -np.log10(p_value + 1e-10)  # Add small value to avoid log(0)
        dim_scores.append(importance)
    
    significance_scores[dim] = np.mean(dim_scores)

# Sort by significance
sorted_significance = sorted(significance_scores.items(), key=lambda x: x[1], reverse=True)
print("\nDimension significance rankings:")
for dim, score in sorted_significance:
    print(f"{dim}: {score:.4f}")


Dimension significance rankings:
low_income_x: 10.0000
employment_status_x: 10.0000
race_ethnicity_x: 10.0000
sex_x: 10.0000
age_x: 10.0000
highest_education_level_x: 10.0000
training_service_type_1_x: 10.0000
industry_title_x: 10.0000
received_training_x: 7.8154


In [68]:
def consolidate_multiple_columns(data, columns, min_percentage=0.02):
    """Consolidate multiple columns at once, overwriting originals"""
    
    consolidated_data = data
    consolidated_columns = []
    non_consolidated_columns = []
    
    for column in columns:
        # Skip if column is not string/categorical
        if consolidated_data[column].dtype not in [pl.String, pl.Categorical]:
            non_consolidated_columns.append(column)
            print(f"{column}: skipped (not categorical)")
            continue
            
        total_count = consolidated_data.height
        value_counts = consolidated_data.select(pl.col(column).value_counts()).unnest(column)
        
        # Calculate percentages
        value_counts = value_counts.with_columns(
            (pl.col("count") / total_count).alias("percentage")
        )
        
        # Keep categories above threshold
        keep_categories = value_counts.filter(
            pl.col("percentage") >= min_percentage
        ).select(column).to_series().to_list()
        
        original_categories = value_counts.height
        kept_categories = len(keep_categories)
        
        # Only consolidate if we're actually reducing categories
        if kept_categories < original_categories:
            # Overwrite original column with consolidated version
            consolidated_data = consolidated_data.with_columns(
                pl.when(pl.col(column).is_in(keep_categories))
                .then(pl.col(column))
                .otherwise(pl.lit("Other"))
                .alias(column)  # Same name as original
            )
            consolidated_columns.append(column)
            print(f"{column}: consolidated to {kept_categories} categories (from {original_categories})")
        else:
            # No consolidation needed
            non_consolidated_columns.append(column)
            print(f"{column}: no consolidation needed ({original_categories} categories)")
    
    return consolidated_data, consolidated_columns, non_consolidated_columns

# Use it - much cleaner!
tier2_data, consolidated_cols, non_consolidated_cols = consolidate_multiple_columns(
    tier2_data, dimensions, min_percentage=0.02
)

print(f"\nSummary:")
print(f"Consolidated: {len(consolidated_cols)} columns")
print(f"Unchanged: {len(non_consolidated_cols)} columns")

# Use the same dimension names for rollup - no need to track different column names!
grouping_sets = [list(c) for i in range(1, len(dimensions)+1) for c in combinations(dimensions, i)]

low_income_x: no consolidation needed (2 categories)
employment_status_x: consolidated to 2 categories (from 3)
received_training_x: no consolidation needed (2 categories)
race_ethnicity_x: consolidated to 6 categories (from 7)
sex_x: consolidated to 2 categories (from 3)
age_x: consolidated to 6 categories (from 7)
highest_education_level_x: consolidated to 8 categories (from 9)
training_service_type_1_x: consolidated to 2 categories (from 3)
industry_title_x: no consolidation needed (13 categories)

Summary:
Consolidated: 6 columns
Unchanged: 3 columns


In [64]:
tier2_data_consolidated.select(pl.col("training_service_type_1_x")).unique()

training_service_type_1_x
str
"""Job Readiness Training in conj…"
"""ABE or ESL (contextualized or …"
"""Entrepreneurial Training (non-…"
"""ABE or ESL (contextualized or …"
"""On the Job Training (non-WIOA …"
…
"""Customized Training"""
"""Prerequisite Training"""
"""Other Non-Occupational-Skills …"
"""No Training Service"""


In [65]:
tier2_data_consolidated.select(pl.col("training_service_type_1_x_consolidated")).unique()

training_service_type_1_x_consolidated
str
"""Occupational Skills Training (…"
"""Other"""
"""No Training Service"""
