# Part 2: Feature Engineering and Selection

This notebook focuses on feature engineering, selection, and analysis for the Length of Stay prediction project.

In [None]:
# Import required libraries
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML
import ipywidgets as widgets
import seaborn as sns
import matplotlib.pyplot as plt

from src.config.config import ModelConfig
from src.utils.feature_engineering import engineer_features
from src.utils.feature_selection import select_features_shap

## 1. Load Processed Data

In [None]:
# Read back the DataFrame saved by Part 1
initial_pickle = 'data/initial_df.pkl'
df = pd.read_pickle(initial_pickle)

# Report the frame's dimensions, then preview its first rows
print("Loaded initial dataset with shape:", df.shape)
display(df.head())

## 2. Feature Engineering

In [None]:
# Apply feature engineering
df_engineered = engineer_features(df)

# Collect the columns that exist only in the engineered frame, keeping the
# frame's own column order. A bare set difference iterates in hash order, which
# for string labels can differ between kernel restarts and reshuffle the tables.
new_feature_columns = [col for col in df_engineered.columns if col not in df.columns]
new_features = set(new_feature_columns)  # same contents as before, kept as a set

if new_feature_columns:
    # Show new features
    print("\nNewly created features:")
    display(df_engineered[new_feature_columns].head())

    # Display summary statistics for new features
    print("\nSummary statistics for new features:")
    display(df_engineered[new_feature_columns].describe())
else:
    # describe() raises ValueError on a frame with no columns, so report instead.
    # NOTE(review): this also triggers if engineer_features adds its columns to
    # `df` in place -- confirm against its implementation.
    print("\nNo new features were created.")

## 3. Correlation Analysis

In [None]:
def plot_correlation_matrix(df, min_correlation=0.0):
    """Create interactive correlation matrix plot.

    Args:
        df: DataFrame whose numeric columns are correlated pairwise.
        min_correlation: Threshold on the absolute correlation. Cells whose
            absolute value falls below it are blanked out.

    Returns:
        The figure returned by ``px.imshow``, laid out at 800x800 with a
        centred title.
    """
    # 'number' selects every numeric dtype (int32, float32, ...), not only the
    # 64-bit ones, so narrower columns are no longer dropped silently.
    corr = df.select_dtypes(include='number').corr()

    # Hide weak correlations as NaN. Multiplying by the boolean mask would turn
    # them into 0.0, which is indistinguishable from a genuine zero correlation.
    corr_filtered = corr.where(corr.abs() >= min_correlation)

    # Create heatmap; zmin/zmax pin the diverging scale to [-1, 1] so its
    # neutral midpoint always sits at zero correlation.
    fig = px.imshow(
        corr_filtered,
        labels=dict(color="Correlation"),
        title=f"Feature Correlation Matrix (|correlation| ≥ {min_correlation})",
        color_continuous_scale='RdBu',
        zmin=-1,
        zmax=1,
        aspect='auto'
    )

    fig.update_layout(
        height=800,
        width=800,
        title_x=0.5
    )

    return fig

# Create correlation slider widget
correlation_slider = widgets.FloatSlider(
    value=0.0,
    min=0.0,
    max=1.0,
    step=0.05,
    description='Min Correlation:'
)

# interact() pairs each keyword argument with the callback parameter of the
# same name, so the lambda's parameter must be called `min_correlation`; with a
# differently named parameter interact() cannot find a widget for it and raises.
# The trailing semicolon keeps the notebook from echoing interact()'s return value.
widgets.interact(
    lambda min_correlation: plot_correlation_matrix(df_engineered, min_correlation).show(),
    min_correlation=correlation_slider
);

## 4. Feature Selection using SHAP

In [None]:
# Run the project's SHAP selection helper with 'length_of_stay' as its second argument
feature_importance = select_features_shap(df_engineered, 'length_of_stay')

# Horizontal bar chart built from the result's 'importance' and 'feature' columns
axis_labels = {'importance': 'SHAP Value', 'feature': 'Feature'}
fig = px.bar(
    feature_importance,
    x='importance',
    y='feature',
    orientation='h',
    title='Feature Importance (SHAP values)',
    labels=axis_labels,
)

# Fixed height, bars ordered by ascending total, centred title
layout_options = dict(
    height=800,
    yaxis=dict(categoryorder='total ascending'),
    title_x=0.5,
)
fig.update_layout(**layout_options)

fig.show()

## 5. Feature Distribution Analysis

In [None]:
def analyze_feature_distribution(df, feature, target='length_of_stay', nbins=30):
    """Create detailed distribution analysis for a feature.

    Builds a histogram of ``df[feature]``. For numerical features with at least
    two distinct values it overlays a Gaussian KDE scaled to histogram counts,
    and prints the feature's correlation with ``target`` when that column exists.

    Args:
        df: DataFrame containing ``feature`` (and normally ``target``).
        feature: Name of the column to analyse.
        target: Column to correlate numerical features with.
        nbins: Bin count requested from the histogram and used to scale the KDE.

    Returns:
        The ``go.Figure`` holding the histogram and, when drawn, the KDE line.
    """
    series = df[feature]
    fig = go.Figure()

    # Add histogram
    fig.add_trace(go.Histogram(
        x=series,
        name='Distribution',
        nbinsx=nbins
    ))

    # Accept every numeric dtype (int32, float32, ...) but not booleans.
    is_numerical = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )

    if is_numerical:
        from scipy import stats

        values = series.dropna().astype(float)

        # Add kernel density estimation. gaussian_kde fails on fewer than two
        # points or on constant data, so such columns keep the histogram only.
        if values.nunique() > 1:
            lowest, highest = values.min(), values.max()
            kde_x = np.linspace(lowest, highest, 100)
            kde = stats.gaussian_kde(values)
            # Density -> expected count per bin: multiply by the number of
            # values the histogram actually bins (NaNs excluded) and by the bin
            # width. nbinsx is only an upper bound on the bin count, so the
            # width -- and therefore this scaling -- is approximate.
            fig.add_trace(go.Scatter(
                x=kde_x,
                y=kde(kde_x) * len(values) * (highest - lowest) / nbins,
                name='KDE',
                line=dict(color='red')
            ))

        # Add relationship with target
        if target in df.columns:
            corr = series.corr(df[target])
            print(f"Correlation with {target}: {corr:.3f}")

    fig.update_layout(
        title=f'Distribution of {feature}',
        xaxis_title=feature,
        yaxis_title='Count',
        height=500
    )

    return fig

# Create feature selection dropdown
feature_dropdown = widgets.Dropdown(
    options=feature_importance['feature'].tolist(),
    description='Feature:',
    style={'description_width': 'initial'}
)

# interact() pairs each keyword argument with the callback parameter of the
# same name, so the lambda's parameter must be called `feature`; with a
# differently named parameter interact() cannot find a widget for it and raises.
# The trailing semicolon keeps the notebook from echoing interact()'s return value.
widgets.interact(
    lambda feature: analyze_feature_distribution(df_engineered, feature).show(),
    feature=feature_dropdown
);

## 6. Save Processed Data

In [None]:
# Pickle both results of this notebook under data/
for frame, target_path in (
    (df_engineered, 'data/engineered_df.pkl'),
    (feature_importance, 'data/feature_importance.pkl'),
):
    frame.to_pickle(target_path)

completion_message = "Feature engineering complete. Processed data saved. Proceed to Part 3 for graph construction."
print(completion_message)