# EDA and Data Transformations

**Author:** Alexis Alduncin (Data Scientist)
**Team:** MLOps 62

This notebook performs a comprehensive Exploratory Data Analysis (EDA) of the absenteeism dataset and then applies the transformation pipeline using our custom modules in `src/`.

In [None]:
# Setup and imports
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('..')
warnings.filterwarnings('ignore')

# Import our custom modules
from src import config
from src.features import AbsenteeismFeatureEngine
from src.data_utils import load_data
from src.plots import (
    plot_target_distribution,
    plot_correlation_matrix,
    create_eda_summary_dashboard,
    plot_categorical_analysis,
    plot_numerical_relationship
)

# Sanity check: confirm the custom modules resolved and echo the two
# settings read from src.config that this run will use
print(
    "✅ Modules imported successfully",
    f"MLflow Experiment: {config.MLFLOW_EXPERIMENT_NAME}",
    f"Data Source: {config.RAW_DATA_PATH}",
    sep="\n",
)

## 1. Data Loading with DVC

In [None]:
# Read the raw dataset through the shared loader (the team's DVC-based approach)
df = load_data(config.RAW_DATA_PATH)

# Report the dimensions: rows are samples, columns are features
n_samples, n_features = df.shape
print(f"Dataset loaded: {df.shape}")
print(f"Features: {n_features}")
print(f"Samples: {n_samples}")

# Preview the leading rows (rendered as the cell output)
df.head()

## 2. Data Quality Assessment

In [None]:
# Data quality report: shape, missing values, duplicate rows and dtype mix
banner = "=" * 60
print(banner)
print("DATA QUALITY REPORT")
print(banner)

print(f"\nShape: {df.shape}")

# Per-column null counts; only columns that actually have gaps are listed
print(f"\nMissing Values:")
missing = df.isnull().sum()
if missing.sum() != 0:
    print(missing[missing > 0])
else:
    print("  ✅ No missing values")

# Fully duplicated rows
duplicate_count = df.duplicated().sum()
print(f"\nDuplicates: {duplicate_count}")

# How many columns there are of each dtype
print(f"\nData Types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

# Summary statistics (rendered as the cell output)
df.describe()

## 3. Target Variable Analysis

In [None]:
# Plot the target distribution with the project's plotting helper
fig, axes = plot_target_distribution(df, config.TARGET_COLUMN)
plt.show()

# Select the target column once instead of re-indexing df for every statistic
target = df[config.TARGET_COLUMN]

print("\nTarget Variable Statistics:")
print(f"Mean: {target.mean():.2f} hours")
print(f"Median: {target.median():.2f} hours")
print(f"Std Dev: {target.std():.2f} hours")
print(f"Min: {target.min():.0f} hours")
print(f"Max: {target.max():.0f} hours")

# Flag outliers with the IQR rule: values more than 1.5 * IQR outside the quartiles
Q1, Q3 = target.quantile(0.25), target.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
is_outlier = (target < lower_fence) | (target > upper_fence)
outliers = df[is_outlier]
outlier_share = len(outliers)/len(df)*100
print(f"\nOutliers (IQR method): {len(outliers)} ({outlier_share:.1f}%)")

## 4. Comprehensive EDA Dashboard

In [None]:
# Create comprehensive EDA dashboard
# create_eda_summary_dashboard is a project helper imported from src.plots;
# its return value is kept as `fig` and the figure is rendered by plt.show()
fig = create_eda_summary_dashboard(df)
plt.show()

# NOTE(review): the count of 7 is hard-coded in the message below — confirm it
# matches what create_eda_summary_dashboard actually draws
print("\n✅ EDA Dashboard created with 7 visualizations")

## 5. Correlation Analysis

In [None]:
# Correlation matrix (Pearson) drawn with the project's plotting helper
fig = plot_correlation_matrix(df, method='pearson')
plt.show()

# Correlation of every numeric column with the target, strongest positive first.
# Note the ordering is signed, so strong negative correlations sort to the end.
numeric_df = df.select_dtypes(include=[np.number])
target_corr = numeric_df.corr()[config.TARGET_COLUMN].sort_values(ascending=False)

# Remove the target's self-correlation by label rather than by position:
# slicing off the first row silently drops the wrong entry when another column
# ties at 1.0 or when the self-correlation is NaN (constant target).
top_target_corr = target_corr.drop(labels=config.TARGET_COLUMN).head(10)

print("\nTop 10 Correlations with Target:")
print(top_target_corr)

## 6. Categorical Features Analysis

In [None]:
# Categorical columns to inspect against the target
categorical_features = ['Day of the week', 'Seasons', 'Education']
separator = '=' * 60

for feature in categorical_features:
    # Columns absent from this dataset are skipped without comment
    if feature not in df.columns:
        continue
    print(f"\n{separator}")
    print(f"Analyzing: {feature}")
    print(f"{separator}")
    fig = plot_categorical_analysis(df, feature)
    plt.show()

## 7. Numerical Features Analysis

In [None]:
# Numerical columns to inspect against the target
numerical_features = ['Age', 'Distance from Residence to Work', 'Body mass index']
separator = '=' * 60

for feature in numerical_features:
    # Columns absent from this dataset are skipped without comment
    if feature not in df.columns:
        continue
    print(f"\n{separator}")
    print(f"Analyzing: {feature}")
    print(f"{separator}")
    fig = plot_numerical_relationship(df, feature)
    plt.show()

## 8. Data Transformation Pipeline

In [None]:
# Run the project's transformation pipeline: clean the raw frame, then
# engineer features on the cleaned result
engine = AbsenteeismFeatureEngine()

print("Step 1: Data Cleaning")
df_clean = engine.clean_data(df)
print(f"  Original: {len(df)} rows")
print(f"  Cleaned: {len(df_clean)} rows")
print(f"  Removed: {len(df) - len(df_clean)} rows ({(len(df)-len(df_clean))/len(df)*100:.1f}%)")

print("\nStep 2: Feature Engineering")
df_features = engine.engineer_features(df_clean)

# Columns present after engineering that the raw frame did not have
new_cols = set(df_features.columns) - set(df.columns)
# Sets have no stable order across kernel restarts, so fix one ordering and
# reuse it for both the listing and the preview below
new_col_names = sorted(new_cols)

print(f"  Original features: {len(df.columns)}")
print(f"  After engineering: {len(df_features.columns)}")
# Count the created columns directly; a plain difference of column counts
# disagrees with the list below whenever the pipeline also drops columns
print(f"  New features: {len(new_col_names)}")

# Show new features
print("\n  Created features:")
for col in new_col_names:
    print(f"    - {col}")

# Display sample (rendered as the cell output)
print("\nSample of transformed data:")
df_features[new_col_names].head()

## 9. Prepare for Modeling

In [None]:
# Split the engineered frame into model inputs and target, with scaling enabled
X, y = engine.prepare_for_modeling(df_features, scale_features=True)

# Report the shapes and the number of features the engine recorded
print(
    "Model-Ready Data:",
    f"  Features (X): {X.shape}",
    f"  Target (y): {y.shape}",
    f"  Feature names: {len(engine.feature_names)}",
    sep="\n",
)

# Numbered listing of the final feature names (1-based)
print("\nFeature List:")
for position, feature_name in enumerate(engine.feature_names, start=1):
    print(f"{position:2d}. {feature_name}")

## 10. Save Processed Data

In [None]:
# Persist the pipeline outputs; config.PROCESSED_DATA_PATH is used as a
# directory and created if it does not exist yet
os.makedirs(config.PROCESSED_DATA_PATH, exist_ok=True)

# Save feature-engineered data
output_path = os.path.join(config.PROCESSED_DATA_PATH, 'absenteeism_features.csv')
df_features.to_csv(output_path, index=False)
print(f"✅ Saved feature-engineered data to: {output_path}")

# Save model-ready data
X_path = os.path.join(config.PROCESSED_DATA_PATH, 'X_features.csv')
y_path = os.path.join(config.PROCESSED_DATA_PATH, 'y_target.csv')

X.to_csv(X_path, index=False)
# Label the target with the configured column name so the saved file stays
# consistent with the rest of the notebook, which always goes through
# config.TARGET_COLUMN instead of repeating the literal
y.to_csv(y_path, index=False, header=[config.TARGET_COLUMN])

print(f"✅ Saved model-ready features to: {X_path}")
print(f"✅ Saved target variable to: {y_path}")

## Summary

### EDA Insights
1. **Data Quality:** No missing values, clean dataset
2. **Target Distribution:** Right-skewed, mean ~7 hours, median ~3 hours
3. **Outliers:** Present but expected (extended medical leave)
4. **Correlations:** Weak to moderate correlations with individual features
5. **Patterns:** Seasonal and day-of-week variations observed

### Transformation Results
- ✅ Data cleaned and outliers handled
- ✅ 7 new features engineered
- ✅ Features encoded and scaled for modeling
- ✅ Data saved to `data/processed/`

### Next Steps
Proceed to `03-aa-feature-engineering.ipynb` for detailed feature analysis, then `04-aa-model-experiments.ipynb` for model training with MLflow.