# Feature Engineering

This notebook covers:
- Creating new features from existing data
- Feature encoding and transformation
- Feature scaling and normalization
- Feature selection techniques

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

# Add src directory to path
sys.path.append('../src')

from data_preprocessing import DataPreprocessor
from feature_engineering import FeatureEngineer

# Display options: never truncate the column list when rendering DataFrames.
pd.set_option('display.max_columns', None)

# Plot appearance: start from matplotlib's default style, then switch the
# colour cycle to seaborn's "husl" palette. The style call must come first,
# otherwise it would reset the palette chosen afterwards.
plt.style.use('default')
sns.set_palette("husl")

## 1. Load Data

In [None]:
# Read the CSV written by the EDA stage, then report its dimensions and
# column names before previewing it.
eda_csv_path = '../data/processed/eda_data.csv'
df = pd.read_csv(eda_csv_path)

for summary_line in (
    f"Loaded dataset with shape: {df.shape}",
    f"Columns: {list(df.columns)}",
):
    print(summary_line)

# Last expression of the cell: rich display of the first five rows.
df.head(5)

## 2. Categorical Encoding

In [None]:
# Encode the categorical columns with the project's DataPreprocessor and print
# the class -> code mapping that each fitted encoder produced.
preprocessor = DataPreprocessor()

# Columns this notebook expects to find and encode.
expected_categorical_cols = ['gender', 'symptoms', 'diagnosis', 'previous_treatment', 'severity', 'recommended_treatment', 'outcome']

# 'risk_profile' is optional: it is encoded together with the other columns
# when the loaded data contains it, and silently ignored otherwise.
optional_categorical_cols = ['risk_profile']

# Report expected columns that are absent instead of passing unknown column
# names on to the encoder.
missing_categorical_cols = [col for col in expected_categorical_cols if col not in df.columns]
if missing_categorical_cols:
    print(f"Skipping categorical columns not found in data: {missing_categorical_cols}")

# Final list of columns to encode, in the original order.
categorical_cols = [
    col
    for col in expected_categorical_cols + optional_categorical_cols
    if col in df.columns
]

# Encode categorical features
df_encoded = preprocessor.encode_categorical_features(df, categorical_cols)

print("Categorical encoding completed.")
print(f"Encoded dataset shape: {df_encoded.shape}")

# Show encoding mappings.
# NOTE(review): assumes preprocessor.label_encoders maps column name -> fitted
# encoder exposing `classes_` and `transform` (scikit-learn LabelEncoder style)
# -- confirm against DataPreprocessor.
print("\nEncoding mappings:")
for col, encoder in preprocessor.label_encoders.items():
    print(f"{col}: {dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))}")

## 3. Feature Engineering

In [None]:
# Run the project's complete feature-engineering pipeline on the encoded frame
# and report how the column count changed.
feature_engineer = FeatureEngineer()
df_engineered = feature_engineer.engineer_all_features(df_encoded)

print(
    f"Feature engineering completed.",
    f"Original features: {df_encoded.shape[1]}",
    f"Engineered features: {df_engineered.shape[1]}",
    f"New features added: {df_engineered.shape[1] - df_encoded.shape[1]}",
    sep="\n",
)

# Columns present after engineering but not before, kept in the order they
# appear in the engineered frame.
columns_before = set(df_encoded.columns)
new_features = list(filter(lambda name: name not in columns_before, df_engineered.columns))
print(f"\nNew features created: {new_features}")

# Last expression of the cell: preview of the engineered frame.
df_engineered.head(5)

## 4. Feature Analysis

In [None]:
# Six-panel overview of the engineered features. A panel whose source column
# is absent from df_engineered is created but left empty.
fig, axes = plt.subplots(2, 3, figsize=(15, 12))
(ax_age, ax_severity, ax_treated), (ax_risk, ax_interaction, ax_corr) = axes

engineered_columns = df_engineered.columns

# Panel 1: how many rows fall into each age group.
if 'age_group' in engineered_columns:
    df_engineered['age_group'].value_counts().plot(kind='bar', ax=ax_age)
    ax_age.set_title('Age Group Distribution')
    plt.setp(ax_age.get_xticklabels(), rotation=45)

# Panel 2: histogram of the severity score.
if 'severity_score' in engineered_columns:
    ax_severity.hist(df_engineered['severity_score'], bins=10, edgecolor='black')
    ax_severity.set_title('Severity Score Distribution')
    ax_severity.set_xlabel('Severity Score')

# Panel 3: counts of the previous-treatment flag.
if 'has_previous_treatment' in engineered_columns:
    df_engineered['has_previous_treatment'].value_counts().plot(kind='bar', ax=ax_treated)
    ax_treated.set_title('Previous Treatment Flag')
    plt.setp(ax_treated.get_xticklabels(), rotation=0)

# Panel 4: histogram of the risk score.
if 'risk_score' in engineered_columns:
    ax_risk.hist(df_engineered['risk_score'], bins=15, edgecolor='black')
    ax_risk.set_title('Risk Score Distribution')
    ax_risk.set_xlabel('Risk Score')

# Panel 5: age against the age-severity interaction feature.
if 'age_severity_interaction' in engineered_columns:
    ax_interaction.scatter(
        df_engineered['age'],
        df_engineered['age_severity_interaction'],
        alpha=0.6,
    )
    ax_interaction.set_title('Age vs Age-Severity Interaction')
    ax_interaction.set_xlabel('Age')
    ax_interaction.set_ylabel('Age-Severity Interaction')

# Panel 6: the ten numeric features with the largest absolute correlation to
# the outcome column (the outcome itself is dropped from the ranking).
numerical_features = df_engineered.select_dtypes(include=[np.number]).columns
correlation_matrix = df_engineered[numerical_features].corr()
outcome_correlations = correlation_matrix['outcome'].abs().sort_values(ascending=False)
outcome_correlations.drop('outcome').head(10).plot(kind='bar', ax=ax_corr)
ax_corr.set_title('Top 10 Features Correlated with Outcome')
plt.setp(ax_corr.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

## 5. Feature Scaling

In [None]:
# Split the engineered data into train/test sets, scale the features with the
# shared preprocessor, and plot before/after comparisons of the scaling.

# Every column except the identifier and the target is treated as a feature.
feature_columns = [col for col in df_engineered.columns if col not in ['patient_id', 'outcome']]
X = df_engineered[feature_columns]
y = df_engineered['outcome']

print(f"Features for scaling: {X.shape[1]}")
print(f"Target variable: {y.name}")

# Split data for scaling
X_train, X_test, y_train, y_test = preprocessor.split_data(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Scale features.
# NOTE(review): the plots below index the scaled output positionally
# (`X_train_scaled[:, i]`), i.e. they assume scale_features returns 2-D NumPy
# arrays -- confirm against DataPreprocessor.
X_train_scaled, X_test_scaled = preprocessor.scale_features(X_train, X_test)

print("Feature scaling completed.")

# Compare before and after scaling
plt.figure(figsize=(15, 8))

# Preview at most five features; clamping avoids an IndexError on the scaled
# array when the dataset has fewer than five feature columns.
n_preview = min(5, X_train.shape[1])
preview_names = list(X_train.columns[:n_preview])
# boxplot draws its boxes at x = 1..N, so the tick labels go at those positions.
preview_positions = list(range(1, n_preview + 1))

# Before scaling
plt.subplot(2, 2, 1)
plt.boxplot([X_train[col] for col in preview_names])
# Labels are set through xticks rather than boxplot's `labels=` argument,
# which is deprecated since Matplotlib 3.9 (renamed to `tick_labels`).
plt.xticks(preview_positions, preview_names, rotation=45)
plt.title('Before Scaling (First 5 Features)')

# After scaling
plt.subplot(2, 2, 2)
plt.boxplot([X_train_scaled[:, i] for i in range(n_preview)])
plt.xticks(preview_positions, preview_names, rotation=45)
plt.title('After Scaling (First 5 Features)')

# Distribution comparison for one feature (the first feature column)
plt.subplot(2, 2, 3)
plt.hist(X_train.iloc[:, 0], bins=20, alpha=0.7, label='Original')
plt.hist(X_train_scaled[:, 0], bins=20, alpha=0.7, label='Scaled')
plt.title(f'Distribution Comparison: {X_train.columns[0]}')
plt.legend()

# Feature statistics: one point per feature, mean on x and std on y
plt.subplot(2, 2, 4)
original_stats = X_train.describe().loc[['mean', 'std']].T
scaled_stats = pd.DataFrame(X_train_scaled).describe().loc[['mean', 'std']].T
# Re-attach feature names, which are lost when wrapping the scaled array.
scaled_stats.index = X_train.columns

plt.scatter(original_stats['mean'], original_stats['std'], alpha=0.7, label='Original')
plt.scatter(scaled_stats['mean'], scaled_stats['std'], alpha=0.7, label='Scaled')
plt.xlabel('Mean')
plt.ylabel('Standard Deviation')
plt.title('Mean vs Std: Original vs Scaled')
plt.legend()

plt.tight_layout()
plt.show()

## 6. Feature Selection

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

# Rank the scaled training features with three independent methods (ANOVA
# F-test, mutual information, random-forest importance) and print the top
# features for each. Only the training split is used for fitting.

# Ask each selector for ten features, but never more than actually exist, so
# the cell also runs on datasets with fewer than ten feature columns.
n_top_features = min(10, X_train.shape[1])

# Method 1: Statistical feature selection (ANOVA F-test)
selector_f = SelectKBest(score_func=f_classif, k=n_top_features)
X_train_selected_f = selector_f.fit_transform(X_train_scaled, y_train)

# Get selected feature names and their scores (boolean support mask, so both
# stay in the original column order)
selected_features_f = X_train.columns[selector_f.get_support()]
f_scores = selector_f.scores_[selector_f.get_support()]

print("Top 10 features by ANOVA F-test:")
for feature, score in zip(selected_features_f, f_scores):
    print(f"{feature}: {score:.2f}")

# Method 2: Mutual Information
# mutual_info_classif is randomized unless random_state is set; pinning the
# seed makes the scores (and therefore the selected features) reproducible
# across "Restart & Run All".
selector_mi = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=42),
    k=n_top_features,
)
X_train_selected_mi = selector_mi.fit_transform(X_train_scaled, y_train)

selected_features_mi = X_train.columns[selector_mi.get_support()]
mi_scores = selector_mi.scores_[selector_mi.get_support()]

print("\nTop 10 features by Mutual Information:")
for feature, score in zip(selected_features_mi, mi_scores):
    print(f"{feature}: {score:.4f}")

# Method 3: Random Forest Feature Importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# One row per feature, most important first
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 features by Random Forest Importance:")
print(feature_importance.head(10))

In [None]:
# Visualize feature selection results: per-method scores, the overlap between
# the three selected sets, and how the different scores relate to each other.

# Selected-feature sets per method. They are built before any plotting so that
# they are defined for later cells even if an optional plotting step fails.
set_f = set(selected_features_f)
set_mi = set(selected_features_mi)
set_rf = set(feature_importance.head(10)['feature'])

plt.figure(figsize=(18, 12))

# ANOVA F-test scores (sorted ascending so the largest bar is drawn on top)
plt.subplot(2, 3, 1)
f_scores_df = pd.DataFrame({
    'feature': selected_features_f,
    'score': f_scores
}).sort_values('score', ascending=True)

plt.barh(range(len(f_scores_df)), f_scores_df['score'])
plt.yticks(range(len(f_scores_df)), f_scores_df['feature'])
plt.title('ANOVA F-test Scores')
plt.xlabel('F-score')

# Mutual Information scores
plt.subplot(2, 3, 2)
mi_scores_df = pd.DataFrame({
    'feature': selected_features_mi,
    'score': mi_scores
}).sort_values('score', ascending=True)

plt.barh(range(len(mi_scores_df)), mi_scores_df['score'])
plt.yticks(range(len(mi_scores_df)), mi_scores_df['feature'])
plt.title('Mutual Information Scores')
plt.xlabel('MI Score')

# Random Forest Feature Importance
plt.subplot(2, 3, 3)
top_rf_features = feature_importance.head(10).sort_values('importance', ascending=True)
plt.barh(range(len(top_rf_features)), top_rf_features['importance'])
plt.yticks(range(len(top_rf_features)), top_rf_features['feature'])
plt.title('Random Forest Feature Importance')
plt.xlabel('Importance')

# Feature selection overlap
plt.subplot(2, 3, 4)
try:
    # matplotlib_venn is a separate third-party package; treat it as optional
    # so that a missing install does not abort the whole cell.
    from matplotlib_venn import venn3
except ImportError:
    n_shared_by_all = len(set_f & set_mi & set_rf)
    plt.text(
        0.5, 0.5,
        "matplotlib_venn is not installed\n"
        f"Features selected by all three methods: {n_shared_by_all}",
        ha='center', va='center', transform=plt.gca().transAxes,
    )
    plt.axis('off')
else:
    venn3([set_f, set_mi, set_rf], ('ANOVA F-test', 'Mutual Info', 'Random Forest'))
plt.title('Feature Selection Method Overlap')

# Correlation between different scoring methods
plt.subplot(2, 3, 5)
# Create a comprehensive feature score dataframe: scores_ arrays are in
# X_train column order, RF importances are joined on the feature name.
all_scores = pd.DataFrame({'feature': X_train.columns})
all_scores['f_score'] = selector_f.scores_
all_scores['mi_score'] = selector_mi.scores_
all_scores = all_scores.merge(feature_importance, on='feature')

plt.scatter(all_scores['f_score'], all_scores['mi_score'], alpha=0.6)
plt.xlabel('ANOVA F-score')
plt.ylabel('Mutual Information Score')
plt.title('F-score vs MI Score Correlation')

# Feature importance vs correlation
plt.subplot(2, 3, 6)
# Restrict to numeric columns before corr() so a non-numeric feature column
# cannot make the correlation computation fail; such features simply get no
# correlation value (NaN after the map below).
outcome_corr = (
    df_engineered[feature_columns + ['outcome']]
    .select_dtypes(include=[np.number])
    .corr()['outcome']
    .abs()
)
all_scores['outcome_corr'] = all_scores['feature'].map(outcome_corr)

plt.scatter(all_scores['importance'], all_scores['outcome_corr'], alpha=0.6)
plt.xlabel('Random Forest Importance')
plt.ylabel('Absolute Correlation with Outcome')
plt.title('RF Importance vs Correlation')

plt.tight_layout()
plt.show()

## 7. Final Feature Set

In [None]:
# Majority vote across the three selection methods: each feature gets one vote
# per method whose selected set contains it, and features with two or more
# votes form the final set.
method_selections = (set_f, set_mi, set_rf)
feature_votes = {
    name: sum(name in chosen for chosen in method_selections)
    for name in X_train.columns
}

selected_features_final = [name for name, n_votes in feature_votes.items() if n_votes >= 2]

# Fallback when the vote yields fewer than 8 features: walk the 15 most
# important random-forest features in order and add the ones not yet selected,
# stopping as soon as the list holds at least 12.
if len(selected_features_final) < 8:
    for candidate in feature_importance['feature'].head(15).tolist():
        if candidate not in selected_features_final:
            selected_features_final.append(candidate)
        if len(selected_features_final) >= 12:
            break

# Feature name -> random-forest importance (first occurrence wins).
importance_lookup = {}
for name, importance in zip(feature_importance['feature'], feature_importance['importance']):
    importance_lookup.setdefault(name, importance)

print(f"Final selected features ({len(selected_features_final)}):")
for i, feature in enumerate(selected_features_final, 1):
    votes = feature_votes.get(feature, 0)
    rf_importance = importance_lookup[feature]
    print(f"{i:2d}. {feature:<25} (votes: {votes}, RF importance: {rf_importance:.4f})")

# Restrict both splits to the chosen columns, then scale them again with the
# shared preprocessor.
X_train_final = X_train[selected_features_final]
X_test_final = X_test[selected_features_final]
X_train_final_scaled, X_test_final_scaled = preprocessor.scale_features(X_train_final, X_test_final)

print(f"\nFinal training set shape: {X_train_final_scaled.shape}")
print(f"Final test set shape: {X_test_final_scaled.shape}")

## 8. Feature Engineering Summary

In [None]:
# Text report of the whole feature-engineering run, printed one section at a
# time from the variables produced by the earlier cells.
banner = "=" * 60
print(banner, "FEATURE ENGINEERING SUMMARY", banner, sep="\n")

print(
    f"\n1. ORIGINAL DATASET:",
    f"   - Original features: {df.shape[1]}",
    f"   - Samples: {df.shape[0]:,}",
    sep="\n",
)

print(
    f"\n2. CATEGORICAL ENCODING:",
    f"   - Categorical columns encoded: {len(categorical_cols)}",
    f"   - Encoding method: Label Encoding",
    sep="\n",
)

print(
    f"\n3. FEATURE ENGINEERING:",
    f"   - Features after engineering: {df_engineered.shape[1]}",
    f"   - New features created: {len(new_features)}",
    f"   - New features: {', '.join(new_features)}",
    sep="\n",
)

# NOTE(review): the scaler name below is a hard-coded label, not read from the
# preprocessor -- confirm it matches what scale_features actually uses.
print(
    f"\n4. FEATURE SCALING:",
    f"   - Scaling method: MinMaxScaler",
    f"   - Features scaled: {X_train.shape[1]}",
    sep="\n",
)

# Reduction = share of the pre-selection features that were dropped.
print(
    f"\n5. FEATURE SELECTION:",
    f"   - Selection methods used: 3 (ANOVA F-test, Mutual Information, Random Forest)",
    f"   - Features before selection: {X_train.shape[1]}",
    f"   - Features after selection: {len(selected_features_final)}",
    f"   - Reduction: {(1 - len(selected_features_final)/X_train.shape[1])*100:.1f}%",
    sep="\n",
)

print(
    f"\n6. FINAL DATASET:",
    f"   - Training samples: {X_train_final_scaled.shape[0]:,}",
    f"   - Test samples: {X_test_final_scaled.shape[0]:,}",
    f"   - Final features: {X_train_final_scaled.shape[1]}",
    sep="\n",
)

print(
    f"\n7. NEXT STEPS:",
    f"   - Proceed to model training with engineered features",
    f"   - Use selected features for better model performance",
    f"   - Consider feature interactions in model selection",
    sep="\n",
)

In [None]:
# Persist everything the modelling stage needs as CSV files, written without
# the row index: the engineered frame, the list of selected feature names, and
# the scaled train/test feature matrices with their targets.
outputs_by_path = {
    '../data/processed/engineered_data.csv': df_engineered,
    '../data/processed/selected_features.csv': pd.DataFrame({'selected_features': selected_features_final}),
    # The scaled matrices are wrapped in DataFrames so the files carry the
    # selected feature names as their header row.
    '../data/processed/X_train_final.csv': pd.DataFrame(X_train_final_scaled, columns=selected_features_final),
    '../data/processed/X_test_final.csv': pd.DataFrame(X_test_final_scaled, columns=selected_features_final),
    '../data/processed/y_train.csv': pd.DataFrame({'y_train': y_train}),
    '../data/processed/y_test.csv': pd.DataFrame({'y_test': y_test}),
}

for csv_path, frame in outputs_by_path.items():
    frame.to_csv(csv_path, index=False)

print("\nAll engineered data and splits saved to '../data/processed/'")
print("Ready for model training!")