# Exploratory Data Analysis (EDA)

This notebook covers:
- Detailed data exploration
- Correlation analysis
- Treatment outcome patterns
- Patient demographic analysis
- Feature relationships

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import sys

# Add src directory to path
sys.path.append('../src')

from data_preprocessing import DataPreprocessor

# Plot appearance: matplotlib's default style, seaborn's "husl" palette,
# and a 12x8 default figure size
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Do not truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

## 1. Load Data

In [None]:
# Load the processed data and report its dimensions
processed_path = '../data/processed/loaded_data.csv'
df = pd.read_csv(processed_path)
print("Loaded dataset with shape: {}".format(df.shape))

# Last expression of the cell, so the notebook renders the first rows
df.head()

## 2. Demographic Analysis

In [None]:
# Six-panel figure relating age to gender, severity, outcome and diagnosis.
# Each panel keeps an explicit handle to its axes instead of relying on the
# implicit "current axes" for drawing and titling.
plt.figure(figsize=(15, 10))

# Panel 1: age spread per gender
ax = plt.subplot(2, 3, 1)
sns.boxplot(x='gender', y='age', data=df, ax=ax)
ax.set_title('Age Distribution by Gender')

# Panel 2: overlaid age histograms, one colour per gender
ax = plt.subplot(2, 3, 2)
sns.histplot(x='age', hue='gender', data=df, bins=20, alpha=0.7, ax=ax)
ax.set_title('Age Histogram by Gender')

# Panel 3: patient counts per gender within five equal-width age bins
ax = plt.subplot(2, 3, 3)
age_bins = pd.cut(df['age'], bins=5)
age_gender_crosstab = pd.crosstab(age_bins, df['gender'])
age_gender_crosstab.plot.bar(ax=ax)
ax.set_title('Age Groups by Gender')
plt.xticks(rotation=45)

# Panel 4: age spread per severity level
ax = plt.subplot(2, 3, 4)
sns.boxplot(x='severity', y='age', data=df, ax=ax)
ax.set_title('Age Distribution by Severity')

# Panel 5: age density per outcome
ax = plt.subplot(2, 3, 5)
sns.violinplot(x='outcome', y='age', data=df, ax=ax)
ax.set_title('Age Distribution by Outcome')
plt.xticks(rotation=45)

# Panel 6: mean age per diagnosis, sorted ascending
ax = plt.subplot(2, 3, 6)
diagnosis_age = df.groupby('diagnosis')['age'].mean().sort_values()
diagnosis_age.plot.bar(ax=ax)
ax.set_title('Average Age by Diagnosis')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 3. Treatment Outcome Analysis

In [None]:
# Treatment outcome patterns: build every cross-tabulation first, then draw
# them all with one loop.

# Row-normalized tables: each row sums to 1, i.e. the outcome mix per category
outcome_treatment = pd.crosstab(df['recommended_treatment'], df['outcome'], normalize='index')
outcome_diagnosis = pd.crosstab(df['diagnosis'], df['outcome'], normalize='index')
outcome_severity = pd.crosstab(df['severity'], df['outcome'], normalize='index')
prev_treatment_outcome = pd.crosstab(df['previous_treatment'], df['outcome'], normalize='index')
gender_outcome = pd.crosstab(df['gender'], df['outcome'], normalize='index')

# Raw counts: how often each treatment is recommended for each diagnosis
treatment_diagnosis = pd.crosstab(df['diagnosis'], df['recommended_treatment'])

# (table, annotation format, colormap, title, rotate x tick labels?) in panel order
panels = [
    (outcome_treatment, '.2f', 'YlOrRd', 'Outcome Distribution by Treatment (Normalized)', True),
    (outcome_diagnosis, '.2f', 'YlOrRd', 'Outcome Distribution by Diagnosis (Normalized)', True),
    (outcome_severity, '.2f', 'YlOrRd', 'Outcome Distribution by Severity (Normalized)', False),
    (treatment_diagnosis, 'd', 'Blues', 'Treatment Count by Diagnosis', True),
    (prev_treatment_outcome, '.2f', 'YlOrRd', 'Outcome by Previous Treatment (Normalized)', True),
    (gender_outcome, '.2f', 'YlOrRd', 'Outcome Distribution by Gender (Normalized)', False),
]

plt.figure(figsize=(18, 12))
for position, (table, number_format, colormap, title, rotate_labels) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    sns.heatmap(table, annot=True, fmt=number_format, cmap=colormap)
    plt.title(title)
    if rotate_labels:
        plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 4. Statistical Analysis

In [None]:
# Chi-square test of independence between each categorical variable and the
# outcome; an association is reported as significant when p < 0.05.
print("Statistical Tests for Categorical Associations:")
print("=" * 50)

categorical_vars = [
    'gender',
    'diagnosis',
    'severity',
    'previous_treatment',
    'recommended_treatment',
]

for var in categorical_vars:
    # Observed counts of every (category, outcome) pair
    contingency_table = pd.crosstab(df[var], df['outcome'])
    chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    verdict = 'Yes' if p_value < 0.05 else 'No'

    report_lines = [
        f"\n{var} vs Outcome:",
        f"Chi-square statistic: {chi2:.4f}",
        f"P-value: {p_value:.4f}",
        f"Significant association: {verdict}",
    ]
    print("\n".join(report_lines))

In [None]:
# Age analysis by outcome groups
print("\nAge Analysis by Outcome Groups:")
print("=" * 40)

# Split the ages once, keyed by outcome label.
#  - groupby skips rows whose outcome is missing, so a NaN label can no longer
#    produce an empty group (comparing a column to NaN never matches).
#  - sort=False keeps the labels in order of first appearance.
#  - dropna() removes missing ages, which would otherwise turn the ANOVA
#    result into NaN.
age_by_outcome = {
    label: ages.dropna()
    for label, ages in df.groupby('outcome', sort=False)['age']
}

for outcome, age_subset in age_by_outcome.items():
    print(f"\n{outcome}:")
    print(f"  Mean age: {age_subset.mean():.2f}")
    print(f"  Median age: {age_subset.median():.2f}")
    print(f"  Std deviation: {age_subset.std():.2f}")

# ANOVA test for age differences across outcomes
outcome_groups = [ages for ages in age_by_outcome.values() if len(ages) > 0]
if len(outcome_groups) >= 2:
    f_stat, p_value = stats.f_oneway(*outcome_groups)
else:
    # One-way ANOVA needs at least two groups; report "not available"
    # instead of raising.
    f_stat, p_value = np.nan, np.nan

print("\nANOVA Test (Age vs Outcome):")
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Significant difference: {'Yes' if p_value < 0.05 else 'No'}")

## 5. Correlation Analysis

In [None]:
# Encode categorical variables so they can enter a numeric correlation matrix
categorical_cols = [
    'gender', 'symptoms', 'diagnosis', 'previous_treatment',
    'severity', 'recommended_treatment', 'outcome',
]
preprocessor = DataPreprocessor()
# NOTE(review): later cells still compare df['outcome'] to the string
# 'Improved', so this presumably returns an encoded copy and leaves df
# untouched - confirm against DataPreprocessor.
df_encoded = preprocessor.encode_categorical_features(df, categorical_cols)

# Pairwise correlations over the numeric columns only
numeric_columns = df_encoded.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()

# The matrix is symmetric: mask the upper triangle (diagonal included) so
# each pair appears once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={"shrink": .8},
)

plt.figure(figsize=(14, 12))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of All Variables')
plt.tight_layout()
plt.show()

In [None]:
# Rank every variable by the magnitude (absolute value) of its correlation
# with the outcome, strongest first
outcome_correlations = correlation_matrix['outcome'].abs().sort_values(ascending=False)

print("Correlations with Treatment Outcome (absolute values):")
print("=" * 50)
for var, corr in outcome_correlations.items():
    # The outcome's correlation with itself is not informative
    if var == 'outcome':
        continue
    print(f"{var}: {corr:.4f}")

# Bar chart of the eight strongest correlations with the outcome
top_correlations = outcome_correlations.drop('outcome').head(8)

plt.figure(figsize=(10, 6))
top_correlations.plot.bar()
plt.title('Top Correlations with Treatment Outcome')
plt.xlabel('Variables')
plt.ylabel('Absolute Correlation')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 6. Treatment Effectiveness Analysis

In [None]:
# Calculate treatment success rates
# A patient counts as a success when the recorded outcome is exactly 'Improved'.
improved = df['outcome'].eq('Improved')


def _success_rate(*group_columns):
    """Return the percentage of 'Improved' outcomes per group.

    Args:
        *group_columns: names of one or more columns of ``df`` to group by.

    Returns:
        A Series of percentages indexed by the group keys (a MultiIndex when
        several columns are given). Combinations that contain no patients are
        NaN rather than 0, so "no data" is not confused with "0% success".
    """
    keys = [df[column] for column in group_columns]
    grouper = keys[0] if len(keys) == 1 else keys
    # The mean of a boolean Series is the fraction of True values.
    # observed=False keeps every category of a categorical key (age_group) on
    # the axis and makes the result independent of the pandas default.
    return improved.groupby(grouper, observed=False).mean() * 100


treatment_success = _success_rate('recommended_treatment').sort_values(ascending=False)

print("Treatment Success Rates (% Improved):")
print("=" * 40)
for treatment, rate in treatment_success.items():
    print(f"{treatment}: {rate:.1f}%")

# Plot treatment effectiveness
plt.figure(figsize=(15, 10))

plt.subplot(2, 2, 1)
treatment_success.plot(kind='bar', color='skyblue')
plt.title('Treatment Success Rates (% Improved)')
plt.ylabel('Success Rate (%)')
plt.xticks(rotation=45)

# Treatment effectiveness by diagnosis.
# Missing combinations stay NaN; seaborn leaves those cells blank instead of
# drawing them as a 0% success rate.
plt.subplot(2, 2, 2)
treatment_diagnosis_success = _success_rate('diagnosis', 'recommended_treatment').unstack()

sns.heatmap(treatment_diagnosis_success, annot=True, fmt='.1f', cmap='RdYlGn')
plt.title('Treatment Success Rate by Diagnosis (%)')
plt.xlabel('Treatment')
plt.ylabel('Diagnosis')

# Treatment effectiveness by severity
plt.subplot(2, 2, 3)
treatment_severity_success = _success_rate('severity', 'recommended_treatment').unstack()

sns.heatmap(treatment_severity_success, annot=True, fmt='.1f', cmap='RdYlGn')
plt.title('Treatment Success Rate by Severity (%)')
plt.xlabel('Treatment')
plt.ylabel('Severity')

# Age impact on treatment success.
# The four groups are equal-width slices of the observed age range, so the
# labels are relative to this dataset rather than fixed age thresholds.
# The age_group column is kept on df for later cells.
plt.subplot(2, 2, 4)
df['age_group'] = pd.cut(df['age'], bins=4, labels=['Young', 'Adult', 'Middle-aged', 'Senior'])
age_treatment_success = _success_rate('age_group', 'recommended_treatment').unstack()

sns.heatmap(age_treatment_success, annot=True, fmt='.1f', cmap='RdYlGn')
plt.title('Treatment Success Rate by Age Group (%)')
plt.xlabel('Treatment')
plt.ylabel('Age Group')

plt.tight_layout()
plt.show()

## 7. Patient Segmentation Analysis

In [None]:
# Create patient risk profiles
def create_risk_profile(row):
    """Classify one patient record as 'Low Risk', 'Medium Risk' or 'High Risk'.

    Points are summed over three factors:
      * age: +2 above 65, otherwise +1 above 50
      * severity: +3 for 'Severe', +1 for 'Moderate'
      * previous treatment: +1 when a previous treatment is recorded

    A total of 0-1 is 'Low Risk', 2-3 is 'Medium Risk', 4 or more is
    'High Risk'.

    Args:
        row: mapping-like record (for example a DataFrame row) providing the
            'age', 'severity' and 'previous_treatment' entries.

    Returns:
        The risk label as a string.
    """
    risk_score = 0

    # Age factor
    age = row['age']
    if age > 65:
        risk_score += 2
    elif age > 50:
        risk_score += 1

    # Severity factor; any other value contributes nothing
    severity_points = {'Severe': 3, 'Moderate': 1}
    risk_score += severity_points.get(row['severity'], 0)

    # Previous treatment factor.
    # pandas.read_csv treats the text "None" as a missing value by default, so
    # "no previous treatment" can arrive here as NaN instead of the string
    # 'None'. NaN != 'None' is True, so missing values must be excluded
    # explicitly or every such patient would be scored as previously treated.
    previous_treatment = row['previous_treatment']
    if not pd.isna(previous_treatment) and previous_treatment != 'None':
        risk_score += 1

    # Categorize risk
    if risk_score <= 1:
        return 'Low Risk'
    if risk_score <= 3:
        return 'Medium Risk'
    return 'High Risk'

df['risk_profile'] = df.apply(create_risk_profile, axis=1)

# Fixed low-to-high ordering and one colour per level.
# pandas sorts group labels alphabetically ('High Risk', 'Low Risk',
# 'Medium Risk'), so a positional colour list would paint the high-risk bar
# green. Looking colours up by label keeps every panel consistent.
risk_colors = {'Low Risk': 'green', 'Medium Risk': 'orange', 'High Risk': 'red'}
present_levels = set(df['risk_profile'])
risk_levels = [level for level in risk_colors if level in present_levels]
level_colors = [risk_colors[level] for level in risk_levels]

# Analyze outcomes by risk profile
plt.figure(figsize=(15, 8))

# Outcome mix per risk level (each row sums to 1)
plt.subplot(1, 3, 1)
risk_outcome = pd.crosstab(df['risk_profile'], df['outcome'], normalize='index').reindex(risk_levels)
sns.heatmap(risk_outcome, annot=True, fmt='.2f', cmap='RdYlGn_r')
plt.title('Outcome Distribution by Risk Profile')

# Share of patients in each risk level
plt.subplot(1, 3, 2)
risk_counts = df['risk_profile'].value_counts().reindex(risk_levels)
risk_counts.plot(kind='pie', autopct='%1.1f%%', colors=level_colors)
plt.title('Risk Profile Distribution')
plt.ylabel('')

# Percentage of 'Improved' outcomes per risk level
plt.subplot(1, 3, 3)
risk_success = (
    df['outcome'].eq('Improved').groupby(df['risk_profile']).mean() * 100
).reindex(risk_levels)
risk_success.plot(kind='bar', color=level_colors)
plt.title('Success Rate by Risk Profile')
plt.ylabel('Success Rate (%)')
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

print("Risk Profile Analysis:")
print("=" * 30)
for risk in risk_levels:
    subset = df[df['risk_profile'] == risk]
    success_rate = subset['outcome'].eq('Improved').mean() * 100
    print(f"{risk}: {len(subset)} patients, {success_rate:.1f}% success rate")

## 8. Key Insights Summary

In [None]:
# Generate key insights
# This cell reads treatment_success, outcome_correlations and
# df['risk_profile'], which are created by earlier cells.
print("=" * 60)
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 60)

# Overall statistics
# The mean of a boolean Series is the fraction of True values.
total_patients = len(df)
overall_success_rate = df['outcome'].eq('Improved').mean() * 100

print("\n1. DATASET OVERVIEW:")
print(f"   - Total patients: {total_patients:,}")
print(f"   - Overall success rate: {overall_success_rate:.1f}%")
print(f"   - Age range: {df['age'].min()}-{df['age'].max()} years")
print(f"   - Average age: {df['age'].mean():.1f} years")

# Treatment insights
# treatment_success is sorted in descending order, so the first entry is the
# best treatment and the last entry is the worst.
best_treatment = treatment_success.index[0]
best_success_rate = treatment_success.iloc[0]
worst_treatment = treatment_success.index[-1]
worst_success_rate = treatment_success.iloc[-1]

print("\n2. TREATMENT EFFECTIVENESS:")
print(f"   - Most effective treatment: {best_treatment} ({best_success_rate:.1f}% success)")
print(f"   - Least effective treatment: {worst_treatment} ({worst_success_rate:.1f}% success)")
print("   - Treatment effectiveness varies by diagnosis and severity")

# Risk factors
# Build the high-risk mask once and use a vectorized comparison instead of an
# element-wise lambda.
is_high_risk = df['risk_profile'] == 'High Risk'
high_risk_patients = is_high_risk.sum()
high_risk_success = df.loc[is_high_risk, 'outcome'].eq('Improved').mean() * 100

print("\n3. RISK FACTORS:")
print(f"   - High-risk patients: {high_risk_patients} ({high_risk_patients/total_patients*100:.1f}%)")
print(f"   - High-risk success rate: {high_risk_success:.1f}%")
print("   - Key risk factors: Age >65, Severe condition, Previous treatments")

# Correlations
# outcome_correlations holds absolute values, so these numbers show strength
# only, not direction.
top_3_correlations = outcome_correlations.drop('outcome').head(3)
print("\n4. STRONGEST PREDICTORS OF OUTCOME:")
for i, (var, corr) in enumerate(top_3_correlations.items(), 1):
    print(f"   {i}. {var}: {corr:.3f} correlation")

print("\n5. RECOMMENDATIONS FOR MODEL BUILDING:")
print("   - Focus on high-correlation features for feature selection")
print("   - Consider interaction effects between age, severity, and diagnosis")
print("   - Implement risk-based stratification in model training")
print("   - Account for treatment-diagnosis combinations")

print("\nNext Steps: Proceed to Feature Engineering notebook")

In [None]:
# Save EDA results: write the working DataFrame, including the age_group and
# risk_profile columns added by earlier cells, without the index column
eda_output_path = '../data/processed/eda_data.csv'
df.to_csv(eda_output_path, index=False)
print(f"\nEDA results saved to '{eda_output_path}'")