In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr
import os
from pathlib import Path

# Set up paths
current_dir = Path.cwd()
figures_dir = current_dir.parent/"figures"
os.makedirs(figures_dir, exist_ok=True)

# Load data with proper type handling
df = pd.read_csv("../data/processed/transformed_flood_data.csv")

# 1. PREPROCESSING --------------------------------
# Select only numeric columns for correlation
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Verify we have expected columns
expected_cols = ['Rainfall_mm', 'Drainage_Efficiency', 'Urbanization_Rate', 
                 'Flood_Occurrence', 'Flood_Risk_Index']
missing_cols = [col for col in expected_cols if col not in numeric_cols]
if missing_cols:
    print(f"⚠️ Missing numeric columns: {missing_cols}")

# 2. CORRELATION MATRIX (NUMERIC ONLY) ------------
plt.figure(figsize=(12, 10))
corr_matrix = df[numeric_cols].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Mask upper triangle

sns.heatmap(corr_matrix, 
            annot=True, 
            fmt=".2f", 
            cmap="coolwarm", 
            vmin=-1, 
            vmax=1,
            mask=mask)
plt.title("Numeric Variable Correlation Matrix")
plt.tight_layout()
plt.savefig(figures_dir/"correlation_matrix.png", bbox_inches='tight')
plt.close()

# 3. SCATTERPLOT MATRIX --------------------------
# Convert Flood_Occurrence to categorical if numeric
if pd.api.types.is_numeric_dtype(df['Flood_Occurrence']):
    df['Flood_Occurrence'] = df['Flood_Occurrence'].astype('category')

sns.pairplot(df[numeric_cols], 
             hue='Flood_Occurrence', 
             diag_kind='kde',
             plot_kws={'alpha': 0.6})
plt.suptitle("Multimodal Relationships", y=1.02)
plt.savefig(figures_dir/"scatterplot_matrix.png", bbox_inches='tight')
plt.close()

# 4. FLOOD RISK ANALYSIS -------------------------
plt.figure(figsize=(12, 6))
sns.boxplot(x='Flood_Occurrence', 
            y='Flood_Risk_Index', 
            data=df,
            showfliers=False)  # Remove outliers for clearer visualization
plt.title("Flood Risk Index vs. Actual Flood Occurrence")
plt.tight_layout()
plt.savefig(figures_dir/"flood_risk_comparison.png")
plt.close()

# 5. STATE-WISE ANALYSIS -------------------------
if 'State' in df.columns:
    state_risk = df.groupby('State')['Flood_Risk_Index'].mean().sort_values(ascending=False)
    
    plt.figure(figsize=(12, 8))
    sns.barplot(x=state_risk.head(10).values, 
                y=state_risk.head(10).index,
                palette="Blues_d")
    plt.title("Top 10 Flood-Prone States")
    plt.xlabel("Average Flood Risk Index")
    plt.tight_layout()
    plt.savefig(figures_dir/"top_flood_states.png")
    plt.close()
else:
    print("⚠️ 'State' column not found - skipping state-wise analysis")

print(f"✅ Analysis complete. Figures saved to: {figures_dir}")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=state_risk.head(10).values,


✅ Analysis complete. Figures saved to: /Users/raheeminioluwa/Documents/Flood-EDA-Nigeria/figures
