In [None]:
import pandas as pd
# Verify X and y and train model robustly

In [5]:
df = pd.read_csv(r'C:\Users\hiyan\OneDrive\Desktop\Projects\deepdata\flood_risk_dataset.csv')

In [15]:
risk_mapping = {'low_risk': 1, 'medium_risk': 2, 'high_risk': 3, 'low_lying_flooding': 4}
df['risk_score'] = df['risk_labels'].map(risk_mapping)

In [27]:
# Optional: recreate a fallback target by filling inputs and recomputing hazard_index
import pandas as pd

if 'df' not in globals():
    raise NameError("df is not defined. Run the CSV load cell first.")

print('Before fill, hazard_index non-null:', df['hazard_index'].notna().sum() if 'hazard_index' in df.columns else 'no hazard_index')

# Fill elevation and rainfall with medians if present
for col in ['elevation_m','historical_rainfall_intensity_mm_hr']:
    if col in df.columns:
        nnull = df[col].notna().sum()
        if nnull < len(df):
            med = df[col].median(skipna=True)
            df[col] = df[col].fillna(med)
            print(f"Filled missing {col} with median = {med}")

if 'elevation_m' in df.columns and 'historical_rainfall_intensity_mm_hr' in df.columns:
    df['hazard_index'] = df['elevation_m'] * df['historical_rainfall_intensity_mm_hr']
    print('Recomputed hazard_index; non-null:', df['hazard_index'].notna().sum())

# Create fallback categorical target from hazard_index tertiles if risk_labels missing
if 'risk_labels' not in df.columns:
    if 'hazard_index' in df.columns and df['hazard_index'].notna().sum() > 0:
        df['risk_auto'] = pd.qcut(df['hazard_index'].rank(method='first'), q=3, labels=['low_risk','medium_risk','high_risk'])
        print("Created 'risk_auto' target from hazard_index tertiles. Non-null:", df['risk_auto'].notna().sum())
    else:
        print("Cannot create fallback target: no usable hazard_index")

print('\nAfter changes, target availability:')
for t in ['risk_labels','risk_score','risk_auto']:
    print(t, '->', (df[t].notna().sum() if t in df.columns else 'not present'))

Before fill, hazard_index non-null: 2802
Filled missing elevation_m with median = 25.130000000000003
Recomputed hazard_index; non-null: 2963
Created 'risk_auto' target from hazard_index tertiles. Non-null: 2963

After changes, target availability:
risk_labels -> not present
risk_score -> 0
risk_auto -> 2963


In [30]:
# Robust training cell: uses available target and trains only if samples exist
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

if 'df' not in globals():
    raise NameError("df is not defined. Run the CSV load cell first.")

# Show current target availability
for t in ['risk_labels','risk_score','risk_auto']:
    print(f"{t}: present={t in df.columns}, non-null={(df[t].notna().sum() if t in df.columns else 0)}")

# Repair risk_labels if it's multi-label (pipe-separated) by choosing the first meaningful label
if 'risk_labels' in df.columns:
    # Convert empty strings to NaN
    df['risk_labels'] = df['risk_labels'].replace({'': pd.NA})
    # If entries contain pipes, pick the first token (most-specific may be first)
    def pick_label(val):
        if pd.isna(val):
            return val
        if '|' in str(val):
            # preference map: map known keywords to canonical classes
            parts = [p.strip() for p in str(val).split('|') if p.strip()]
            # mapping examples
            for pref in ['extreme_rain_history','ponding_hotspot','low_lying','event','sparse_drainage']:
                for p in parts:
                    if pref in p:
                        return pref
            # otherwise return first part
            return parts[0]
        return val
    df['risk_labels'] = df['risk_labels'].apply(pick_label)
    print('After normalizing, risk_labels non-null:', df['risk_labels'].notna().sum())

# If no usable target, try to build risk_auto from hazard_index
if not any(c in df.columns and df[c].notna().sum() > 0 for c in ['risk_labels','risk_score','risk_auto']):
    print('No existing labeled target found; attempting to build `risk_auto` from inputs...')
    # Fill numeric inputs with medians if missing
    for col in ['elevation_m','historical_rainfall_intensity_mm_hr']:
        if col in df.columns:
            med = df[col].median(skipna=True)
            df[col] = df[col].fillna(med)
            print(f'Filled {col} with median={med}')
    if 'elevation_m' in df.columns and 'historical_rainfall_intensity_mm_hr' in df.columns:
        df['hazard_index'] = df['elevation_m'] * df['historical_rainfall_intensity_mm_hr']
        # create tertiles
        df['risk_auto'] = pd.qcut(df['hazard_index'].rank(method='first'), q=3, labels=['low_risk','medium_risk','high_risk'])
        print('Created risk_auto from hazard_index; non-null:', df['risk_auto'].notna().sum())

# Pick target
if 'risk_labels' in df.columns and df['risk_labels'].notna().sum() > 0:
    target_col = 'risk_labels'
elif 'risk_score' in df.columns and df['risk_score'].notna().sum() > 0:
    target_col = 'risk_score'
elif 'risk_auto' in df.columns and df['risk_auto'].notna().sum() > 0:
    target_col = 'risk_auto'
else:
    raise KeyError('No target column available after repair attempts. Inspect the dataset.')

print('Selected target:', target_col)

y = df[target_col].copy()
X = df.drop(columns=[c for c in ['risk_labels','risk_score','risk_auto'] if c in df.columns])

# Drop missing target rows
mask = y.notna()
X = X.loc[mask].copy()
y = y.loc[mask].copy()
print('Samples available for training after mask:', len(y))
if len(y) == 0:
    raise ValueError('No samples after dropping missing target even after repair attempts')

# If numeric risk_score, bin into categories for classification
if target_col == 'risk_score' and y.dtype.kind in 'if':
    y = pd.qcut(y.rank(method='first'), q=3, labels=['low_risk','medium_risk','high_risk'])
    print('Binned numeric risk_score into categorical target')

# One-hot encode categorical features
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
if len(categorical_cols) > 0:
    X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
else:
    X_encoded = X.copy()

# Convert remaining object dtypes to numeric codes
for col in X_encoded.columns:
    if X_encoded[col].dtype == 'O':
        X_encoded[col] = X_encoded[col].astype('category').cat.codes

# Decide stratify argument safely
stratify_arg = None
if y.nunique() >= 2:
    vc = y.value_counts()
    if (vc >= 2).all():
        stratify_arg = y
    else:
        print('Not stratifying: at least one class has <2 samples')

print('X_encoded shape:', X_encoded.shape)
print('y distribution:\n', y.value_counts())

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=stratify_arg)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print('\nClassification report:')
print(classification_report(y_test, y_pred))

fi = pd.Series(model.feature_importances_, index=X_encoded.columns).sort_values(ascending=False)
print('\nTop 10 feature importances:\n', fi.head(10))

risk_labels: present=False, non-null=0
risk_score: present=True, non-null=0
risk_auto: present=True, non-null=2963
Selected target: risk_auto
Samples available for training after mask: 2963
X_encoded shape: (2963, 3318)
y distribution:
 risk_auto
low_risk       988
high_risk      988
medium_risk    987
Name: count, dtype: int64

Classification report:
              precision    recall  f1-score   support

   high_risk       0.99      1.00      1.00       198
    low_risk       0.99      1.00      0.99       198
 medium_risk       1.00      0.98      0.99       197

    accuracy                           0.99       593
   macro avg       0.99      0.99      0.99       593
weighted avg       0.99      0.99      0.99       593


Top 10 feature importances:
 hazard_index                           0.326372
elevation_m                            0.192974
historical_rainfall_intensity_mm_hr    0.037630
longitude                              0.021636
latitude                               0.02

In [33]:
import folium

# Determine which target column to use for visualization
if 'risk_labels' in df.columns and df['risk_labels'].notna().sum() > 0:
    viz_col = 'risk_labels'
elif 'risk_auto' in df.columns and df['risk_auto'].notna().sum() > 0:
    viz_col = 'risk_auto'
elif 'risk_score' in df.columns and df['risk_score'].notna().sum() > 0:
    viz_col = 'risk_score'
else:
    viz_col = None
    print('Warning: no risk label column available; map will use gray markers')

# Create a base map centered at the average location of the data points
map_center = [df['latitude'].mean(), df['longitude'].mean()]
flood_map = folium.Map(location=map_center, zoom_start=12)

# Define a color scheme for the risk labels
color_map = {
    'low_risk': 'green',
    'medium_risk': 'orange',
    'high_risk': 'red',
    'low_lying_flooding': 'darkred'
}

# Add markers for each data point safely
for index, row in df.iterrows():
    label = None if viz_col is None else row.get(viz_col, None)
    # If label is numeric risk_score, bin it for colors
    if viz_col == 'risk_score' and label is not None:
        try:
            # create a temporary binning
            if float(label) <= df['risk_score'].quantile(0.33):
                label = 'low_risk'
            elif float(label) <= df['risk_score'].quantile(0.66):
                label = 'medium_risk'
            else:
                label = 'high_risk'
        except Exception:
            label = None

    risk_color = color_map.get(label, 'gray')

    # Safe numeric formatting with fallbacks
    def fmt_float(val):
        try:
            return f"{float(val):.2f}"
        except Exception:
            return 'N/A'

    elevation = fmt_float(row.get('elevation_m', None))
    rainfall = fmt_float(row.get('historical_rainfall_intensity_mm_hr', None))
    drainage = fmt_float(row.get('drainage_density_km_per_km2', None))
    land_use = row.get('land_use', 'N/A')

    popup_text = (
        f"<b>Risk:</b> {label if label is not None else 'unknown'}<br>"
        f"<b>Elevation:</b> {elevation}m<br>"
        f"<b>Rainfall:</b> {rainfall}mm/hr<br>"
        f"<b>Drainage Density:</b> {drainage}<br>"
        f"<b>Land Use:</b> {land_use}"
    )

    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=5,
        color=risk_color,
        fill=True,
        fill_color=risk_color,
        fill_opacity=0.7,
        tooltip=str(label) if label is not None else None,
        popup=popup_text
    ).add_to(flood_map)

# Save the map to an HTML file
flood_map.save("urban_flood_risk_dashboard.html")
print("Interactive dashboard saved as urban_flood_risk_dashboard.html")

Interactive dashboard saved as urban_flood_risk_dashboard.html
