Import Libraries

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point
import matplotlib.pyplot as plt

# Never truncate columns when a DataFrame is displayed, so every field is visible
pd.options.display.max_columns = None

print("Libraries loaded successfully!")

Libraries loaded successfully!


Load Fire Data

In [2]:
# Cell 2: Load the Fire Data (Robust Version)
# -------------------------------------------
# Reads the raw fire-detection CSV into `df`, normalises the column names and
# parses 'acq_date' into datetimes.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
file_path = r'C:\Users\pc\Desktop\ml assignment\Forest_Fire_Nepal\Data\raw\modis_nepal_2012_2016.csv'

df = pd.read_csv(file_path)

# Normalise column names (strip surrounding whitespace, force lowercase).
# This turns "ACQ_DATE" or " acq_date " into "acq_date".
df.columns = df.columns.str.strip().str.lower()

# Show the normalised names so a missing/misspelled column is easy to spot
print("Found Columns:", df.columns.tolist())

# Fail fast: the feature-engineering cell uses `.dt` on 'acq_date', so carrying
# on without a parsed date column would only surface as a confusing error later.
if 'acq_date' not in df.columns:
    raise KeyError(
        "ERROR: Still can't find 'acq_date'. Please check the 'Found Columns' list above."
    )

df['acq_date'] = pd.to_datetime(df['acq_date'])
print(f"Step 1: Raw Data Loaded. Total Rows: {len(df)}")

Found Columns: ['objectid', 'latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date', 'acq_time', 'satellite', 'confidence', 'version', 'bright_t31', 'frp', 'wardnumber', 'district', 'zone', 'vdc', 'pa', 'gridid', 'elevation', 'slope', 'lccode', 'landcover']
Step 1: Raw Data Loaded. Total Rows: 13781


Filter By Confidence

In [3]:
# Drop low-confidence detections (noise): keep only rows with confidence >= 50.
# .copy() yields an independent frame, so later column edits never touch `df`.
is_confident = df['confidence'].ge(50)
df_clean = df.loc[is_confident].copy()

print(f"Step 2: Low confidence data removed. Remaining Rows: {len(df_clean)}")

Step 2: Low confidence data removed. Remaining Rows: 11291


Create Geospatial Geometry

In [4]:
# Build point geometries from the longitude (x) / latitude (y) columns in a
# single vectorised call instead of creating Point objects one row at a time.
geometry = gpd.points_from_xy(df_clean['longitude'], df_clean['latitude'])

# Declare the coordinate system (WGS84, EPSG:4326) when the GeoDataFrame is
# built, rather than mutating it afterwards with set_crs(..., inplace=True).
geo_df = gpd.GeoDataFrame(df_clean, geometry=geometry, crs="EPSG:4326")

print("Step 3: Converted to Geospatial Data.")

Step 3: Converted to Geospatial Data.


Load Nepal Map

In [5]:

import os
import geopandas as gpd

# 1. Define path
# NOTE(review): absolute, machine-specific folder -- adjust before running elsewhere.
data_folder = r'C:\Users\pc\Desktop\ml assignment\Forest_Fire_Nepal\Data\raw'
target_file = os.path.join(data_folder, 'nepal_districts.shp')

# Fail fast when the shapefile is missing. Merely printing a message would
# leave `nepal_map` undefined (or stale from an earlier run), and the spatial
# join below would then break or silently use outdated data.
if not os.path.exists(target_file):
    raise FileNotFoundError(f"ERROR: Could not find {target_file}")

print(f"Loading map from: {target_file}")
nepal_map = gpd.read_file(target_file)

# 2. Make sure the map carries the same CRS as the fire points (EPSG:4326)
if nepal_map.crs is None:
    print("WARNING: Map has no Coordinate System (missing .prj file).")
    print(" -> Manually setting CRS to WGS84 (EPSG:4326).")
    # set_crs only *labels* the existing coordinates; it does not reproject.
    # NOTE(review): assumes the file's coordinates are already lon/lat degrees -- confirm.
    nepal_map = nepal_map.set_crs(epsg=4326)
else:
    # A CRS is present: reproject so it matches our standard
    nepal_map = nepal_map.to_crs(epsg=4326)

print("SUCCESS: Map loaded and CRS is valid!")


Loading map from: C:\Users\pc\Desktop\ml assignment\Forest_Fire_Nepal\Data\raw\nepal_districts.shp
 -> Manually setting CRS to WGS84 (EPSG:4326).
SUCCESS: Map loaded and CRS is valid!


Optimized Spatial Join

In [6]:
# -----------------------------------------
# Simplify the district polygons (tolerance 0.001, topology preserved) before
# joining, to speed up the calculation and prevent the "Interrupting Kernel"
# error. The simplified shapes overwrite the map's 'geometry' column.
# -----------------------------------------
simplified_shapes = nepal_map.geometry.simplify(0.001, preserve_topology=True)
nepal_map['geometry'] = simplified_shapes

# Inner spatial join with the "within" predicate: keeps only the fire points
# that fall inside one of the map polygons.
nepal_fires = gpd.sjoin(
    left_df=geo_df,
    right_df=nepal_map,
    predicate="within",
    how="inner",
)

print(f"Step 5: Spatially Filtered. Final Valid Fires: {len(nepal_fires)}")

Step 5: Spatially Filtered. Final Valid Fires: 11291


Feature Engineering

In [7]:
# 1. Calendar features taken from the acquisition date
acq_dt = nepal_fires['acq_date'].dt
nepal_fires['month'] = acq_dt.month
nepal_fires['year'] = acq_dt.year

# 2. Coordinates converted from degrees to radians (CRITICAL for HDBSCAN later)
for deg_col, rad_col in (('latitude', 'lat_rad'), ('longitude', 'lon_rad')):
    nepal_fires[rad_col] = np.radians(nepal_fires[deg_col])

print("Step 6: Features Created (Month, Year, Radians).")

Step 6: Features Created (Month, Year, Radians).


Save The Clean Data

In [8]:
import os

# Cell 8: Save to CSV (Fixed for your folder structure)
# -------------------
# Use the first 'data' folder that exists: './data' when running from the main
# project folder, '../data' when running from inside the notebooks folder.
# If neither exists, create './data' and use that.

for data_dir in ('data', '../data'):
    if os.path.exists(data_dir):
        break
else:
    # Fallback: no candidate folder was found, so create one
    data_dir = 'data'
    os.makedirs(data_dir, exist_ok=True)

output_path = f'{data_dir}/cleaned_fires_nepal.csv'

# Write the cleaned fires table without the index column
nepal_fires.to_csv(output_path, index=False)

print(f"SUCCESS: Preprocessing complete. Saved to: {output_path}")

SUCCESS: Preprocessing complete. Saved to: data/cleaned_fires_nepal.csv
