In [1]:
# Setup
import pandas as pd

noaa_fire_data_df = pd.read_csv('./RDS-2021-0100/Data/Fire_Weather_Data_2002-2014_2016.csv')

Question 1: According to NOAA, on average, how correlated are the size of a fire and the median wind speed in an area?

In [2]:
noaa_fire_july = noaa_fire_data_df[(noaa_fire_data_df['start_day_of_year'] >= 181) & (noaa_fire_data_df['start_day_of_year'] < 213)]
correlation = noaa_fire_july['wind_med'].corr(noaa_fire_july['hec'])
print(f"The correlation coefficient is {correlation}")
# The correlation is not strong

The correlation coefficient is -0.010026894689326483


Question 2: According to NOAA, is there a difference between the breakdown of causes of fires between geographical regions for fires that start in the months of January, February, and March?

In [3]:
noaa_fire_first_quarter = noaa_fire_data_df[(noaa_fire_data_df['start_day_of_year'] <= 92) & (noaa_fire_data_df['cause'] != 'U')] # Filters out unknown causes
regions = ['California', 'Great Basin', 'Northwest', 'Inland Empire', 'Rocky Mountains', 'Southwest']
breakdowns = []
for region in regions:
    print(f"Investigating {region}...")
    region_df = noaa_fire_first_quarter[noaa_fire_first_quarter['region'] == region]
    print(len(region_df))
    natural_percentage = (region_df['cause'] == 'N').mean() * 100
    lightning_percentage = (region_df['cause'] == 'L').mean() * 100
    human_percentage = (region_df['cause'] == 'H').mean() * 100
    other_percentage = (region_df['cause'] == 'O').mean() * 100
    print(natural_percentage, lightning_percentage, human_percentage, other_percentage)

Investigating California...
12
0.0 8.333333333333332 91.66666666666666 0.0
Investigating Great Basin...
2
0.0 0.0 100.0 0.0
Investigating Northwest...
3
0.0 0.0 100.0 0.0
Investigating Inland Empire...
7
14.285714285714285 0.0 85.71428571428571 0.0
Investigating Rocky Mountains...
0
nan nan nan nan
Investigating Southwest...
133
3.007518796992481 4.511278195488721 90.97744360902256 0.0


In [4]:
# Setup for joining with detailed weather data

weather_data_df = pd.read_csv('WeatherEvents_Jan2016-Dec2022.csv')
weather_station_data_df = pd.read_csv('PublicView_RAWS_-3515561676727363726.csv')


Question 3: What is the average elevation (in feet) of the weather stations used for fire site monitoring in the noaa dataset?

In [5]:
# Get unique weather stations and clean by converting to string and padding to 6 digits for joining.
firewatch_stations = weather_station_data_df['NWS ID'].dropna().unique().astype('str')
firewatch_stations_df = pd.DataFrame(firewatch_stations, columns=['firewatch_station_id'])
def pad_id(id_str):
    if isinstance(id_str, str) and id_str.isdigit() and len(id_str) == 6:
        return id_str
    elif isinstance(id_str, str):
        return id_str.zfill(6)
    else:
        return id_str
firewatch_stations_df['firewatch_station_id'] = firewatch_stations_df['firewatch_station_id'].apply(pad_id)
print(len(firewatch_stations_df))
# Join with NMS ID, the system needs to figure out that the joinable column from the RWAS stations dataset is NWS ID
merged_stations_df = pd.merge(firewatch_stations_df, weather_station_data_df, left_on='firewatch_station_id', right_on='NWS ID', how='left')
merged_stations_df = merged_stations_df.dropna()
print(merged_stations_df['Elevation'].mean())

2136
3902.1013071895427


Question 4: Controlling for the weather, do more aggressive suppression actually contribute to fire ending faster and affecting less buildings?

In [6]:
%pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Data cleaning
import numpy as np
noaa_fire_data_df['dominant_strategy_25_indicator'] = np.where(noaa_fire_data_df['dominant_strategy_25_s'] == "Full Suppression", 1, 0)
noaa_fire_data_df['dominant_strategy_50_indicator'] = np.where(noaa_fire_data_df['dominant_strategy_50_s'] == "Full Suppression", 1, 0)
noaa_fire_data_df['dominant_strategy_75_indicator'] = np.where(noaa_fire_data_df['dominant_strategy_75_s'] == "Full Suppression", 1, 0)

import statsmodels
import statsmodels.api as sm
noaa_fire_data_df = noaa_fire_data_df.dropna()
y = noaa_fire_data_df['duration']
x_25 = noaa_fire_data_df[['dominant_strategy_25_indicator', 'avrh_mean', 'wind_med', 'erc_med', 'rain_sum', 'hec']]
x_25 = sm.add_constant(x_25)
model = sm.OLS(y, x_25).fit(cov_type='HC3')
print(model.summary())

x_50 = noaa_fire_data_df[['dominant_strategy_50_indicator', 'avrh_mean', 'wind_med', 'erc_med', 'rain_sum', 'hec']]
x_50 = sm.add_constant(x_50)
model = sm.OLS(y, x_50).fit(cov_type='HC3')
print(model.summary())

x_75 = noaa_fire_data_df[['dominant_strategy_75_indicator', 'avrh_mean', 'wind_med', 'erc_med', 'rain_sum', 'hec']]
x_75 = sm.add_constant(x_75)
model = sm.OLS(y, x_75).fit(cov_type='HC3')
print(model.summary())

y_2 = noaa_fire_data_df['prim_threatened_aggregate']
model = sm.OLS(y_2, x_25).fit(cov_type='HC3')
print(model.summary())
model = sm.OLS(y_2, x_50).fit(cov_type='HC3')
print(model.summary())
model = sm.OLS(y_2, x_75).fit(cov_type='HC3')
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               duration   R-squared:                       0.607
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     302.7
Date:                Thu, 24 Apr 2025   Prob (F-statistic):          2.03e-313
Time:                        12:01:51   Log-Likelihood:                -14603.
No. Observations:                3521   AIC:                         2.922e+04
Df Residuals:                    3514   BIC:                         2.926e+04
Df Model:                           6                                         
Covariance Type:                  HC3                                         
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

Question 5: In 2016, what percentage of fires were brought under control with it raining in the fire area on the same or previous day?

In [8]:
%pip install timezonefinder
%pip install geopy

Note: you may need to restart the kernel to use updated packages.
Collecting geopy
  Using cached geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Using cached geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Using cached geopy-2.4.1-py3-none-any.whl (125 kB)
Using cached geographiclib-2.0-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1
Note: you may need to restart the kernel to use updated packages.


In [9]:
import datetime
import pytz
import timezonefinder

tf = timezonefinder.TimezoneFinder()
def get_timezone(lat, lng):
    return tf.timezone_at(lat=lat, lng=lng)
weather_data_2016_df = weather_data_df[weather_data_df['StartTime(UTC)'].str.startswith('2016')]
def convert_starttime_utc_to_local(row):
    utc_time = pd.to_datetime(row['StartTime(UTC)'], utc=True)
    local_timezone = pytz.timezone(row['TimeZone'])
    # local_time = datetime.datetime(utc_time, tzinfo=local_timezone)
    local_time = utc_time.astimezone(local_timezone)
    return local_time


weather_data_2016_df['StartTime(local)'] = weather_data_2016_df.apply(convert_starttime_utc_to_local, axis=1)

def convert_endtime_utc_to_local(row):
    utc_time = pd.to_datetime(row['EndTime(UTC)'], utc=True)
    local_timezone = pytz.timezone(row['TimeZone'])
    # local_time = datetime.datetime(utc_time, tzinfo=local_timezone)
    local_time = utc_time.astimezone(local_timezone)
    return local_time
weather_data_2016_df['EndTime(local)'] = weather_data_2016_df.apply(convert_endtime_utc_to_local, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_data_2016_df['StartTime(local)'] = weather_data_2016_df.apply(convert_starttime_utc_to_local, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_data_2016_df['EndTime(local)'] = weather_data_2016_df.apply(convert_endtime_utc_to_local, axis=1)


In [10]:
from dateutil import parser
weather_data_2016_df['start_day_of_the_year'] = weather_data_2016_df['StartTime(local)'].apply(lambda x: x.timetuple().tm_yday)
weather_data_2016_df['end_day_of_the_year'] = weather_data_2016_df['EndTime(local)'].apply(lambda x: x.timetuple().tm_yday)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_data_2016_df['start_day_of_the_year'] = weather_data_2016_df['StartTime(local)'].apply(lambda x: x.timetuple().tm_yday)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_data_2016_df['end_day_of_the_year'] = weather_data_2016_df['EndTime(local)'].apply(lambda x: x.timetuple().tm_yday)


In [None]:
import math
from geopy import distance

noaa_fire_data_2016_df = noaa_fire_data_df[noaa_fire_data_df['control_year'] == 2016]
noaa_fire_data_2016_df['control_day_previous_day'] = noaa_fire_data_2016_df['control_day_of_year'] - 1
noaa_fire_data_2016_df['rained'] = False


for idx, fire_row in noaa_fire_data_df.iterrows():
    control_day = fire_row['control_day_of_year']
    control_day_prev = control_day - 1

    # Query weather data for matching day or previous day
    weather_matches = weather_data_2016_df.query(
        "(start_day_of_the_year <= @control_day_prev and end_day_of_the_year >= @control_day_prev) or (start_day_of_the_year <= @control_day and end_day_of_the_year >= @control_day)"
    )

    if not weather_matches.empty:
        
        # Do something with each matched weather row
        for _, weather_row in weather_matches.iterrows():
            # Example: extract a field, compute a stat, or store it
            if weather_row['Type'] != "Rain":
                continue
            weather_location = (weather_row['LocationLat'], weather_row['LocationLng'])
            fire_location = (fire_row['latitude'], fire_row['longitude'])
            fire_area_sqkm = fire_row['hec'] / 100
            threshold = math.sqrt(fire_area_sqkm)
            dist_km = distance.distance(weather_location, fire_location).km
            if dist_km <= fire_area_sqkm and weather_row['Precipitation(in)'] > 0.05:
                noaa_fire_data_2016_df.at[idx, 'rained'] = True

q5_percentage = (noaa_fire_data_2016_df['rained'].mean()) * 100
print(f"The percentage of 2016 fire that got under control with the help of a rain is {q5_percentage:.2f}%")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noaa_fire_data_2016_df['control_day_previous_day'] = noaa_fire_data_2016_df['control_day_of_year'] - 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noaa_fire_data_2016_df['rained'] = False
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noaa_fire_data_2016_df.at[idx, 'rained'] = True


The percentage of 2016 fire that got under control with the help of a rain is 0.21%
