In [None]:
pip install category_encoders

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn

sklearn.set_config(transform_output="pandas")


In [None]:
# Mount Google Drive in the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the wildfire + weather dataset from the mounted Drive folder.
fires_csv_path = '/content/drive/MyDrive/6201 Project/Wildfires with Weather Data 2015 to 2025 no cloudiness.csv'
fires = pd.read_csv(fires_csv_path)

In [None]:
# Preview the loaded frame.
fires

In [None]:
# Report how many fire records were loaded.
n_rows = fires.shape[0]
print("Number of rows:", n_rows)

In [None]:
# Rows that contain at least one null value.
has_missing = fires.isna().any(axis=1)
missing_rows = fires.loc[has_missing]
print(f"Rows with missing values: {missing_rows.shape[0]}")

# Null count for every column.
missing_per_column = fires.isna().sum()
print(missing_per_column)

In [None]:
# Remove the DISCOVERY_TIME column and persist the result; later cells re-read
# 'filtered_fires.csv'.
# errors="ignore" keeps this cell idempotent: re-running it after the column has
# already been dropped no longer raises KeyError.
fires = fires.drop(columns=["DISCOVERY_TIME"], errors="ignore")

fires.to_csv("filtered_fires.csv", index=False)

In [None]:
# Number of fires in each size class, ordered by class label.
size_class_counts = fires['FIRE_SIZE_CLASS'].value_counts().sort_index()

# Bar chart with horizontal tick labels.
fig, ax = plt.subplots(figsize=(8, 5))
size_class_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_title('Distribution of Fire Size Classes (All Fires)')
ax.set_xlabel('Fire Size Class')
ax.set_ylabel('Number of Fires')
fig.tight_layout()
plt.show()

In [None]:
# Histogram of raw fire sizes.
fig, ax = plt.subplots(figsize=(10, 5))
fires['FIRE_SIZE'].hist(bins=50, ax=ax)
ax.set_title('Distribution of Fire Sizes')
ax.set_xlabel('Fire Size (acres)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of log(1 + fire size); log1p is applied to the whole column at once.
log_fire_size = np.log1p(fires['FIRE_SIZE'])

fig, ax = plt.subplots(figsize=(10, 5))
log_fire_size.hist(bins=50, ax=ax)
ax.set_title('Log-Transformed Distribution of Fire Sizes')
ax.set_xlabel('log(1 + Fire Size in Acres)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Fire count for each year, in chronological order.
fires_per_year = fires['FIRE_YEAR'].value_counts().sort_index()
fires_per_year.plot.bar(title='Fires Per Year', figsize=(10, 5))

In [None]:
# Fire count for each state, most frequent first (value_counts order).
fires_by_state = fires['STATE'].value_counts()
fires_by_state.plot.bar(title='Fires by State', figsize=(12, 5))

In [None]:
import seaborn as sns

# Histogram (52 bins) with a KDE overlay of the day of year on which fires were discovered.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(fires['DISCOVERY_DOY'], bins=52, kde=True, ax=ax)
ax.set_title("Wildfire Discoveries by Day of Year")
ax.set_xlabel("Day of Year")
ax.set_ylabel("Number of Fires")
plt.show()

In [None]:
# Geographic distribution of wildfires drawn over an outline of the United States.
# JM - added some features
# plot
import matplotlib.pyplot as plt

# data
import pandas as pd
import geopandas as gpd
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

# world data (Replace with your path)
world = gpd.read_file(
    "ne_110m_admin_0_countries.shp"
)

# Keep only the United States polygon.
us = world[world['CONTINENT'] == 'North America']
usa = us[us['NAME'] == 'United States of America']

fig, ax = plt.subplots(figsize=(10, 10))
# Fixed longitude / latitude window for the map.
ax.set_xlim(-170, -65)
ax.set_ylim(24, 72)

# Background map.
usa.plot(
    ax=ax,
    color='#88b394',
    edgecolor='black',
    linewidth=0.5,
)
ax.set_title("Geographic Distribution of Wildfires")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

# Draw every fire with a single scatter call. The earlier
# `for i, df in enumerate(fires)` loop iterated over the DataFrame's column
# names and re-drew this identical point cloud once per column.
ax.scatter(
    fires['LONGITUDE'],
    fires['LATITUDE'],
    color='red',
    s=0.01,
)

In [None]:
import numpy as np

# Histogram of log(1 + fire size), matching the axis label.
# The previous expression np.log(size) + 1 computed log(size) + 1 instead,
# which is -inf for zero-acre records.
plt.figure(figsize=(10,5))
sns.histplot(np.log1p(fires['FIRE_SIZE']), bins=50)
plt.title("Distribution of Fire Size (Log Scale)")
plt.xlabel("Log(Fire Size + 1)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Fire count for each general cause, most frequent first.
cause_counts = fires['NWCG_GENERAL_CAUSE'].value_counts()

ax = cause_counts.plot.bar(figsize=(10, 5), title='Wildfires by General Cause', rot=45)
ax.set_xlabel("Cause")
ax.set_ylabel("Number of Fires")
plt.show()

In [None]:
# Keep only records that have a general cause.
has_cause = fires['NWCG_GENERAL_CAUSE'].notna()
filtered_fires = fires[has_cause]

# Year x cause table of fire counts; combinations that never occur become 0.
cause_by_year = (
    filtered_fires
    .groupby(['FIRE_YEAR', 'NWCG_GENERAL_CAUSE'])
    .size()
    .unstack()
    .fillna(0)
)

# Stacked area chart with the legend placed outside the axes.
ax = cause_by_year.plot(kind='area', stacked=True, figsize=(12, 6))
ax.set_title("Trends in Fire Causes Over Time")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Fires")
ax.legend(title='General Cause', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# The ten states with the most fire records.
top_states = fires['STATE'].value_counts().head(10)

ax = top_states.plot.bar(title='Top 10 States by Number of Fires', figsize=(10, 5))
ax.set_xlabel("State")
ax.set_ylabel("Number of Fires")
plt.show()

In [None]:
# Grouped bar chart: number of fires per year within each fire size class.
fires = pd.read_csv('filtered_fires.csv')
df = fires

# Only the first YEARS_PER_CLASS (class, year) rows of each class are drawn, so one
# class's bars (YEARS_PER_CLASS * BAR_WIDTH wide) fit inside that class's x-axis slot.
YEARS_PER_CLASS = 7
BAR_WIDTH = 0.12
BACKGROUND_COLOR = '#f8f8f8'

# Fires for every (size class, year) pair; groupby returns both keys in ascending order.
fire_counts = df.groupby(['FIRE_SIZE_CLASS', 'FIRE_YEAR']).size().reset_index(name='COUNT')

# First YEARS_PER_CLASS years of every size class.
plot_df = (
    fire_counts
    .groupby('FIRE_SIZE_CLASS')
    .head(YEARS_PER_CLASS)
    .reset_index(drop=True)
)

# Subdued color palette, one color per year.
subdued_colors = [
    '#8da290',  # muted sage green
    '#c0a98e',  # taupe
    '#7a94ab',  # dusty blue
    '#9c8aa5',  # muted lavender
    '#8b9e84',  # olive
    '#b18e92',  # dusty rose
    '#a0a0a0',  # gray
    '#c9b27c',  # sand
    '#7d929e',  # slate
    '#94867d',  # warm gray
    '#8e8ca3',  # dusty purple
    '#ad9d7f'   # khaki
]

fire_classes = sorted(plot_df['FIRE_SIZE_CLASS'].unique())
all_years = sorted(plot_df['FIRE_YEAR'].unique())
# Cycle through the palette so a year beyond the palette length still gets a color.
year_colors = {
    year: subdued_colors[k % len(subdued_colors)]
    for k, year in enumerate(all_years)
}

# Colors are set on this figure only. The earlier plt.rcParams.update(...) ran after
# plt.figure(), so it did not style this figure's background and instead changed the
# defaults for every later plot in the notebook.
fig, ax = plt.subplots(figsize=(8, 6), facecolor=BACKGROUND_COLOR)
ax.set_facecolor(BACKGROUND_COLOR)

index = np.arange(len(fire_classes))
legend_handles = {}  # year -> first bar drawn for that year

for class_pos, fire_class in enumerate(fire_classes):
    class_rows = plot_df[plot_df['FIRE_SIZE_CLASS'] == fire_class]
    for slot, (year, count) in enumerate(zip(class_rows['FIRE_YEAR'], class_rows['COUNT'])):
        # Offset each bar so the whole group is centred on the class tick.
        x = index[class_pos] + (slot - (YEARS_PER_CLASS - 1) / 2) * BAR_WIDTH
        bar = ax.bar(
            x,
            count,
            width=BAR_WIDTH,
            color=year_colors[year],
            edgecolor='white',
            linewidth=0.5,
        )
        legend_handles.setdefault(year, bar)

        # Data label just above the bar.
        ax.text(
            x,
            count + 5,
            f"{int(count)}",
            ha='center',
            va='bottom',
            fontsize=10,
            color='#505050',
        )

# One legend entry per year, in chronological order.
legend_years = sorted(legend_handles)
ax.legend(
    [legend_handles[year] for year in legend_years],
    [str(year) for year in legend_years],
    title='Year',
    fontsize=12,
    title_fontsize=14,
    loc='upper right',
)

# Titles, axis labels and ticks.
ax.set_title('Number of Fires per Year by Severity', fontsize=18, pad=20, color='#404040')
ax.set_xlabel('Fire Size Class (D=Smaller to G=Largest)', fontsize=14, labelpad=10, color='#505050')
ax.set_ylabel('Number of Fires', fontsize=14, labelpad=10, color='#505050')
ax.set_xticks(index)
ax.set_xticklabels(fire_classes, color='#505050')
ax.tick_params(axis='y', labelcolor='#505050')
ax.grid(axis='y', linestyle='--', alpha=0.3, color='#909090')

# Subtle border: hide the top/right spines and soften the others.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_color('#d0d0d0')
ax.spines['bottom'].set_color('#d0d0d0')

fig.tight_layout()
plt.show()

In [None]:
# Fire density: fires per 1000 square miles for each state, top 10 shown.
fires1 = pd.read_csv('filtered_fires.csv')

# State area in square miles, keyed by postal abbreviation.
state_areas = {
    'AK': 570641, 'AL': 52420, 'AR': 52035, 'AZ': 113594, 'CA': 155779,
    'CO': 103642, 'CT': 4845,  'DE': 1949,  'FL': 53625, 'GA': 57513,
    'HI': 6423,   'IA': 55857, 'ID': 82743, 'IL': 55584, 'IN': 35826,
    'KS': 81814,  'KY': 39728, 'LA': 43562, 'MA': 7800,  'MD': 9775,
    'ME': 30843,  'MI': 56804, 'MN': 79627, 'MO': 68886, 'MS': 46923,
    'MT': 145546, 'NC': 53819, 'ND': 68976, 'NE': 76824, 'NH': 8953,
    'NJ': 7417,   'NM': 121365,'NV': 109781,'NY': 47214, 'OH': 40861,
    'OK': 68667,  'OR': 95988, 'PA': 44817, 'RI': 1034,  'SC': 30061,
    'SD': 75811,  'TN': 41235, 'TX': 261232,'UT': 82170, 'VA': 39594,
    'VT': 9249,   'WA': 66544, 'WI': 54310, 'WV': 24038, 'WY': 97093
}

area_df = pd.DataFrame(list(state_areas.items()), columns=['STATE', 'SQ_MILES'])

# Fires per state as a two-column frame.
fire_counts = (
    fires1['STATE']
    .value_counts()
    .rename_axis('STATE')
    .reset_index(name='FIRE_COUNT')
)

# Inner join: states without an entry in state_areas are left out.
fire_stats = fire_counts.merge(area_df, on='STATE')
fire_stats['FIRES_PER_1000_SQMI'] = (
    fire_stats['FIRE_COUNT'].div(fire_stats['SQ_MILES']).mul(1000)
)

# Ten densest states.
ranked_stats = fire_stats.sort_values(by='FIRES_PER_1000_SQMI', ascending=False)
top_10_states = ranked_stats.head(10)

report_columns = ['STATE', 'FIRE_COUNT', 'SQ_MILES', 'FIRES_PER_1000_SQMI']
print(top_10_states[report_columns])

top_10_states.plot(x='STATE', y='FIRES_PER_1000_SQMI', kind='bar', figsize=(10,5), title='Top 10 States by Fires per 1000 SQ Miles')

In [None]:
# Reload the filtered dataset written earlier by fires.to_csv("filtered_fires.csv").
fires = pd.read_csv('filtered_fires.csv')

In [None]:
# Fire-size distributions for North Carolina only.
all_fires = pd.read_csv('filtered_fires.csv')

# Bind the NC subset to `nc_fires`, the name the plots read. Previously the subset
# was stored under the misspelled name `nc_firess` while `nc_fires` still held every
# state, so the "NC" plots actually showed the whole dataset.
nc_fires = all_fires[all_fires['STATE'] == 'NC'].copy()
nc_firess = nc_fires  # old misspelled name kept as an alias

plt.figure(figsize=(10,5))
nc_fires['FIRE_SIZE'].hist(bins=40)
plt.title("Distribution of Fire Sizes in NC")
plt.xlabel("Fire Size (acres)")
plt.ylabel("Frequency")
plt.show()

plt.figure(figsize=(10,5))
np.log1p(nc_fires['FIRE_SIZE']).hist(bins=40)
plt.title("Log-Transformed Fire Size Distribution in NC")
plt.xlabel("log(1 + Fire Size)")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Yearly fire counts taken from nc_fires, in chronological order.
fires_per_year = nc_fires['FIRE_YEAR'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
fires_per_year.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Wildfires in North Carolina by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Fires')
fig.tight_layout()
plt.show()

In [None]:
# Fire counts per general cause taken from nc_fires; missing causes are excluded.
cause_counts = nc_fires['NWCG_GENERAL_CAUSE'].value_counts(dropna=True)

fig, ax = plt.subplots(figsize=(8, 5))
cause_counts.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Wildfires in NC by Cause Classification')
ax.set_xlabel('Cause Classification')
ax.set_ylabel('Number of Fires')
fig.tight_layout()
plt.show()

Model Build

In [None]:
#import models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm

In [None]:
# Reload the original Drive CSV (this copy still has the columns dropped from
# filtered_fires.csv) and list its columns before modelling.
fires = pd.read_csv('/content/drive/MyDrive/6201 Project/Wildfires with Weather Data 2015 to 2025 no cloudiness.csv')

fires.columns

In [None]:
# Histograms of the weather variables used as model features.
# NOTE(review): these read from nc_fires, not the `fires` frame loaded in the
# previous cell - confirm which frame is intended.
# Each entry: (column, bins, title, x-axis label).
weather_histograms = [
    # The x-label used to be a copy-pasted "Fire Size (acres)".
    ('Precipitation_Total', 40, "Distribution of Precipitation_Total", "Total Precipitation"),
    ('Cloudiness_Average', 20, "Distribution of Cloudiness Averages", "Cloudiness Average (Whole Number Percentage)"),
    ('Temperature_Average', 40, "Distribution of Temperature Averages", "Average Temperature"),
    ('Temperature_Max', 40, "Distribution of Temperature Maximums", "Maximum Temperature"),
]

for column, bins, title, xlabel in weather_histograms:
    plt.figure(figsize=(6,2))
    nc_fires[column].hist(bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
#Label target column
# Binary target: 1 when the size class is F or G, 0 otherwise.
large_classes = ['F', 'G']
is_large = fires['FIRE_SIZE_CLASS'].isin(large_classes)
fires['LARGE_FIRE'] = is_large.astype(int)

# Parse the discovery date (unparseable values become NaT) and derive its month.
discovery_dates = pd.to_datetime(fires['DISCOVERY_DATE'], errors='coerce')
fires['DISCOVERY_DATE'] = discovery_dates
fires['DISCOVERY_MONTH'] = discovery_dates.dt.month

In [None]:
# Predictor columns (calendar/cause first, then weather) and the binary target.
target = 'LARGE_FIRE'

calendar_and_cause = ['DISCOVERY_MONTH', 'NWCG_GENERAL_CAUSE']
weather = [
    'Precipitation_Total',
    'Temperature_Average',
    'Cloudiness_Average',
    'Temperature_Max',
]
features = calendar_and_cause + weather

In [None]:
# Logistic-regression baseline for the LARGE_FIRE target.
X = fires[features]
y = fires[target]

# Preprocessing: one-hot encode the categorical columns, standardise the numeric ones.
categorical = ['NWCG_GENERAL_CAUSE', 'DISCOVERY_MONTH']
numeric = ['Precipitation_Total', 'Temperature_Average', 'Cloudiness_Average', 'Temperature_Max']

preprocessor = ColumnTransformer([
    # sparse_output=False is required because this notebook sets
    # sklearn.set_config(transform_output="pandas"): pandas output cannot hold the
    # encoder's default sparse matrix and the transform raises otherwise.
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical),
    ('num', StandardScaler(), numeric)  # scale numeric columns
])

lg_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=5000))
])

# Stratified train/test split so both parts keep the same share of large fires.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit and predict
lg_model.fit(X_train, y_train)
y_pred = lg_model.predict(X_test)

In [None]:
# Precision / recall / F1 of the most recent predictions (y_pred) on the test split.
print(classification_report(y_test, y_pred))

Imbalanced Data Issue - trying random forest below

In [None]:
# Class-weighted random forest on the same features.
rf_model = Pipeline([
    # Clone the preprocessor: Pipeline does not copy its steps, so reusing the same
    # object would make fitting this pipeline refit the step inside lg_model too.
    ('preprocessor', sklearn.clone(preprocessor)),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',  # handles class imbalance
        random_state=42
    ))
])

# Train/test split (same stratification and seed as the logistic-regression cell).
X = fires[features]
y = fires[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Train and predict
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))

In [None]:
# Feature importances of the fitted random forest.
import matplotlib.pyplot as plt

# Fitted steps of the random-forest pipeline.
fitted_steps = rf_model.named_steps
rf_modell = fitted_steps['classifier']
ohe = fitted_steps['preprocessor'].named_transformers_['cat']

# Column names in transformer order: one-hot categorical names, then numeric names.
encoded_cat_names = ohe.get_feature_names_out(categorical)
feature_names = [*encoded_cat_names, *numeric]

# Positions sorted by descending importance; the stable sort keeps ties in column order.
importances = rf_modell.feature_importances_
indices = np.argsort(-importances, kind='stable').tolist()

# Bar chart of the ten most important features.
top_indices = indices[:10]
top_names = [feature_names[i] for i in top_indices]
top_values = [importances[i] for i in top_indices]

plt.figure(figsize=(10, 6))
plt.bar(top_names, top_values)
plt.xticks(rotation=45, ha='right')
plt.title("Top 10 Feature Importances")
plt.tight_layout()
plt.show()

In [None]:
# Records whose size class is F or G.
is_large_class = fires['FIRE_SIZE_CLASS'].isin(['F', 'G'])
large_fires = fires[is_large_class]

# Number of large fires at each exact (latitude, longitude) pair.
top_large_locations = (
    large_fires
    .groupby(['LATITUDE', 'LONGITUDE'])
    .size()
    .reset_index(name='count')
)
ranked_locations = top_large_locations.sort_values(by='count', ascending=False)
top_10_large_locations = ranked_locations.head(10)

print(top_10_large_locations)

In [None]:
# Mean size of large fires at each exact (latitude, longitude) pair, largest first.
size_by_location = large_fires.groupby(['LATITUDE', 'LONGITUDE'])['FIRE_SIZE']
top_avg_size = size_by_location.mean().reset_index(name='avg_fire_size')

ranked_avg_size = top_avg_size.sort_values(by='avg_fire_size', ascending=False)
top_10_avg = ranked_avg_size.head(10)

print(top_10_avg)

Now fitting an ordinary least squares (OLS) regression to assess the largest determinants of fire size for any fire

In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# OLS regression of FIRE_SIZE on the weather variables.
predictors = ['Precipitation_Total', 'Temperature_Average', 'Cloudiness_Average', 'Temperature_Max']

# Columns to use (target first).
cols = ['FIRE_SIZE'] + predictors

# Coerce every column to numeric first and drop incomplete rows once. Previously rows
# were dropped before coercion, so a non-numeric FIRE_SIZE turned into NaN afterwards
# and stayed in y.
df_clean = fires[cols].apply(pd.to_numeric, errors='coerce').dropna()

# Design matrix with an intercept column, and the target, both as float.
X = sm.add_constant(df_clean[predictors]).astype(float)
y = df_clean['FIRE_SIZE'].astype(float)

# Fit model
model = sm.OLS(y, X)
results = model.fit()

# View results
print(results.summary())

In [None]:
# Build a binary-encoded feature matrix for predicting FIRE_SIZE.
df = fires.copy()

from sklearn.model_selection import train_test_split

# Identifier, location and date columns that are not used as predictors.
non_feature_columns = [
    'FIPS',
    'FOD_ID',
    'DISCOVERY_DOY',
    'DISCOVERY_TIME',
    'DISCOVERY_DATE',
    'LATITUDE',
    'LONGITUDE',
    'FIPS_CODE',
    'ID',
    'Filtered_fires_County',
    'CONT_DATE',
    'STATE',
    'NOAA_Climate_Data_combined_County',
    'Date',
]

# Keep the target out of the features. FIRE_SIZE is y itself and LARGE_FIRE is computed
# from FIRE_SIZE_CLASS earlier in this notebook; FIRE_SIZE_CLASS is presumably a binning
# of FIRE_SIZE (TODO confirm). Leaving any of them in X would leak the answer.
target_columns = [col for col in ('FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LARGE_FIRE') if col in df.columns]

X = df.drop(columns=non_feature_columns + target_columns)

y = df['FIRE_SIZE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Binary-encode the categorical cause columns; the encoder is fit on the training
# split only and then applied to the test split.
import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['NWCG_CAUSE_CLASSIFICATION', 'NWCG_GENERAL_CAUSE'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)

# Logistic regression model (instantiated only).
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear', random_state=0)

# NOTE(review): the fit stays disabled. y is the continuous FIRE_SIZE, and a classifier
# needs a discrete target such as LARGE_FIRE - confirm the intended target first.
# logreg.fit(X_train, y_train)

X_train.head()

In [None]:
import os
from getpass import getpass

import requests

# List the datatypes offered by the NOAA CDO web API.
# The token is read from the NOAA_CDO_TOKEN environment variable, or prompted for, so
# the credential is never stored in the notebook. The token that used to be hard-coded
# here should be treated as exposed and regenerated.
noaa_token = os.environ.get('NOAA_CDO_TOKEN') or getpass('NOAA CDO API token: ')

my_headers = {'token': noaa_token}
response = requests.get(
    'https://www.ncdc.noaa.gov/cdo-web/api/v2/datatypes',
    headers=my_headers,
    timeout=30,  # do not hang the kernel on an unresponsive server
)
response.raise_for_status()  # fail loudly on HTTP errors instead of printing an error body
print(response.json())