In [37]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [5]:
def get_fire_data():
    fire_count = pd.read_json('./datasets/fire_counties.json')
    fire_county = pd.read_json('./datasets/county_names.json')
    fires_by_county = fire_county.merge(fire_count, left_index=True, right_index=True).rename(columns={0:'County'}).set_index('County')
    return fires_by_county

ok so the map w/ the dots persisting is important bc it represents california being lit on fire
and the takeaway from the model is if the year has a positive coefficient then we can tell story of fires being more common over time

In [117]:
df = get_fire_data().T.reset_index().rename(columns={'index':'date'})

df = df.assign(year=df['date'].apply(lambda x : x.year)).assign(month=df['date'].apply(lambda x : x.month)).assign(day=df['date'].apply(lambda x : x.day))
by_county = list(df.columns)[1:59]


county_dict = {}
for i in range(len(by_county)):
    county_dict[by_county[i]] = i

prepped_data = ['county,year,month,day,fire\n']

for county in by_county:
    for i in range(df.shape[0]):
        prepped_data.append(str(county_dict[county]) + ',' + str(df['year'].iloc[i]) + ',' + str(df['month'].iloc[i]) + ',' + str(df['day'].iloc[i]) + ',' + str(df[county].iloc[i] > 0) + '\n')

In [118]:
with open('fire_data.csv', 'w') as f:
    for data in prepped_data:
        f.write(data)

In [119]:
fire_data_prepped = pd.read_csv('./datasets/fire_data.csv')
# fire_data_prepped['county'] = fire_data_prepped['county'].apply(lambda x : county_dict[x])

In [120]:
fire_data_prepped.shape[0]

231826

In [152]:
Xtrain = fire_data_prepped[['county', 'year', 'month', 'day']].iloc[0:200000]
ytrain = fire_data_prepped[['fire']].iloc[0:200000]

Xtest = fire_data_prepped[['county', 'year', 'month', 'day']].iloc[200000:]
ytest = fire_data_prepped[['fire']].iloc[200000:]

log_reg = sm.Logit(ytrain, Xtrain).fit()

yhat = log_reg.predict(Xtest)
prediction = list(map(lambda x : x > .018, yhat))

ytest = np.asarray(ytest)
ytest = list(np.ravel(ytest))
acc = 0
TP, FP, TN, FN = 0, 0, 0, 0
for i in range(len(ytest)):
    # print(ytest[i], prediction[i])
    if ytest[i] == True and prediction[i] == True:
        TP += 1
    elif ytest[i] == False and prediction[i] == True:
        FP += 1
    elif ytest[i] == False and prediction[i] == False:
        TN += 1
    elif ytest[i] == True and prediction[i] == False:
        FN += 1
        


acc = (TP + TN) / (TP + FP + TN + FN)
acc, TP, FP, TN, FN

Optimization terminated successfully.
         Current function value: 0.081698
         Iterations 8


(0.6652736756111356, 156, 10406, 21017, 247)

In [79]:
log_reg.summary()

0,1,2,3
Dep. Variable:,fire,No. Observations:,200000.0
Model:,Logit,Df Residuals:,199996.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 01 Dec 2025",Pseudo R-squ.:,0.02221
Time:,11:52:23,Log-Likelihood:,-16340.0
converged:,True,LL-Null:,-16711.0
Covariance Type:,nonrobust,LLR p-value:,1.517e-160

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
county,-0.0036,0.001,-2.951,0.003,-0.006,-0.001
year,-0.0025,3.17e-05,-78.547,0.000,-0.003,-0.002
month,0.1441,0.006,26.180,0.000,0.133,0.155
day,-0.0018,0.002,-0.922,0.356,-0.006,0.002


In [None]:
df_pred = pd.read_csv('./datasets/wildfires_ca.csv')
df_county_lines = pd.read_csv('./datasets/county_bounds.csv')
set(df_pred['acq_date'].apply(lambda x : x.split('-')[0]))

In [None]:
def predict_coords(lat, lon):
    #read in csv file
    df_pred = pd.read_csv('./datasets/wildfires_ca.csv')
    
    #round all lats and lons to make prediction easy
    df_pred['latitude'] = df_pred['latitude'].apply(lambda x : round(x))
    df_pred['longitude'] = df_pred['longitude'].apply(lambda x : round(x))
    
    #groupby lat and lon, then divide by total number of fires overall. whatever lat and lon were passed will predict based on the proportion of fires at that lat and lon
    fires_prop_pred = df_pred[['latitude', 'longitude', 'brightness']].groupby(['latitude', 'longitude']).count().apply(lambda x : x / df_pred.shape[0])
    
    #setting up for values greater or less then the ones at that specific lat/lon index
    indices = fires_prop_pred.index
    lat_indices = set([x[0] for x in indices])
    
    #if lat/lon is less than/greater than min/max value in index, then set to that value respectively
    if lat < min(lat_indices):
        lat = min(lat_indices)
    elif lat > max(lat_indices):
        lat = max(lat_indices)
    
    lon_indices = fires_prop_pred.loc[lat].index
    if lon < min(lon_indices):
        lon = min(lon_indices)
    elif lon > max(lon_indices):
        lon = max(lon_indices)
    
    #actually doing the prediction
    pred = fires_prop_pred.loc[lat].loc[lon].iloc[0]
    pred = float(pred) * 100
    
    return str(round(pred, 2)) + '% chance of fire at ' + str(lat) + ' and ' + str(lon)

    
    

In [None]:
predict_coords(36, -119)

In [None]:
def predict_county(county, format='float'):
    fire_count = pd.read_json('./datasets/fire_counties.json')
    fire_county = pd.read_json('./datasets/county_names.json')
    fires_by_county = fire_county.merge(fire_count, left_index=True, right_index=True).rename(columns={0:'County'}).set_index('County')
    fires_total_by_county = fires_by_county.T.sum()
    total_fires = int(fires_by_county.T.sum().sum())
    if county in fires_total_by_county.index:
        pred = round(float(pd.DataFrame(fires_total_by_county).rename(columns={0:'Count'}).apply(lambda x : (x / total_fires)).loc[county].iloc[0]), 4)
        if format == 'str':
            return f'{pred}% chance of fire in {county}'
        elif format == 'float':
            return pred
    else:
        return f'{county} is not in the county list.'
    

In [None]:
predict_county('Los Angeles'), predict_county('New Mexico')