In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [4]:
def get_fire_data():
    fire_count = pd.read_json('../datasets/fire_counties.json')
    fire_county = pd.read_json('../datasets/county_names.json')
    fires_by_county = fire_county.merge(fire_count, left_index=True, right_index=True).rename(columns={0:'County'}).set_index('County')
    return fires_by_county

ok so the map w/ the dots persisting is important bc it represents california being lit on fire
and the takeaway from the model is if the year has a positive coefficient then we can tell story of fires being more common over time

In [5]:
df = get_fire_data().T.reset_index().rename(columns={'index':'date'})

df = df.assign(year=df['date'].apply(lambda x : x.year)).assign(month=df['date'].apply(lambda x : x.month)).assign(day=df['date'].apply(lambda x : x.day))
by_county = list(df.columns)[1:59]


county_dict = {}
for i in range(len(by_county)):
    county_dict[by_county[i]] = i

prepped_data = ['county,year,month,day,fire\n']

for county in by_county:
    for i in range(df.shape[0]):
        prepped_data.append(str(county_dict[county]) + ',' + str(df['year'].iloc[i]) + ',' + str(df['month'].iloc[i]) + ',' + str(df['day'].iloc[i]) + ',' + str(df[county].iloc[i] > 0) + '\n')

In [6]:
with open('fire_data.csv', 'w') as f:
    for data in prepped_data:
        f.write(data)

In [44]:
df

County,date,Alameda,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,Del Norte,Tuolumne,...,Plumas,Riverside,Sacramento,San Benito,San Bernardino,San Joaquin,San Luis Obispo,year,month,day
0,2014-01-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2014,1,3
1,2014-01-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2014,1,4
2,2014-01-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2014,1,5
3,2014-01-06,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2014,1,6
4,2014-01-07,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2014,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3992,2024-12-08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024,12,8
3993,2024-12-09,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024,12,9
3994,2024-12-10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024,12,10
3995,2024-12-11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024,12,11


In [8]:
fire_data_prepped = pd.read_csv('../datasets/fire_data.csv')
# fire_data_prepped['county'] = fire_data_prepped['county'].apply(lambda x : county_dict[x])

In [9]:
fire_data_prepped.shape[0]

231826

In [10]:
Xtrain = fire_data_prepped[['county', 'year', 'month', 'day']].iloc[0:200000]
ytrain = fire_data_prepped[['fire']].iloc[0:200000]

Xtest = fire_data_prepped[['county', 'year', 'month', 'day']].iloc[200000:]
ytest = fire_data_prepped[['fire']].iloc[200000:]

log_reg = sm.Logit(ytrain, Xtrain).fit()

yhat = log_reg.predict(Xtest)
prediction = list(map(lambda x : x > .018, yhat))

ytest = np.asarray(ytest)
ytest = list(np.ravel(ytest))
acc = 0
TP, FP, TN, FN = 0, 0, 0, 0
for i in range(len(ytest)):
    # print(ytest[i], prediction[i])
    if ytest[i] == True and prediction[i] == True:
        TP += 1
    elif ytest[i] == False and prediction[i] == True:
        FP += 1
    elif ytest[i] == False and prediction[i] == False:
        TN += 1
    elif ytest[i] == True and prediction[i] == False:
        FN += 1
        


acc = (TP + TN) / (TP + FP + TN + FN)
acc, TP, FP, TN, FN

Optimization terminated successfully.
         Current function value: 0.081698
         Iterations 8


(0.6652736756111356, 156, 10406, 21017, 247)

In [11]:
log_reg.summary()

0,1,2,3
Dep. Variable:,fire,No. Observations:,200000.0
Model:,Logit,Df Residuals:,199996.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 01 Dec 2025",Pseudo R-squ.:,0.02221
Time:,19:46:49,Log-Likelihood:,-16340.0
converged:,True,LL-Null:,-16711.0
Covariance Type:,nonrobust,LLR p-value:,1.517e-160

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
county,-0.0036,0.001,-2.951,0.003,-0.006,-0.001
year,-0.0025,3.17e-05,-78.547,0.000,-0.003,-0.002
month,0.1441,0.006,26.180,0.000,0.133,0.155
day,-0.0018,0.002,-0.922,0.356,-0.006,0.002


In [24]:
log_reg.predict(pd.DataFrame({'county':[15], 'year':[2040], 'month': [8], 'day':[7]})) > .018

0    True
dtype: bool

In [25]:
county_dict

{'Alameda': 0,
 'Alpine': 1,
 'Amador': 2,
 'Butte': 3,
 'Calaveras': 4,
 'Colusa': 5,
 'Contra Costa': 6,
 'Del Norte': 7,
 'Tuolumne': 8,
 'El Dorado': 9,
 'Fresno': 10,
 'Glenn': 11,
 'Humboldt': 12,
 'Inyo': 13,
 'Lake': 14,
 'San Diego': 15,
 'San Francisco': 16,
 'Sierra': 17,
 'Siskiyou': 18,
 'Solano': 19,
 'Sonoma': 20,
 'Los Angeles': 21,
 'Madera': 22,
 'Marin': 23,
 'Mariposa': 24,
 'Mendocino': 25,
 'Merced': 26,
 'Modoc': 27,
 'Tulare': 28,
 'Imperial': 29,
 'Santa Cruz': 30,
 'Shasta': 31,
 'Stanislaus': 32,
 'Sutter': 33,
 'Tehama': 34,
 'Trinity': 35,
 'Kern': 36,
 'San Mateo': 37,
 'Santa Barbara': 38,
 'Santa Clara': 39,
 'Ventura': 40,
 'Yolo': 41,
 'Yuba': 42,
 'Lassen': 43,
 'Kings': 44,
 'Mono': 45,
 'Monterey': 46,
 'Napa': 47,
 'Nevada': 48,
 'Orange': 49,
 'Placer': 50,
 'Plumas': 51,
 'Riverside': 52,
 'Sacramento': 53,
 'San Benito': 54,
 'San Bernardino': 55,
 'San Joaquin': 56,
 'San Luis Obispo': 57}