In [1]:
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder
import featuretools as ft
import numpy as np

## Monthly wildfire frequency by state ##

In [2]:
# Load the filtered ETL export and derive the calendar month (1-12) of each
# record from its `datetime` column.
df = pd.read_csv('./DATA/etl_filtered_v1.csv')
record_dates = pd.to_datetime(df['datetime'])
df['month'] = record_dates.dt.month
df

Unnamed: 0.1,Unnamed: 0,FIRE_YEAR,DISCOVERY_DOY,STAT_CAUSE_DESCR,STAT_CAUSE_CODE,CONT_DATE,DISCOVERY_DATE,CONT_DOY,DISCOVERY_TIME,CONT_TIME,...,STATE,datetime,population_density,ELEVATION,AWND,PRCP,TMAX,TMIN,distance,month
0,0,2011,47,Debris Burning,5.0,2455608.5,2455608.5,47.0,1300.0,1530.0,...,AL,2011-02-16,7.572142,100.6,8.062714,0.0,71.000000,32.000000,0.141421,2
1,1,2011,47,Debris Burning,5.0,2455610.5,2455608.5,49.0,820.0,1500.0,...,AL,2011-02-16,31.442815,201.5,8.048481,0.0,76.942800,49.340830,0.100000,2
2,2,2011,48,Arson,7.0,2455609.5,2455609.5,48.0,2030.0,2300.0,...,AL,2011-02-17,7.572142,100.6,8.064342,0.0,70.000000,33.000000,0.200000,2
3,3,2011,50,Arson,7.0,2455617.5,2455611.5,56.0,1800.0,1500.0,...,AL,2011-02-19,49.942890,182.9,8.037152,0.0,66.000000,49.000000,0.282843,2
4,4,2011,51,Debris Burning,5.0,2455625.5,2455612.5,64.0,1500.0,1900.0,...,AL,2011-02-20,49.942890,189.0,8.042611,0.0,76.657125,49.084371,0.100000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367739,367739,2015,75,Miscellaneous,9.0,,2457097.5,,,,...,VA,2015-03-16,33.568885,283.5,7.366097,0.0,61.000000,39.000000,0.100000,3
367740,367740,2015,95,Miscellaneous,9.0,,2457117.5,,,,...,VA,2015-04-05,33.568885,283.5,7.358285,0.0,57.000000,39.000000,0.100000,4
367741,367741,2015,37,Miscellaneous,9.0,,2457059.5,,,,...,VA,2015-02-06,38.354455,247.8,7.406834,0.0,70.218508,43.916184,0.100000,2
367742,367742,2015,89,Miscellaneous,9.0,,2457111.5,,,,...,VA,2015-03-30,38.354455,208.2,7.384267,0.0,69.862021,43.603630,0.141421,3


In [3]:
# Write one CSV per fire year holding the number of records per (STATE, month).
# The output folder is prepared once, before the loop; makedirs also creates a
# missing parent folder and does not fail when the folder already exists.
out_dir = './DATA/monthly_by_year/'
os.makedirs(out_dir, exist_ok=True)

year = list(range(2011,2016))
for i in year:
    # rows of the current year only
    temp_df = df[df['FIRE_YEAR']==i]
    # group size per (STATE, month), flattened into a 'counts' column
    temp_df = temp_df.groupby(['STATE', 'month']).size().reset_index(name='counts')
    path_name = out_dir + str(i) + '.csv'
    temp_df.to_csv(path_name)

In [4]:
# List the column labels of `df` (the loaded CSV columns plus the derived `month`).
df.columns

Index(['Unnamed: 0', 'FIRE_YEAR', 'DISCOVERY_DOY', 'STAT_CAUSE_DESCR',
       'STAT_CAUSE_CODE', 'CONT_DATE', 'DISCOVERY_DATE', 'CONT_DOY',
       'DISCOVERY_TIME', 'CONT_TIME', 'FIRE_SIZE', 'FIRE_SIZE_CLASS',
       'LATITUDE_x', 'LONGITUDE_x', 'OWNER_CODE', 'OWNER_DESCR', 'STATE',
       'datetime', 'population_density', 'ELEVATION', 'AWND', 'PRCP', 'TMAX',
       'TMIN', 'distance', 'month'],
      dtype='object')

In [5]:
# Mean population density per state, computed only from rows whose density is >= 0.
non_negative_density = df.loc[df['population_density'] >= 0]
mean_pd = non_negative_density.groupby('STATE')['population_density'].mean().reset_index()

# Per (month, STATE): number of records plus the mean of TMAX, TMIN and AWND.
monthly_aggregations = {'month' : ['count'], 'TMAX' : 'mean', 'TMIN' : 'mean', 'AWND' : 'mean'}
ml_df = df[['month', 'STATE', 'TMAX', 'TMIN', 'AWND']]
ml_df = ml_df.groupby(['month', 'STATE']).agg(monthly_aggregations).reset_index()

# Flatten the two-level column labels into '<column>_<aggregation>'; the group
# keys have an empty second level and therefore become 'month_' and 'STATE_'.
ml_df.columns = ['_'.join(label_parts).strip() for label_parts in ml_df.columns.values]

# Attach the per-state density, then append one indicator column per state.
X = pd.merge(ml_df, mean_pd, how='inner', left_on='STATE_', right_on='STATE')
encoding = pd.get_dummies(X['STATE'])
X = X.join(encoding)
X

Unnamed: 0,month_,STATE_,month_count,TMAX_mean,TMIN_mean,AWND_mean,STATE,population_density,AK,AL,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,1,AK,2,26.500000,5.500000,10.850000,AK,3.536666,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,AK,3,21.666667,4.666667,14.121259,AK,3.536666,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,AK,19,37.421053,17.052632,8.016115,AK,3.536666,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,AK,145,51.228985,28.280824,8.344330,AK,3.536666,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,AK,771,65.722132,38.080121,8.136232,AK,3.536666,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,7,DE,7,90.797158,69.309613,5.922331,DE,90.880529,0,0,...,0,0,0,0,0,0,0,0,0,0
599,8,DE,2,77.024819,56.101038,7.442432,DE,90.880529,0,0,...,0,0,0,0,0,0,0,0,0,0
600,9,DE,5,77.768067,56.033829,6.307507,DE,90.880529,0,0,...,0,0,0,0,0,0,0,0,0,0
601,10,DE,1,66.000000,60.000000,7.664107,DE,90.880529,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# List the column labels of the assembled feature table `X`.
X.columns

Index(['month_', 'STATE_', 'month_count', 'TMAX_mean', 'TMIN_mean',
       'AWND_mean', 'STATE', 'population_density', 'AK', 'AL', 'AR', 'AZ',
       'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN',
       'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC',
       'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI',
       'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'],
      dtype='object')

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn import linear_model
# Target: the per-(month, state) record count.
y = X['month_count']
# Features: everything except the target and the two state label columns
# (the per-state indicator columns stay in X).
# NOTE: this rebinds X, so the cell cannot be run twice without rebuilding X first.
X = X.drop(columns=['STATE_', 'month_count', 'STATE'])

In [8]:
# Display the feature matrix `X` after the target and state label columns were dropped.
X

Unnamed: 0,month_,TMAX_mean,TMIN_mean,AWND_mean,population_density,AK,AL,AR,AZ,CA,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,1,26.500000,5.500000,10.850000,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,21.666667,4.666667,14.121259,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,37.421053,17.052632,8.016115,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,51.228985,28.280824,8.344330,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,65.722132,38.080121,8.136232,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,7,90.797158,69.309613,5.922331,90.880529,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599,8,77.024819,56.101038,7.442432,90.880529,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
600,9,77.768067,56.033829,6.307507,90.880529,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
601,10,66.000000,60.000000,7.664107,90.880529,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Display the target `y` (the `month_count` column taken from X).
y

0        2
1        3
2       19
3      145
4      771
      ... 
598      7
599      2
600      5
601      1
602      1
Name: month_count, Length: 603, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
import sklearn
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Estimators to compare, fitted and scored in this order. Ridge is kept last on
# purpose: `model` stays bound to the last estimator fitted here, and the 2022
# frequency prediction later in the notebook calls `model.predict`.
candidate_models = [
    ('BayesianRidge', BayesianRidge()),
    ('LinearRegression', LinearRegression()),
    ('RidgeRegression', linear_model.Ridge(alpha=0.5)),
]

for model_name, estimator in candidate_models:
    model = estimator.fit(X_train, y_train)
    print(f'model: {model_name}')

    # R2 on the rows the estimator was fitted on
    train_predictions = model.predict(X_train)
    print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))

    # R2 on the held-out rows (previously printed under the "Train" label)
    test_predictions = model.predict(X_test)
    print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

model: BayesianRidge
Train R2 score: 0.6632515594906144
Train R2 score: 0.5630778877382656
model: LinearRegression
Train R2 score: 0.6666352438691171
Train R2 score: 0.5682813992460143
model: RidgeRegression
Train R2 score: 0.6654998898590743
Train R2 score: 0.5665862724532057


In [11]:
#predict
temp = df[['FIRE_YEAR', 'month', 'STATE', 'AWND', 'TMAX', 'TMIN']]
AWND_predict = temp.groupby(['FIRE_YEAR', 'month', 'STATE']).agg({'AWND' : 'mean'}).reset_index()
encoding = pd.get_dummies(AWND_predict.STATE)
AWND_predict = AWND_predict.join(encoding)
y = AWND_predict['AWND']
x = AWND_predict.drop(['STATE', 'AWND'], axis=1)
AWND_model = LinearRegression().fit(x, y)

In [12]:
# Mean TMAX for every (FIRE_YEAR, month, STATE) group of `temp`.
tmax_group_keys = ['FIRE_YEAR', 'month', 'STATE']
TMAX_predict = temp.groupby(tmax_group_keys)['TMAX'].mean().reset_index()

# Append one indicator column per state.
encoding = pd.get_dummies(TMAX_predict['STATE'])
TMAX_predict = TMAX_predict.join(encoding)

# Regress mean TMAX on FIRE_YEAR, month and the state indicators.
y = TMAX_predict['TMAX']
x = TMAX_predict.drop(columns=['STATE', 'TMAX'])
TMAX_model = LinearRegression()
TMAX_model.fit(x, y)

In [13]:
# Mean TMIN for every (FIRE_YEAR, month, STATE) group of `temp`.
tmin_group_keys = ['FIRE_YEAR', 'month', 'STATE']
TMIN_predict = temp.groupby(tmin_group_keys)['TMIN'].mean().reset_index()

# Append one indicator column per state.
encoding = pd.get_dummies(TMIN_predict['STATE'])
TMIN_predict = TMIN_predict.join(encoding)

# Regress mean TMIN on FIRE_YEAR, month and the state indicators.
y = TMIN_predict['TMIN']
x = TMIN_predict.drop(columns=['STATE', 'TMIN'])
TMIN_model = LinearRegression()
TMIN_model.fit(x, y)

In [14]:
# Display `x`; when cells run in order this is the feature frame built for the
# TMIN model (FIRE_YEAR, month and the state indicator columns).
x

Unnamed: 0,FIRE_YEAR,month,AK,AL,AR,AZ,CA,CO,CT,DC,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,2011,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2011,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2011,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2011,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2681,2015,12,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2682,2015,12,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2683,2015,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2684,2015,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [15]:
# Display `temp`, the FIRE_YEAR / month / STATE / AWND / TMAX / TMIN slice of df.
temp

Unnamed: 0,FIRE_YEAR,month,STATE,AWND,TMAX,TMIN
0,2011,2,AL,8.062714,71.000000,32.000000
1,2011,2,AL,8.048481,76.942800,49.340830
2,2011,2,AL,8.064342,70.000000,33.000000
3,2011,2,AL,8.037152,66.000000,49.000000
4,2011,2,AL,8.042611,76.657125,49.084371
...,...,...,...,...,...,...
367739,2015,3,VA,7.366097,61.000000,39.000000
367740,2015,4,VA,7.358285,57.000000,39.000000
367741,2015,2,VA,7.406834,70.218508,43.916184
367742,2015,3,VA,7.384267,69.862021,43.603630


In [16]:
# Distinct state codes found in df, in alphabetical order.
states = sorted(set(df['STATE'].to_list()))

# Prediction horizon: every month of 2022.
year = [2022]
months = [month_number for month_number in range(1, 13)]

# One single-column frame per dimension ...
year_df = pd.DataFrame({'FIRE_YEAR': year})
month_df = pd.DataFrame({'month_': months})
State_df = pd.DataFrame({'STATE': states})

# ... cross-joined into one row per (year, month, state) combination.
raw = year_df.merge(month_df, how='cross').merge(State_df, how='cross')

# Replace the state label with one indicator column per state.
encoding = pd.get_dummies(raw['STATE'])
input_df = raw.join(encoding).drop(columns=['STATE'])
input_df

Unnamed: 0,FIRE_YEAR,month_,AK,AL,AR,AZ,CA,CO,CT,DC,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,2022,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2022,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2022,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2022,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2022,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,2022,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
608,2022,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
609,2022,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
610,2022,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
# Display `raw`, the FIRE_YEAR x month_ x STATE grid that still carries the state label.
raw

Unnamed: 0,FIRE_YEAR,month_,STATE
0,2022,1,AK
1,2022,1,AL
2,2022,1,AR
3,2022,1,AZ
4,2022,1,CA
...,...,...,...
607,2022,12,VT
608,2022,12,WA
609,2022,12,WI
610,2022,12,WV


In [None]:
# The three weather models were fitted on a frame whose month column is named
# 'month', while input_df names it 'month_'. scikit-learn validates feature
# names at predict time when it was fitted on a DataFrame, so the column is
# renamed on a copy first; input_df itself is left unchanged. Column order
# (FIRE_YEAR, month, state indicators) already matches the fitting frames.
weather_input = input_df.rename(columns={'month_': 'month'})

# Predicted mean AWND / TMIN / TMAX for each row of the 2022 grid.
AWND_2022 = AWND_model.predict(weather_input)
TMIN_2022 = TMIN_model.predict(weather_input)
TMAX_2022 = TMAX_model.predict(weather_input)

In [19]:
# Assemble the 2022 feature frame with the columns:
# month_, TMAX_mean, TMIN_mean, AWND_mean, population_density, <state indicators>
#
# NOTE: run this cell once, after `raw` has been freshly built. It rebinds
# `raw` to the merged frame, so a second run would merge population_density again.

# The predictions are positional arrays in the row order of the un-merged `raw`,
# so they must be attached before the merge below.
raw['TMAX_mean'] = TMAX_2022
raw['TMIN_mean'] = TMIN_2022
raw['AWND_mean'] = AWND_2022

# Mean population density per state, computed only from rows whose density is >= 0.
mean_pd = df[df['population_density']>=0].groupby('STATE')['population_density'].mean().reset_index()
raw = raw.merge(mean_pd, how='inner', on='STATE')

# Append one indicator column per state, then drop the columns that are not
# model features. `raw` keeps STATE and FIRE_YEAR for the final export.
input_2022 = raw.join(pd.get_dummies(raw.STATE))
input_2022 = input_2022.drop(['STATE', 'FIRE_YEAR'], axis=1)
input_2022

Unnamed: 0,month_,TMAX_mean,TMIN_mean,AWND_mean,population_density,AK,AL,AR,AZ,CA,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,1,43.355469,23.780853,7.746746,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,43.885498,24.361893,7.724094,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,44.415527,24.942932,7.701447,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,44.945312,25.523972,7.678795,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,45.475342,26.105011,7.656147,3.536666,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,8,64.559326,39.268341,6.975739,3.667393,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
608,9,65.089355,39.849380,6.953087,3.667393,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
609,10,65.619141,40.430420,6.930439,3.667393,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
610,11,66.149170,41.011459,6.907787,3.667393,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
def _as_fire_count(value):
    """Truncate a non-negative prediction to an int; map anything else to 0."""
    if value >= 0:
        return int(value)
    return 0


# `model` is whichever estimator an earlier cell bound last (the training cell
# as written ends with Ridge(alpha=0.5)).
frequency_2022 = model.predict(input_2022)

# Store the raw predictions next to their grid rows, then turn them into
# non-negative integer counts.
raw['frequency'] = frequency_2022
raw['frequency'] = raw['frequency'].apply(_as_fire_count)

export_columns = ['FIRE_YEAR', 'month_', 'STATE', 'frequency']
raw[export_columns].to_csv('./DATA/frequency_2022.csv')