# Wildfire Size Class Prediction

In [15]:
import pandas as pd
import numpy as np
import pandas as pd
import datetime as dt
import sqlite3
import julian

## Load in data

In [10]:
# Select only pertinent features (this should make loading in data faster)
con = sqlite3.connect("wildfire_data.sqlite")
query="""
SELECT 
FIRE_NAME,
FIRE_SIZE_CLASS,
STAT_CAUSE_DESCR, STAT_CAUSE_CODE,
STATE, COUNTY,
LONGITUDE, LATITUDE,
DISCOVERY_DATE, 
DISCOVERY_TIME, 
CONT_DATE, 
CONT_TIME 
from Fires
"""
query=query.strip()
df = pd.read_sql_query(query, con)
con.close();

In [11]:
print(df.shape)
df.head()

(1880465, 12)


Unnamed: 0,FIRE_NAME,FIRE_SIZE_CLASS,STAT_CAUSE_DESCR,STAT_CAUSE_CODE,STATE,COUNTY,LONGITUDE,LATITUDE,DISCOVERY_DATE,DISCOVERY_TIME,CONT_DATE,CONT_TIME
0,FOUNTAIN,A,Miscellaneous,9.0,CA,63,-121.005833,40.036944,2453403.5,1300,2453403.5,1730
1,PIGEON,A,Lightning,1.0,CA,61,-120.404444,38.933056,2453137.5,845,2453137.5,1530
2,SLACK,A,Debris Burning,5.0,CA,17,-120.735556,38.984167,2453156.5,1921,2453156.5,2024
3,DEER,A,Lightning,1.0,CA,3,-119.913333,38.559167,2453184.5,1600,2453189.5,1400
4,STEVENOT,A,Lightning,1.0,CA,3,-119.933056,38.559167,2453184.5,1600,2453189.5,1200


## Drop rows with missing data
Given our data is not sparse at all, we have the freedom to just drop all rows that are missing data we care about.

In [13]:
# drop the following rows if they have missing data for the following features
needed_cols = ['FIRE_SIZE_CLASS', 'DISCOVERY_DATE', 'DISCOVERY_TIME', 'CONT_DATE', 'CONT_TIME', 'STAT_CAUSE_CODE', 'STATE', 'LONGITUDE', 'LATITUDE']
df = df.dropna(subset=needed_cols) # remove rows where both of these are missing
df.shape

(892007, 12)

## Create new columns to work with

Map fire size class to integers so they can be ordered.

In [56]:
di = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
df['FIRE_SIZE_CLASS'] = df['FIRE_SIZE_CLASS'].map(di)

Map states to indices to help handle categorical

In [96]:
states = df.STATE.unique()
ind = states.argsort(axis=0)
state_di = {states[i]: i for i in ind}
df['STATE_CODE']=df['STATE'].map(state_di)

Convert the date/time columns to datetime objects. Originally they are in julian time. Also calculate the time to containment (time delta of containment date - discovery date)

In [16]:
#To make these dates and times easier to manage, let's convert them to datetime. We can add new columns DISCOVERY_DATETIME and CONTAINMENT_DATETIME.
df['DISCOVERY_DATETIME'] = df['DISCOVERY_DATE'];
df['CONT_DATETIME'] = df['CONT_DATE'];

In [17]:
#To populate those two rows, let's convert them into datetime.
df['DISCOVERY_DATETIME'] = df['DISCOVERY_DATETIME'].apply(lambda x: julian.from_jd(x, fmt="jd"))
df['CONT_DATETIME'] = df['CONT_DATETIME'].apply(lambda x:julian.from_jd(x, fmt="jd"))


In [18]:
#Let's also add the time 
temp_df = pd.DataFrame();
temp_df['dt'] = df['DISCOVERY_TIME'].apply(lambda x: dt.timedelta(hours=int(x[0:2]), minutes=int(x[2:5])))
df['DISCOVERY_DATETIME'] = df['DISCOVERY_DATETIME'] + temp_df['dt']
df['DISCOVERY_DATETIME'].head()

0   2005-02-02 13:00:00
1   2004-05-12 08:45:00
2   2004-05-31 19:21:00
3   2004-06-28 16:00:00
4   2004-06-28 16:00:00
Name: DISCOVERY_DATETIME, dtype: datetime64[ns]

In [19]:
#Do the same thing for CONT_DATETIME
temp_df = pd.DataFrame();
temp_df['dt'] = df['CONT_TIME'].apply(lambda x: dt.timedelta(hours=int(x[0:2]), minutes=int(x[2:5])))
df['CONT_DATETIME'] = df['CONT_DATETIME'] + temp_df['dt']
df['CONT_DATETIME'].head()

0   2005-02-02 17:30:00
1   2004-05-12 15:30:00
2   2004-05-31 20:24:00
3   2004-07-03 14:00:00
4   2004-07-03 12:00:00
Name: CONT_DATETIME, dtype: datetime64[ns]

In [20]:
df.to_csv("data_with_target.csv")
#Checkpoint to save the data with the above columns before reloading

In [5]:
df = pd.read_csv("data_with_target.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [21]:
dict(zip(df.STAT_CAUSE_CODE,df.STAT_CAUSE_DESCR))

{9.0: 'Miscellaneous',
 1.0: 'Lightning',
 5.0: 'Debris Burning',
 4.0: 'Campfire',
 2.0: 'Equipment Use',
 7.0: 'Arson',
 8.0: 'Children',
 6.0: 'Railroad',
 3.0: 'Smoking',
 11.0: 'Powerline',
 12.0: 'Structure',
 10.0: 'Fireworks',
 13.0: 'Missing/Undefined'}

In [25]:
df['TIME_TO_CONT'] = df['CONT_DATETIME'] - df['DISCOVERY_DATETIME']

df['TIME_TO_CONT'].describe()

count                     892007
mean      1 days 06:44:52.524004
std      13 days 19:33:02.720135
min              0 days 00:00:00
25%              0 days 00:30:00
50%              0 days 01:28:00
75%              0 days 04:45:00
max           3653 days 01:30:00
Name: TIME_TO_CONT, dtype: object

In [40]:
# convert to hours for easier sorting
df['HOURS_TO_CONT'] = df['TIME_TO_CONT'].apply(lambda x: round(float(x.total_seconds()/60/60)),2)
df.HOURS_TO_CONT.describe()

count    892007.000000
mean         30.728347
std         331.551520
min           0.000000
25%           0.000000
50%           1.000000
75%           5.000000
max       87674.000000
Name: HOURS_TO_CONT, dtype: float64

## Investigate the outliers in HOURS_TO_CONT 

75% is only 5 hours to cont, but max is abnormally high. 
Some dates seem to be incorrect (in record) -- month/day and time of day (h/m) are correct but wrong only in year.

Be careful because some differences in year are okay (Dec/Jan fires).

Since total dataset is 892007 instances, and of those, only 6000 are greater than 1000 hours to cont, these are definitely outliers and may need to be omitted from training for better accuracy/performance. We could also use a model that is insensitive to outliers.

In [71]:
df[df.HOURS_TO_CONT > 1000].HOURS_TO_CONT.describe()

count     5994.000000
mean      2289.123624
std       3257.553581
min       1001.000000
25%       1340.250000
50%       1776.000000
75%       2392.750000
max      87674.000000
Name: HOURS_TO_CONT, dtype: float64

In [61]:
df[(df.HOURS_TO_CONT > 8000) & (df.FIRE_SIZE_CLASS > 2)].sort_values(by='FIRE_SIZE_CLASS')

Unnamed: 0,FIRE_NAME,FIRE_SIZE_CLASS,STAT_CAUSE_DESCR,STAT_CAUSE_CODE,STATE,COUNTY,LONGITUDE,LATITUDE,DISCOVERY_DATE,DISCOVERY_TIME,CONT_DATE,CONT_TIME,DISCOVERY_DATETIME,CONT_DATETIME,TIME_TO_CONT,HOURS_TO_CONT
182647,ML5,3,Children,8.0,MN,,-93.7653,46.2127,2449480.5,1450,2449845.5,1600,1994-05-08 14:50:00,1995-05-08 16:00:00,365 days 01:10:00,8761
256371,TRACKS,3,Arson,7.0,SD,,-100.5407,45.67,2452773.5,2120,2453139.5,2300,2003-05-14 21:20:00,2004-05-14 23:00:00,366 days 01:40:00,8786
291193,RALSTON,3,Miscellaneous,9.0,OK,,-96.7528,36.5078,2453385.5,1215,2453750.5,1330,2005-01-15 12:15:00,2006-01-15 13:30:00,365 days 01:15:00,8761
1286206,ADON ROAD,3,Lightning,1.0,WY,Campbell,-105.27615,44.45434,2455430.5,2059,2455804.5,2059,2010-08-22 20:59:00,2011-08-31 20:59:00,374 days 00:00:00,8976
365024,2100 CENTE,3,Miscellaneous,9.0,IN,,-85.4232,38.9379,2452004.5,1300,2452379.5,1200,2001-04-05 13:00:00,2002-04-15 12:00:00,374 days 23:00:00,8999
1276621,,3,Debris Burning,5.0,PA,FAYETTE,-79.827778,40.017222,2452352.5,1007,2453448.5,1007,2002-03-19 10:07:00,2005-03-19 10:07:00,1096 days 00:00:00,26304
1063028,FOOT,3,Miscellaneous,9.0,CA,63,-121.0525,39.803889,2453557.5,1638,2453928.5,1800,2005-07-06 16:38:00,2006-07-12 18:00:00,371 days 01:22:00,8905
1234308,EARLY MORNING,3,Missing/Undefined,13.0,AZ,,-109.923,31.9705,2454581.5,217,2454946.5,217,2008-04-25 02:17:00,2009-04-25 02:17:00,365 days 00:00:00,8760
1579610,BIRCH CREEK,4,Missing/Undefined,13.0,OR,Malheur,-117.29,44.3222,2456090.5,1756,2456456.5,1830,2012-06-12 17:56:00,2013-06-13 18:30:00,366 days 00:34:00,8785
1324066,JIMS BRANCH,4,Arson,7.0,WV,Wyoming,-81.349864,37.511007,2449661.5,1310,2450393.5,1310,1994-11-05 13:10:00,1996-11-06 13:10:00,732 days 00:00:00,17568


In [70]:
df[
    (df['DISCOVERY_DATETIME'].apply(lambda x: x.year) != df['CONT_DATETIME'].apply(lambda x: x.year))
    &
    (df['HOURS_TO_CONT'] > 200)
]

Unnamed: 0,FIRE_NAME,FIRE_SIZE_CLASS,STAT_CAUSE_DESCR,STAT_CAUSE_CODE,STATE,COUNTY,LONGITUDE,LATITUDE,DISCOVERY_DATE,DISCOVERY_TIME,CONT_DATE,CONT_TIME,DISCOVERY_DATETIME,CONT_DATETIME,TIME_TO_CONT,HOURS_TO_CONT
25280,DOMKE,7,Lightning,1.0,WA,7,-120.599444,48.167222,2454317.5,1900,2454472.5,0900,2007-08-05 19:00:00,2008-01-07 09:00:00,154 days 14:00:00,3710
34763,LICK RUN,2,Lightning,1.0,VA,15,-79.146944,38.368611,2454583.5,1630,2454948.5,2100,2008-04-27 16:30:00,2009-04-27 21:00:00,365 days 04:30:00,8764
48658,,1,Lightning,1.0,CA,,-118.708333,36.295000,2448922.5,1800,2449287.5,1930,1992-10-27 18:00:00,1993-10-27 19:30:00,365 days 01:30:00,8762
80774,VICTORIA,1,Miscellaneous,9.0,CA,,-117.180000,34.243333,2449726.5,1409,2450091.5,1500,1995-01-09 14:09:00,1996-01-09 15:00:00,365 days 00:51:00,8761
114369,FREIDLEIN,2,Campfire,4.0,AZ,5,-111.691667,35.291667,2451530.5,0950,2451547.5,1600,1999-12-18 09:50:00,2000-01-04 16:00:00,17 days 06:10:00,414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782782,,1,Miscellaneous,9.0,NJ,Ocean,-74.275800,40.059000,2456982.5,1000,2457024.5,1000,2014-11-21 10:00:00,2015-01-02 10:00:00,42 days 00:00:00,1008
1782784,WC - 131,3,Miscellaneous,9.0,WA,045,-123.472000,47.309700,2456977.5,1830,2457045.5,1430,2014-11-16 18:30:00,2015-01-23 14:30:00,67 days 20:00:00,1628
1782785,WATSON ORCHARD,1,Miscellaneous,9.0,WA,047,-119.897600,48.067900,2456854.5,1200,2457063.5,1200,2014-07-16 12:00:00,2015-02-10 12:00:00,209 days 00:00:00,5016
1782786,RISING EAGLE ROAD,5,Miscellaneous,9.0,WA,047,-120.146000,48.412900,2456870.5,1345,2457063.5,1200,2014-08-01 13:45:00,2015-02-10 12:00:00,192 days 22:15:00,4630


### Conclusion on outliers:

Probably best to drop ones with greater than 3000 hours. Looking at the fires that have > 1000 hours, most are within 2392. Safe to drop those with more as they are likely outliers.

In [69]:
#Save intermediate copy
df.to_pickle("data_with_target.pkl")

## Test model

In [72]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [157]:
# drop cols not used in model
drop_cols = ['FIRE_SIZE_CLASS', 'FIRE_NAME', 'STAT_CAUSE_DESCR', 'STATE', 'COUNTY', 'DISCOVERY_DATE',
            'DISCOVERY_DATETIME', 'CONT_DATE', 'CONT_DATETIME', 'TIME_TO_CONT']
X = df.drop(drop_cols, axis=1)
y = df['FIRE_SIZE_CLASS']

In [158]:
# create testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True,test_size=0.2, random_state=1)

# create training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle=True, test_size=0.25, random_state=1)

In [159]:
md = 10
clf = DecisionTreeClassifier(max_depth=md).fit(X_train, y_train)

In [160]:
print("Tree depth: ", clf.get_depth())
print("# of leaves: ", clf.get_n_leaves())
print("Feature importance: ")
for feature, value in dict(zip(X.columns.values, clf.feature_importances_)).items():
    print(f"\t{feature:8}: {round(value,3)}")
print("Train score: ", clf.score(X_train, y_train))
print("Test score: ", clf.score(X_test, y_test))

Tree depth:  10
# of leaves:  953
Feature importance: 
	STAT_CAUSE_CODE: 0.078
	LONGITUDE: 0.39
	LATITUDE: 0.11
	DISCOVERY_TIME: 0.011
	CONT_TIME: 0.015
	HOURS_TO_CONT: 0.27
	STATE_CODE: 0.125
Train score:  0.6427897451994851
Test score:  0.6365791863319918


In [168]:
# remove "outliers" and see if improvement
X = df[df.HOURS_TO_CONT < 3000].drop(drop_cols, axis=1)
y = df[df.HOURS_TO_CONT < 3000]['FIRE_SIZE_CLASS']
# create testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True,test_size=0.2, random_state=1)

# create training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, shuffle=True,test_size=0.25, random_state=1)

In [169]:
md = 10
clf = DecisionTreeClassifier(max_depth=md).fit(X_train, y_train)
print("Tree depth: ", clf.get_depth())
print("# of leaves: ", clf.get_n_leaves())
print("Feature importance: ")
for feature, value in dict(zip(X.columns.values, clf.feature_importances_)).items():
    print(f"\t{feature:8}: {round(value,3)}")
print("Train score: ", clf.score(X_train, y_train))
print("Test score: ", clf.score(X_test, y_test))

Tree depth:  10
# of leaves:  968
Feature importance: 
	STAT_CAUSE_CODE: 0.076
	LONGITUDE: 0.395
	LATITUDE: 0.112
	DISCOVERY_TIME: 0.01
	CONT_TIME: 0.017
	HOURS_TO_CONT: 0.27
	STATE_CODE: 0.121
Train score:  0.6435529121310739
Test score:  0.6375455193271276


## Try undersampling to improve performance

In [163]:
df.FIRE_SIZE_CLASS.value_counts()

2    386748
1    380835
3     92523
4     14622
5      8544
6      5584
7      3151
Name: FIRE_SIZE_CLASS, dtype: int64

In [180]:
# skewed data, let's undersample 1 and 2
sample2=df[df.FIRE_SIZE_CLASS == 2].sample(10000)
sample1=df[df.FIRE_SIZE_CLASS == 1].sample(10000)
sample3=df[df.FIRE_SIZE_CLASS == 3].sample(10000)

In [181]:
sampled_df= pd.concat([sample1, sample2, sample3,df[(df.FIRE_SIZE_CLASS != 1) & (df.FIRE_SIZE_CLASS != 2)]])
print(sampled_df.shape)
sampled_df.head()

(154424, 17)


Unnamed: 0,FIRE_NAME,FIRE_SIZE_CLASS,STAT_CAUSE_DESCR,STAT_CAUSE_CODE,STATE,COUNTY,LONGITUDE,LATITUDE,DISCOVERY_DATE,DISCOVERY_TIME,CONT_DATE,CONT_TIME,DISCOVERY_DATETIME,CONT_DATETIME,TIME_TO_CONT,HOURS_TO_CONT,STATE_CODE
312500,UNNAMED FIRE 0693,1,Arson,7.0,CA,Kern,-119.406903,35.054822,2453755.5,1400,2453755.5,1400,2006-01-20 14:00:00,2006-01-20 14:00:00,00:00:00,0,0
1220677,JG PILE,1,Debris Burning,5.0,WI,Dane,-89.76402,43.0492,2454936.5,1310,2454936.5,1310,2009-04-15 13:10:00,2009-04-15 13:10:00,00:00:00,0,34
352995,BLOCK,1,Campfire,4.0,TX,,-101.6644,35.7211,2455102.5,1030,2455102.5,1040,2009-09-28 10:30:00,2009-09-28 10:40:00,00:10:00,0,15
317188,UNNAMED FIRE 2503,1,Smoking,3.0,CA,Riverside,-117.275714,33.749322,2454262.5,924,2454262.5,925,2007-06-11 09:24:00,2007-06-11 09:25:00,00:01:00,0,0
70804,,1,Lightning,1.0,CA,,-122.553333,41.048333,2449623.5,1730,2449623.5,1800,1994-09-28 17:30:00,1994-09-28 18:00:00,00:30:00,0,0


In [182]:
# remove "outliers" and see if improvement
X = sampled_df[df.HOURS_TO_CONT < 3000].drop(drop_cols, axis=1)
y = sampled_df[df.HOURS_TO_CONT < 3000]['FIRE_SIZE_CLASS']
# create testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)

# create training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=True,random_state=1)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [186]:
md = 5
clf = DecisionTreeClassifier(max_depth=md).fit(X_train, y_train)
print("Tree depth: ", clf.get_depth())
print("# of leaves: ", clf.get_n_leaves())
print("Feature importance: ")
for feature, value in dict(zip(X.columns.values, clf.feature_importances_)).items():
    print(f"\t{feature:8}: {round(value,3)}")
print("Train score: ", clf.score(X_train, y_train))
print("Test score: ", clf.score(X_test, y_test))

Tree depth:  5
# of leaves:  32
Feature importance: 
	STAT_CAUSE_CODE: 0.05
	LONGITUDE: 0.35
	LATITUDE: 0.008
	DISCOVERY_TIME: 0.0
	CONT_TIME: 0.017
	HOURS_TO_CONT: 0.411
	STATE_CODE: 0.164
Train score:  0.6781333419792713
Test score:  0.6742210550205882


**Conclusion**: performance improved a lot

## Even sampling across everything

Sample so everything is max the lowest count of fire size classes.

In [172]:
# skewed data, let's undersample 1 and 2
sample1=df[df.FIRE_SIZE_CLASS == 1].sample(3151)
sample2=df[df.FIRE_SIZE_CLASS == 2].sample(3151)
sample3=df[df.FIRE_SIZE_CLASS == 3].sample(3151)
sample4=df[df.FIRE_SIZE_CLASS == 4].sample(3151)
sample5=df[df.FIRE_SIZE_CLASS == 5].sample(3151)
sample6=df[df.FIRE_SIZE_CLASS == 6].sample(3151)
sample7=df[df.FIRE_SIZE_CLASS==7]

sampled_df= pd.concat([sample1, sample2, sample3, sample4, sample5, sample6, sample7])
print(sampled_df.shape)

(22057, 17)


In [177]:
# remove "outliers" and see if improvement
X = sampled_df[df.HOURS_TO_CONT < 8000].drop(drop_cols, axis=1)
y = sampled_df[df.HOURS_TO_CONT < 8000]['FIRE_SIZE_CLASS']
# create testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)

# create training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=True,random_state=1)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [178]:
md = 10
clf = DecisionTreeClassifier(max_depth=md).fit(X_train, y_train)
print("Tree depth: ", clf.get_depth())
print("# of leaves: ", clf.get_n_leaves())
print("Feature importance: ")
for feature, value in dict(zip(X.columns.values, clf.feature_importances_)).items():
    print(f"\t{feature:8}: {round(value,3)}")
print("Train score: ", clf.score(X_train, y_train))
print("Test score: ", clf.score(X_test, y_test))

Tree depth:  10
# of leaves:  717
Feature importance: 
	STAT_CAUSE_CODE: 0.046
	LONGITUDE: 0.216
	LATITUDE: 0.141
	DISCOVERY_TIME: 0.06
	CONT_TIME: 0.064
	HOURS_TO_CONT: 0.427
	STATE_CODE: 0.046
Train score:  0.5544554455445545
Test score:  0.4180457946043981


**Conclusion**: performance degraded drastically by overrepresenting the rarer fire classes

## What next

Select best max depth decision tree?

Ensemble method Random Forest?