# Import and Cleaning Data

In [1]:
# Initial Import of Dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the wildfire records from FL2.csv into a DataFrame
fire_df = pd.read_csv(Path("FL2.csv"))

In [3]:
# Check Data Types
# latitude and longitude load as object columns, so they have to be
# converted before they can be used as numeric values.
fire_df.dtypes

fire_year          int64
district          object
unit              object
fire_number       object
fire_name         object
legal             object
latitude          object
longitude         object
fuel_model        object
county            object
report_date       object
general_cause     object
odf_acres        float64
total_acres      float64
dtype: object

In [4]:
# Converting Lat/Long from degree/minute/second strings to decimal degrees.
# The pattern captures three numeric groups (degrees, minutes, seconds) and
# ignores everything else, including a leading minus sign.
pattern = r'(?P<d>[\d\.]+).*?(?P<m>[\d\.]+).*?(?P<s>[\d\.]+)'


def dms_to_decimal(dms_strings):
    """Convert a Series of DMS strings to unsigned decimal degrees.

    Parameters
    ----------
    dms_strings : pandas.Series of str
        Values such as ``45° 23' 19.92"``.

    Returns
    -------
    pandas.Series of float
        degrees + minutes / 60 + seconds / 3600. The sign is not captured by
        ``pattern``, so the result is a magnitude; rows that do not match the
        pattern come back as NaN.
    """
    dms = dms_strings.str.extract(pattern).astype(float)
    return dms['d'] + dms['m'].div(60) + dms['s'].div(3600)


# Lat
fire_df['Latitude'] = dms_to_decimal(fire_df['latitude'])

# Long: the extracted magnitude carries no sign, so every longitude is
# negated to make it negative.
fire_df['Longitude'] = -dms_to_decimal(fire_df['longitude'])

fire_df.head(5)

Unnamed: 0,fire_year,district,unit,fire_number,fire_name,legal,latitude,longitude,fuel_model,county,report_date,general_cause,odf_acres,total_acres,Latitude,Longitude
0,2021,51 - Tillamook,Tillamook,1,GRAVEL PIT,T2S R8W S13 SESW,"45° 23' 19.92""","-123° 37' 2.27""",R,Tillamook,7/7/21 9:09,Recreationist,0.01,0.01,45.388867,-123.617297
1,2021,51 - Tillamook,Tillamook,11,101 on 101,T6S R10W S7 NENE,"45° 4' 21.7""","-123° 56' 52.4""",G,Tillamook,7/16/21 11:32,Equipment Use,0.1,0.1,45.072694,-123.947889
2,2021,51 - Tillamook,Tillamook,23,Lost Creek,T3N R8W S28 SESE,"45° 42' 25.98""","-123° 39' 47.4""",K,Tillamook,7/31/21 17:19,Lightning,0.1,0.1,45.707217,-123.663167
3,2021,51 - Tillamook,Tillamook,26,Larson Creek Fire,T1S R11W S24 NENE,"45° 28' 35.4""","-123° 58' 7.68""",H,Tillamook,8/4/21 22:50,Recreationist,0.59,0.59,45.4765,-123.9688
4,2021,51 - Tillamook,Tillamook,30,Cedar Butte,T1N R8W S14 SWSE,"45° 33' 59.21""","-123° 37' 39.6""",J,Tillamook,8/11/21 2:48,Under Invest,74.0,74.0,45.566447,-123.627667


In [5]:
# Remove the original DMS latitude/longitude strings and the odf_acres column
dropped_columns = ['latitude', 'longitude', "odf_acres"]
fire_df = fire_df.drop(columns=dropped_columns)
# Show the first rows to confirm the columns are gone
fire_df.head(5)

Unnamed: 0,fire_year,district,unit,fire_number,fire_name,legal,fuel_model,county,report_date,general_cause,total_acres,Latitude,Longitude
0,2021,51 - Tillamook,Tillamook,1,GRAVEL PIT,T2S R8W S13 SESW,R,Tillamook,7/7/21 9:09,Recreationist,0.01,45.388867,-123.617297
1,2021,51 - Tillamook,Tillamook,11,101 on 101,T6S R10W S7 NENE,G,Tillamook,7/16/21 11:32,Equipment Use,0.1,45.072694,-123.947889
2,2021,51 - Tillamook,Tillamook,23,Lost Creek,T3N R8W S28 SESE,K,Tillamook,7/31/21 17:19,Lightning,0.1,45.707217,-123.663167
3,2021,51 - Tillamook,Tillamook,26,Larson Creek Fire,T1S R11W S24 NENE,H,Tillamook,8/4/21 22:50,Recreationist,0.59,45.4765,-123.9688
4,2021,51 - Tillamook,Tillamook,30,Cedar Butte,T1N R8W S14 SWSE,J,Tillamook,8/11/21 2:48,Under Invest,74.0,45.566447,-123.627667


In [6]:
# First discard columns that are entirely null, then discard every row
# that still contains at least one null value
fire_df = fire_df.dropna(axis='columns', how='all').dropna()

In [7]:
# LabelEncoder maps each distinct category value to an integer code;
# it is used here for Fuel Model and, in the next cell, General Cause
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Fuel Model -> integer codes (learn the categories, then encode them)
label_encoder = LabelEncoder()
label_encoder.fit(fire_df["fuel_model"])
fire_df["fueltype_num"] = label_encoder.transform(fire_df["fuel_model"])

In [8]:
# General Cause -> integer codes, using a fresh encoder for this column
label_encoder = LabelEncoder()
label_encoder.fit(fire_df["general_cause"])
fire_df["generalCause_num"] = label_encoder.transform(fire_df["general_cause"])

In [9]:
# Drop all rows where total acres is less than 1
#fire_df.drop(fire_df[fire_df.total_acres < 1].index, inplace=True)
#fire_df.head(20)

In [10]:
# Set and Classify Fire Sizes
# Severity classes by total acres burned (upper bounds inclusive):
#   1: <= 10   2: (10, 100]   3: (100, 1000]   4: (1000, 10000]   5: > 10000
# Keeping the edges in one list means neighbouring classes always share a
# boundary, so editing a threshold cannot leave a gap or an overlap.
SEVERITY_BIN_EDGES = [float("-inf"), 10, 100, 1000, 10000, float("inf")]
SEVERITY_CLASSES = [1, 2, 3, 4, 5]

# right=True closes each bin on its upper edge, matching the `<=` comparisons
# this replaces. The cast keeps the labels as floats (1.0 ... 5.0), which is
# what the mask-based assignments produced.
fire_df['fire_severity'] = pd.cut(
    fire_df['total_acres'],
    bins=SEVERITY_BIN_EDGES,
    labels=SEVERITY_CLASSES,
    right=True,
).astype(float)

In [11]:
# Count how many fires fall into each severity class
severity_counts = fire_df['fire_severity'].value_counts()
print(severity_counts)

1.0    31989
2.0     1131
3.0      404
4.0      135
5.0       96
Name: fire_severity, dtype: int64


#  Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [13]:
# One-hot encode the two categorical columns; every other column is kept as is
categorical_columns = ["fuel_model", "general_cause"]
fire_binary_encoded = pd.get_dummies(fire_df, columns=categorical_columns)
fire_binary_encoded.head()

Unnamed: 0,fire_year,district,unit,fire_number,fire_name,legal,county,report_date,total_acres,Latitude,...,general_cause_Arson,general_cause_Debris Burning,general_cause_Equipment Use,general_cause_Juveniles,general_cause_Lightning,general_cause_Miscellaneous,general_cause_Railroad,general_cause_Recreationist,general_cause_Smoking,general_cause_Under Invest
0,2021,51 - Tillamook,Tillamook,1,GRAVEL PIT,T2S R8W S13 SESW,Tillamook,7/7/21 9:09,0.01,45.388867,...,0,0,0,0,0,0,0,1,0,0
1,2021,51 - Tillamook,Tillamook,11,101 on 101,T6S R10W S7 NENE,Tillamook,7/16/21 11:32,0.1,45.072694,...,0,0,1,0,0,0,0,0,0,0
2,2021,51 - Tillamook,Tillamook,23,Lost Creek,T3N R8W S28 SESE,Tillamook,7/31/21 17:19,0.1,45.707217,...,0,0,0,0,1,0,0,0,0,0
3,2021,51 - Tillamook,Tillamook,26,Larson Creek Fire,T1S R11W S24 NENE,Tillamook,8/4/21 22:50,0.59,45.4765,...,0,0,0,0,0,0,0,1,0,0
4,2021,51 - Tillamook,Tillamook,30,Cedar Butte,T1N R8W S14 SWSE,Tillamook,8/11/21 2:48,74.0,45.566447,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Columns removed before modelling:
#  - the label-encoded copies of fuel model / general cause (the one-hot
#    columns already carry that information)
#  - identifier, free-text and date columns
#  - total_acres, which fire_severity was derived from
non_feature_columns = [
    'fueltype_num',
    'generalCause_num',
    'district',
    'unit',
    "fire_name",
    "legal",
    "report_date",
    "fire_number",
    "county",
    "total_acres",
]
fire_binary_encoded = fire_binary_encoded.drop(columns=non_feature_columns)

In [15]:
# Preview the encoded frame after the non-feature columns were dropped
fire_binary_encoded.head()

Unnamed: 0,fire_year,Latitude,Longitude,fire_severity,fuel_model_A,fuel_model_B,fuel_model_C,fuel_model_F,fuel_model_G,fuel_model_H,...,general_cause_Arson,general_cause_Debris Burning,general_cause_Equipment Use,general_cause_Juveniles,general_cause_Lightning,general_cause_Miscellaneous,general_cause_Railroad,general_cause_Recreationist,general_cause_Smoking,general_cause_Under Invest
0,2021,45.388867,-123.617297,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2021,45.072694,-123.947889,1.0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,2021,45.707217,-123.663167,1.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2021,45.4765,-123.9688,1.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,2021,45.566447,-123.627667,2.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# Feature matrix: every remaining column except the fire_severity target
X = fire_binary_encoded.drop(columns="fire_severity")
X.head()

Unnamed: 0,fire_year,Latitude,Longitude,fuel_model_A,fuel_model_B,fuel_model_C,fuel_model_F,fuel_model_G,fuel_model_H,fuel_model_I,...,general_cause_Arson,general_cause_Debris Burning,general_cause_Equipment Use,general_cause_Juveniles,general_cause_Lightning,general_cause_Miscellaneous,general_cause_Railroad,general_cause_Recreationist,general_cause_Smoking,general_cause_Under Invest
0,2021,45.388867,-123.617297,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2021,45.072694,-123.947889,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2021,45.707217,-123.663167,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2021,45.4765,-123.9688,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,2021,45.566447,-123.627667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [17]:
# Define the target set.
# Series.ravel() is deprecated in recent pandas releases; a Series is already
# one-dimensional, so to_numpy() yields the same NumPy array of severity labels.
y = fire_binary_encoded["fire_severity"].to_numpy()
y[:5]

array([1., 1., 1., 1., 2.])

In [18]:
# Split features and target into train and test sets; the fixed random_state
# makes the split reproducible.
# NOTE(review): the severity classes are heavily imbalanced and this split is
# not stratified — consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [19]:
# Standardise the features: the scaler is fitted on the training split only,
# then the same fitted scaler transforms both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [20]:
# Random forest with 500 trees; random_state fixes the seed for repeatable runs
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [21]:
# Train the forest on the scaled training features and their severity labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [22]:
# Predict a severity class for every row of the scaled test set
predictions = rf_model.predict(X=X_test_scaled)

In [23]:
# Show the array of predicted severity classes for the test set
predictions

array([1., 1., 1., ..., 1., 1., 1.])

In [24]:
# Calculate feature importance in the Random Forest model.
# One value per feature column the model was trained on; the next cell pairs
# them with X.columns.
importances = rf_model.feature_importances_
importances

array([0.17133767, 0.34927838, 0.36509354, 0.00886146, 0.0014257 ,
       0.00732571, 0.00412093, 0.00398976, 0.00710081, 0.003049  ,
       0.00487469, 0.00278169, 0.00556823, 0.00276958, 0.00425636,
       0.00276631, 0.00237595, 0.00461601, 0.00777806, 0.00953614,
       0.00343768, 0.00798013, 0.00492428, 0.00182248, 0.00594233,
       0.00301456, 0.00397254])

In [25]:
# Pair each importance with its feature name and list them largest first
importance_pairs = list(zip(rf_model.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.36509353897786395, 'Longitude'),
 (0.3492783821564697, 'Latitude'),
 (0.17133767444408768, 'fire_year'),
 (0.009536142081751162, 'general_cause_Equipment Use'),
 (0.008861459139353744, 'fuel_model_A'),
 (0.007980134152730583, 'general_cause_Lightning'),
 (0.007778061939562793, 'general_cause_Debris Burning'),
 (0.007325711075424436, 'fuel_model_C'),
 (0.007100813538484868, 'fuel_model_H'),
 (0.005942331954871164, 'general_cause_Recreationist'),
 (0.005568234702926596, 'fuel_model_L'),
 (0.004924277080402656, 'general_cause_Miscellaneous'),
 (0.004874688818963913, 'fuel_model_J'),
 (0.004616012956922642, 'general_cause_Arson'),
 (0.004256359148098974, 'fuel_model_T'),
 (0.004120926427645409, 'fuel_model_F'),
 (0.003989763037764615, 'fuel_model_G'),
 (0.003972536938134869, 'general_cause_Under Invest'),
 (0.0034376759639094478, 'general_cause_Juveniles'),
 (0.003049000914440736, 'fuel_model_I'),
 (0.0030145645232402024, 'general_cause_Smoking'),
 (0.002781694788289683, 'fuel_model_K'

In [26]:
# Legend For Fuel Models
#A	Annual grasses (cheat)
#B	Dense Chaparral
#C	Open pine, grass under
#F	Dense Brush (lighter than B)
#G	Conifer, Old growth
#H	Conifer, Second growth
#I	Slash, heavy
#J	Slash, medium
#K	Slash, thinning, P.C., Scattrd
#L	Grass Perennial
#R	Hardwood, summer
#T	Sagebrush, medium dense
#U	Closed canopy pine
#X	Non wildland fuel

In [27]:
# Confusion matrix comparing actual and predicted fire severity classes.
# The labels are passed explicitly so the matrix is always 5x5. Without them
# scikit-learn only includes classes that occur in y_test or predictions, and
# a rare class missing from the split would make the matrix smaller than the
# five fixed row/column names below, raising a shape error.
cm_labels = [1.0, 2.0, 3.0, 4.0, 5.0]
cm = confusion_matrix(y_test, predictions, labels=cm_labels)
# DataFrame from the confusion matrix: rows are actual classes, columns are
# predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {int(label)}" for label in cm_labels],
    columns=[f"Predicted {int(label)}" for label in cm_labels],
)
cm_df

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
Actual 1,7961,12,11,4,1
Actual 2,272,12,0,0,0
Actual 3,92,5,3,0,0
Actual 4,39,1,1,1,1
Actual 5,23,0,0,0,0


In [28]:
# Fraction of test rows whose predicted severity class equals the actual one
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [29]:
# Report the evaluation results: confusion matrix table, overall accuracy,
# then the per-class precision / recall / f1 report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5
Actual 1,7961,12,11,4,1
Actual 2,272,12,0,0,0
Actual 3,92,5,3,0,0
Actual 4,39,1,1,1,1
Actual 5,23,0,0,0,0


Accuracy Score : 0.9452541770351938
Classification Report
              precision    recall  f1-score   support

         1.0       0.95      1.00      0.97      7989
         2.0       0.40      0.04      0.08       284
         3.0       0.20      0.03      0.05       100
         4.0       0.20      0.02      0.04        43
         5.0       0.00      0.00      0.00        23

    accuracy                           0.95      8439
   macro avg       0.35      0.22      0.23      8439
weighted avg       0.92      0.95      0.92      8439

