In [None]:
import pandas as pd
import numpy as np
from matplotlib import pylab as plt
from sklearn import linear_model, model_selection, metrics, random_projection
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier

### Preparing data for the machine learning models

In [None]:
# Getting data ready for models
cord_all_clean = pd.read_csv('dash_clean.csv')

# Work on an explicit copy: the raw frame stays exactly as loaded, and this
# cell can be re-run without depending on earlier in-place edits.
model_firedata = cord_all_clean.copy()
model_firedata['stat_cause_code'] = model_firedata['stat_cause_code'].astype(int)

# filtering out fires with missing cause (cause code 13).
# .copy() makes the filtered frame independent, so the column assignments
# below write to it directly instead of to a slice (SettingWithCopyWarning).
model_firedata = model_firedata[model_firedata['stat_cause_code'] != 13].copy()

# Categorize cause --> 1 is caused by nature, 2 is caused by human
# (every cause code greater than 1 is treated as human-caused).
model_firedata['human_caused'] = 1
model_firedata.loc[model_firedata['stat_cause_code'] > 1, 'human_caused'] = 2

# classify the fire size into three severity levels:
#   1: fire_size <= 2.5, 2: 2.5 < fire_size <= 100, 3: fire_size > 100.
# Rows whose fire_size is missing match none of the masks and keep NaN.
model_firedata.loc[model_firedata['fire_size'] <= 2.5, 'fire_severity'] = 1
model_firedata.loc[(model_firedata['fire_size'] > 2.5) & (model_firedata['fire_size'] <= 100), 'fire_severity'] = 2
model_firedata.loc[model_firedata['fire_size'] > 100, 'fire_severity'] = 3

# Keep only the rows that have a D3 value.
model_firedata = model_firedata[model_firedata['D3'].notnull()]

### Decide on features and label
The features are latitude, longitude, the month and year in which the fire happened, the drought level (`D3`), and the fire's severity level (based on acres burned).
The label indicates whether the fire was caused by human-related or nature-related reasons.

In [None]:
# Columns fed to the models as predictors.
train_cols = [
    'latitude',
    'longitude',
    'fire_month',
    'fire_year',
    'D3',
    'fire_severity',
]

# Feature matrix and target vector. Despite its name, `label_severity` holds
# the human/nature cause flag, not the fire severity.
train_firedata = model_firedata.loc[:, train_cols]
label_severity = model_firedata.loc[:, 'human_caused']

### Train test split and SMOTE

85% of wildfires are caused by human-related reasons, which makes the dataset imbalanced for prediction. Here we use SMOTE (Synthetic Minority Over-sampling Technique) to oversample the minority class in the training data.

In [None]:
# Split the data into training and testing sets (50/50), stratified on the
# label so both halves keep the same class proportions.
# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each run of the notebook.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    train_firedata,
    label_severity,
    test_size=0.50,
    stratify=label_severity,
    random_state=42,
)

In [None]:
# Oversample the minority class of the training split only; the test split is
# not resampled. SMOTE is already imported in the notebook's import cell.
#
# Uses the current imbalanced-learn API:
#   - `sampling_strategy` replaces the removed `ratio` argument,
#   - the removed `kind` argument is dropped (plain SMOTE is the regular variant),
#   - `fit_resample` replaces the removed `fit_sample` method.
smote = SMOTE(sampling_strategy='auto', random_state=42)
smox, smoy = smote.fit_resample(x_train, y_train)

### Use GridSearchCV to find the best hyperparameters

In [None]:
# Hyperparameter grid for the random forest:
# 1 * 4 * 2 * 3 * 3 * 4 = 288 combinations, each fitted once per CV fold.
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 500]
}
# Create a base model. The fixed seed makes every candidate forest, and
# therefore the selected best_params_, repeatable between runs.
rf = RandomForestClassifier(random_state=42)
# Instantiate the grid search model (3-fold CV, all CPU cores, verbose log)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
# NOTE(review): the search runs on the original training split with the
# default scoring, while the final models are fitted on the SMOTE-resampled
# data (smox, smoy) -- confirm this is intended.
grid_search.fit(x_train, y_train)

In [None]:
# Hyperparameter grid for gradient boosting: 4 * 2 * 4 * 3 = 96 combinations.
# NOTE(review): scikit-learn resolves a float max_features as
# max(1, int(fraction * n_features)). If x_train has the 6 columns listed in
# train_cols, 0.3 and 0.1 both resolve to a single feature -- the same as the
# integer 1 -- so the three settings are equivalent. Consider [1, 2, 3].
param_grid = {'learning_rate': [0.05, 0.1, 0.01, 0.02],
              'max_depth': [4, 6],
              'min_samples_leaf': [3, 5, 9, 17],
              'max_features': [1, 0.3, 0.1]}

# Seeded estimator so the search result is repeatable between runs.
est = GradientBoostingClassifier(n_estimators=3000, random_state=42)

# cv and n_jobs are set explicitly to match the random forest search: the
# default number of folds differs between scikit-learn versions, and 3000-tree
# fits are slow on a single core.
gs_cv = GridSearchCV(est, param_grid, cv=3, n_jobs=-1).fit(x_train, y_train)

In [None]:
# Show the best hyperparameters found by both searches.
# print() is used because a notebook cell only renders its final expression;
# a bare `grid_search.best_params_` followed by other statements is not shown.
print('Random Forest:', grid_search.best_params_)
print('Gradient Boosting Tree:', gs_cv.best_params_)

# Results recorded from an earlier run, kept for reference.
#
# Random Forest
# {'bootstrap': True,
#  'max_depth': 110,
#  'max_features': 3,
#  'min_samples_leaf': 5,
#  'min_samples_split': 12,
#  'n_estimators': 100}
#
# Gradient Boosting Tree
# {'learning_rate': 0.01,
#  'max_depth': 6,
#  'max_features': 0.3,
#  'min_samples_leaf': 9}

### Construct Random Forest / Gradient Boosting Tree Models

#### Random Forest

In [None]:
# Create the random forest classifier.
# random_state fixes the bootstrap samples and feature draws so the fitted
# model and its classification report are repeatable.
# NOTE(review): max_depth is 35 here, while the grid-search result recorded in
# this notebook is 110 -- confirm which value is intended.
clf = RandomForestClassifier(n_estimators=100,
                             bootstrap=True,
                             max_depth=35,
                             max_features=3,
                             min_samples_leaf=5,
                             min_samples_split=12,
                             random_state=42)

In [None]:
# Train the forest on the SMOTE-resampled training data, predict on the test
# split, and print the per-class precision / recall / F1 report.
ypred = clf.fit(smox, smoy).predict(x_test)
print(classification_report(y_true=y_test, y_pred=ypred))

#### Gradient Boosting Regression Tree

In [None]:
# Gradient boosting classifier configured with the hyperparameters recorded
# from the grid search (learning_rate, max_depth, max_features, min_samples_leaf).
# random_state is set because max_features < 1.0 makes each split consider a
# random subset of features; without a seed the fitted model changes per run.
gbrt = GradientBoostingClassifier(n_estimators=3000,
                                  learning_rate=0.01,
                                  max_depth=6,
                                  max_features=0.3,
                                  min_samples_leaf=9,
                                  random_state=42)

In [None]:
# Train the boosted model on the SMOTE-resampled training data, predict on the
# test split, and print the per-class precision / recall / F1 report.
ypred = gbrt.fit(smox, smoy).predict(x_test)
print(classification_report(y_true=y_test, y_pred=ypred))