In [None]:
#Import some basic packages
import numpy as np
import pandas as pd

In [None]:
#Import sklearn packages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

In [None]:
# Load the cleaned data and instantiate a train/test split with the seed we fixed.
# train_test_split is already imported in the sklearn imports cell above,
# so it is not re-imported here.

housing = pd.read_csv('../data/cal_fire_data_cleaned.csv')

housing_train, housing_test = train_test_split(
    housing,
    test_size=.2,
    random_state=216,               # fixed seed so the split is reproducible
    shuffle=True,
    stratify=housing['Destroyed'],  # stratify the split on the target column
)



We're trying an Extra Trees Classifier model.


In [None]:
# Same workflow as before, this time with an Extra Trees model, so the two can be compared.

# Features / target for the train and test partitions created above.
X_train, y_train = housing_train.drop(columns='Destroyed'), housing_train['Destroyed']
X_test, y_test = housing_test.drop(columns='Destroyed'), housing_test['Destroyed']

# Column groups are read off the full feature frame.
X = housing.drop(columns='Destroyed')
y = housing['Destroyed']

categorical_cols = list(X.select_dtypes(include='object').columns)
# Index.difference returns the names sorted; kept so the column order is unchanged.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.difference(['Destroyed'])

# One-hot encode object columns, standardise int64/float64 columns,
# and pass every other column through untouched.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [
        ('cat', one_hot, categorical_cols),
        ('scaler', StandardScaler(), numeric_cols),
    ],
    remainder='passthrough',
)

et_model = ExtraTreesClassifier(n_estimators=100, random_state=216)
etpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', et_model),
])

# Hyper-parameter grid; the `classifier__` prefix routes each entry to the pipeline's final step.
param_grid = dict(
    classifier__criterion=['gini'],
    classifier__max_depth=[5, 10, 15, 20, None],
    classifier__min_samples_split=[2, 4, 6, 8, 10, 12],
    classifier__min_samples_leaf=[1, 2, 4, 8],
)

# 5-fold cross-validated grid search on the training partition, using all available cores.
grid_search_et = GridSearchCV(
    estimator=etpipeline,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search_et.fit(X_train, y_train)

print("best_params:", grid_search_et.best_params_)

# Evaluate the refit best pipeline on the held-out test partition.
y_etpred = grid_search_et.best_estimator_.predict(X_test)
print(classification_report(y_test, y_etpred))



In [None]:
# Test-set metrics for the Extra Trees predictions computed in the previous cell.
y_true_et = housing_test['Destroyed'].values

print('The confusion matrix is \n', confusion_matrix(y_true_et, y_etpred))

# Each scalar metric is printed in the same order and format.
for label, metric in (
    ('The accuracy score is', accuracy_score),
    ('The precision score is', precision_score),
    ('The recall score is', recall_score),
    ('The f1 score is', f1_score),
):
    print(label, metric(y_true_et, y_etpred), '\n')


It's clear that the regular Random Forest Classifier offers much better performance than this Extra Trees model, so we won't investigate Extra Trees further beyond listing its feature importances below.

In [None]:
# Feature importances of the best Extra Trees pipeline found by the grid search.
#
# The feature names are taken from the *fitted* preprocessor, so they are guaranteed
# to line up one-to-one with the classifier's `feature_importances_`. Rebuilding the
# names with pd.get_dummies on the full data and a hard-coded column split can
# silently mislabel importances, because the encoder was fit on the training
# partition only and the numeric columns were reordered by `Index.difference`.
# This also avoids `np.concat`, which only exists in NumPy >= 2.0.
best_et_pipeline = grid_search_et.best_estimator_
et_feature_names = best_et_pipeline['preprocessor'].get_feature_names_out()
et_importances = best_et_pipeline['classifier'].feature_importances_

# Names carry the transformer prefix (e.g. `cat__`, `scaler__`, `remainder__`),
# which shows how each column was preprocessed.
feature_df = pd.DataFrame({'feature': et_feature_names,
                           'importance': et_importances})
display(feature_df.sort_values(by=['importance'], ascending=False))