In [62]:
#Import some basic packages
import numpy as np
import pandas as pd

In [63]:
#Import sklearn packages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

In [None]:
#Let's import the data and instanciate a train/test split with the seed we fixed.

housing = pd.read_csv('../data/cal_fire_data_cleaned.csv')

from sklearn.model_selection import train_test_split
housing_train,housing_test = train_test_split(housing, test_size=.2, random_state=216, shuffle=True, stratify=housing['Destroyed'])



In [None]:
#Let's do a Random Forest Test

X = housing.drop(columns='Destroyed')
y = housing['Destroyed']

categorical_cols = X.select_dtypes(include='object').columns.tolist()

numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.difference(['Destroyed'])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('scaler', StandardScaler(), numeric_cols)
    ],
    remainder='passthrough'  
)

rfpipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=216))
])

param_grid = {
    'classifier__criterion': ['gini'],
    'classifier__max_depth': [5, 10, 15, 20, None],
    'classifier__min_samples_split': [2, 4, 6, 8, 10, 12],
    'classifier__min_samples_leaf': [1, 2, 4, 8]
}

X_train = housing_train.drop(columns='Destroyed')
y_train = housing_train['Destroyed']

X_test = housing_test.drop(columns='Destroyed')
y_test = housing_test['Destroyed']


grid_search_rf = GridSearchCV(rfpipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

print("best_params:", grid_search_rf.best_params_)
y_rfpred = grid_search_rf.best_estimator_.predict(X_test)
print(classification_report(y_test, y_rfpred))


Fitting 5 folds for each of 120 candidates, totalling 600 fits
best_params: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      5115
           1       0.91      0.81      0.86      3443

    accuracy                           0.89      8558
   macro avg       0.90      0.88      0.88      8558
weighted avg       0.89      0.89      0.89      8558



In [71]:
print('The confusion matrix is \n', confusion_matrix(housing_test['Destroyed'].values, y_rfpred))
print('The accuracy score is', accuracy_score(housing_test['Destroyed'].values, y_rfpred), '\n')
print('The precision score is', precision_score(housing_test['Destroyed'].values, y_rfpred), '\n')
print('The recall score is', recall_score(housing_test['Destroyed'].values, y_rfpred), '\n')
print('The f1 score is', f1_score(housing_test['Destroyed'].values, y_rfpred), '\n')

The confusion matrix is 
 [[4840  275]
 [ 657 2786]]
The accuracy score is 0.8910960504790839 

The precision score is 0.9101600784057498 

The recall score is 0.8091780424048794 

The f1 score is 0.8567035670356704 



In [67]:
from IPython.display import display
feature_df = pd.DataFrame({'feature': np.concat([pd.get_dummies(X, X.select_dtypes('object').columns).columns[4:].values, pd.get_dummies(X, X.select_dtypes('object').columns).columns[:4].values]),
                            'importance': grid_search_rf.best_estimator_['classifier'].feature_importances_})
display(feature_df.sort_values(by=['importance'], ascending=False))

Unnamed: 0,feature,importance
56,Longitude,0.261485
57,Age,0.173182
55,Latitude,0.070566
29,* Exterior Siding_Combustible,0.067628
36,* Exterior Siding_Wood,0.0625
54,Assessed Improved Value (parcel),0.062289
33,* Exterior Siding_Stucco Brick Cement,0.038158
30,* Exterior Siding_Ignition Resistant,0.033121
20,* Roof Construction_Tile,0.011848
26,"* Vent Screen_Mesh Screen > 1/8""""",0.011103


In [75]:
#Let's do the same but with Extra Trees model and compare.

X = housing.drop(columns='Destroyed')
y = housing['Destroyed']

categorical_cols = X.select_dtypes(include='object').columns.tolist()

numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.difference(['Destroyed'])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('scaler', StandardScaler(), numeric_cols)
    ],
    remainder='passthrough'  
)

etpipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ExtraTreesClassifier(n_estimators=100, random_state=216))
])

param_grid = {
    'classifier__criterion': ['gini'],
    'classifier__max_depth': [5, 10, 15, 20, None],
    'classifier__min_samples_split': [2, 4, 6, 8, 10, 12],
    'classifier__min_samples_leaf': [1, 2, 4, 8]
}

X_train = housing_train.drop(columns='Destroyed')
y_train = housing_train['Destroyed']

X_test = housing_test.drop(columns='Destroyed')
y_test = housing_test['Destroyed']


grid_search_et = GridSearchCV(etpipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search_et.fit(X_train, y_train)

print("best_params:", grid_search_et.best_params_)
y_etpred = grid_search_et.best_estimator_.predict(X_test)
print(classification_report(y_test, y_etpred))



Fitting 5 folds for each of 120 candidates, totalling 600 fits
best_params: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 8}
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      5115
           1       0.88      0.77      0.82      3443

    accuracy                           0.87      8558
   macro avg       0.87      0.85      0.86      8558
weighted avg       0.87      0.87      0.86      8558



In [76]:
print('The confusion matrix is \n', confusion_matrix(housing_test['Destroyed'].values, y_etpred))
print('The accuracy score is', accuracy_score(housing_test['Destroyed'].values, y_etpred), '\n')
print('The precision score is', precision_score(housing_test['Destroyed'].values, y_etpred), '\n')
print('The recall score is', recall_score(housing_test['Destroyed'].values, y_etpred), '\n')
print('The f1 score is', f1_score(housing_test['Destroyed'].values, y_etpred), '\n')


The confusion matrix is 
 [[4754  361]
 [ 794 2649]]
The accuracy score is 0.8650385604113111 

The precision score is 0.8800664451827243 

The recall score is 0.7693871623584083 

The f1 score is 0.8210134821013482 



In [77]:
feature_df = pd.DataFrame({'feature': np.concat([pd.get_dummies(X, X.select_dtypes('object').columns).columns[4:].values, pd.get_dummies(X, X.select_dtypes('object').columns).columns[:4].values]),
                            'importance': grid_search_et.best_estimator_['classifier'].feature_importances_})
display(feature_df.sort_values(by=['importance'], ascending=False))

Unnamed: 0,feature,importance
29,* Exterior Siding_Combustible,0.155839
56,Longitude,0.131632
36,* Exterior Siding_Wood,0.101106
30,* Exterior Siding_Ignition Resistant,0.097496
57,Age,0.095909
33,* Exterior Siding_Stucco Brick Cement,0.068969
54,Assessed Improved Value (parcel),0.026986
55,Latitude,0.021876
6,* Structure Type_Mobile Home Double Wide,0.019988
20,* Roof Construction_Tile,0.017337


It's clear that Random Forest offers better performance.

Let's do some cross validation to test if we can increase performance by removing features.

In [None]:
from sklearn.model_selection import StratifiedKFold
n_splits = 5
kfold = StratifiedKFold(n_splits,
                            shuffle=True,
                            random_state=216)

y = housing_train['Destroyed']

avg_Accs = []
for c in housing.columns[:-1]:   
    X = housing_train.drop(columns=['Destroyed',c])

    categorical_cols = X.select_dtypes(include='object').columns.tolist()

    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.difference(['Destroyed'])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('scaler', StandardScaler(), numeric_cols)
        ],
        remainder='passthrough'  
    )

    rfpipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=8, random_state=216))
    ])

    param_grid = {
        'classifier__criterion': ['gini'],
        'classifier__max_depth': [5, 10, 15, 20, None],
        'classifier__min_samples_split': [2, 4, 6, 8, 10, 12],
        'classifier__min_samples_leaf': [1, 2, 4, 8]
    }

    accuracies = []

    for train_index, test_index in kfold.split(X,y):
        X_tt = X.iloc[train_index]
        y_tt = y.iloc[train_index]
        X_ho = X.iloc[test_index]
        y_ho = y.iloc[test_index]
        
        grid_search_rf = GridSearchCV(rfpipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
        grid_search_rf.fit(X_tt, y_tt)

        y_rfpred = grid_search_rf.best_estimator_.predict(X_ho)
        accuracies += [accuracy_score(y_ho, y_rfpred)]
    print('The avg accuracy when removing column', c, 'is', np.mean(accuracies),'\n')
    avg_Accs += [np.mean(accuracies)]

print('The maximum accuracy is', np.max(avg_Accs), 'and is attained when we remove column', housing.columns[np.argmax(avg_Accs)])

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
The avg accuracy when removing column * Structure Type is 0.8856884337393914 

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
The avg accuracy when removing column * Roof Construction is 0.8823874450282864 

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for each of 120 candidates, totalling 600 fits
Fitting 5 folds for 