# 1. Acoustic Extinguisher Fire Dataset

In [26]:
# Importamos las librerías necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,plot_confusion_matrix,roc_auc_score, classification_report, confusion_matrix, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import pickle
import joblib

In [2]:
# Levantamos el dataset
df = pd.read_excel('data/Acoustic_Extinguisher_Fire_Dataset.xlsx')

In [3]:
# Visualizamos los primeros 5 registros
df.head()

Unnamed: 0,SIZE,FUEL,DISTANCE,DESIBEL,AIRFLOW,FREQUENCY,STATUS
0,1,gasoline,10,96,0.0,75,0
1,1,gasoline,10,96,0.0,72,1
2,1,gasoline,10,96,2.6,70,1
3,1,gasoline,10,96,3.2,68,1
4,1,gasoline,10,109,4.5,67,1


### Pre-procesamiento de los datos

#### 1. Renombrar columnas

In [4]:
columns = {
    'SIZE': 'size',
    'FUEL': 'fuel',
    'DISTANCE': 'distance',
    'DESIBEL': 'desibel',
    'AIRFLOW': 'airflow',
    'FREQUENCY': 'frequency',
    'STATUS': 'status',
}

df.rename(columns=columns, inplace=True)

#### 2. División de las variables predictoras (X) y variable objetivo (y)

In [5]:
# Definición de X e Y
X = df[['size', 'fuel', 'distance', 'desibel', 'airflow', 'frequency']]
y = df[['status']]

#### 3. Transformación de datos y pipeline de pre-procesamiento

In [6]:
numeric_features = ['size', 'distance', 'desibel', 'airflow', 'frequency']
categorical_features = ['fuel']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

#### 4. Separación en datos de entrenamiento y testeo

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [8]:
print('Composición del training set:')
print(y_train.value_counts())

print('\nComposición del test set:')
print(y_test.value_counts())

Composición del training set:
status
0         6131
1         6078
dtype: int64

Composición del test set:
status
0         2628
1         2605
dtype: int64


### Preparación de la experimentación

#### 1. Definición del experimento

In [9]:
random_forest_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', RandomForestClassifier())])

In [10]:
param_grid = {
    'classifier__max_depth': [15, 30, 45],
    'classifier__n_estimators': [50, 100, 150],
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
}

In [11]:
search = GridSearchCV(random_forest_pipe, param_grid, cv=5, n_jobs=-1, verbose=3)

#### 2. Definición de las métricas

In [12]:
def metric_report(y_test, y_pred, y_proba):  
    print(classification_report(y_test, y_pred))  
    print('Area bajo la curva ROC:',np.round(roc_auc_score(y_test, y_proba[:,1]), 4)) 
    precision, recall,threshold=precision_recall_curve(y_test, y_proba[:,1]);
    print('Area bajo la curva Precision-Recall:',np.round(auc(recall, precision), 4))

### Entrenamiento del modelo

In [13]:
search.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 27 candidates, totalling 135 fits


### Análisis del resultado obtenido

In [21]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,6.767012,0.109254,0.062502,0.009882,gini,15,50,"{'classifier__criterion': 'gini', 'classifier_...",0.962735,0.958231,0.97543,0.966421,0.965998,0.965763,0.005654,25
1,13.899997,0.142043,0.125003,0.009883,gini,15,100,"{'classifier__criterion': 'gini', 'classifier_...",0.961916,0.962326,0.974611,0.96683,0.965588,0.966254,0.004579,23
2,20.667855,0.407593,0.165626,0.015934,gini,15,150,"{'classifier__criterion': 'gini', 'classifier_...",0.965602,0.962735,0.973382,0.965602,0.966407,0.966746,0.003546,20
3,6.84109,0.172164,0.062501,0.009883,gini,30,50,"{'classifier__criterion': 'gini', 'classifier_...",0.961507,0.961916,0.972973,0.964373,0.966407,0.965435,0.004164,27
4,13.415856,0.375054,0.131249,0.007652,gini,30,100,"{'classifier__criterion': 'gini', 'classifier_...",0.964373,0.962735,0.972154,0.964783,0.967636,0.966336,0.00331,22
5,20.759638,0.268269,0.16875,0.018222,gini,30,150,"{'classifier__criterion': 'gini', 'classifier_...",0.964783,0.959459,0.971744,0.966011,0.966407,0.965681,0.003922,26
6,6.929847,0.259117,0.068751,0.007654,gini,45,50,"{'classifier__criterion': 'gini', 'classifier_...",0.962326,0.961507,0.97502,0.964783,0.970094,0.966746,0.005109,19
7,13.9,0.361826,0.131248,0.015933,gini,45,100,"{'classifier__criterion': 'gini', 'classifier_...",0.965602,0.962326,0.970925,0.965192,0.966407,0.966091,0.002782,24
8,21.405835,0.276887,0.190625,0.011693,gini,45,150,"{'classifier__criterion': 'gini', 'classifier_...",0.963554,0.963145,0.972563,0.965602,0.967636,0.9665,0.003429,21
9,6.979001,0.258287,0.071876,0.012499,entropy,15,50,"{'classifier__criterion': 'entropy', 'classifi...",0.962735,0.962326,0.973792,0.96683,0.970914,0.967319,0.004494,15


In [15]:
search.best_estimator_

In [16]:
search.best_score_

0.9691211862862825

In [22]:
y_pred = search.predict(X_test)

In [23]:
y_proba = search.predict_proba(X_test)

In [24]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2628
           1       0.98      0.96      0.97      2605

    accuracy                           0.97      5233
   macro avg       0.97      0.97      0.97      5233
weighted avg       0.97      0.97      0.97      5233

Area bajo la curva ROC: 0.9963
Area bajo la curva Precision-Recall: 0.9964


In [25]:
# save the model
filename = 'models/random_forest_search.sav'
pickle.dump(search, open(filename, 'wb'))

### Conclusión:

Hemos realizado una búsqueda hiperparamétrica sobre el modelo de random forest, con el objetivo de encontrar un modelo que performe de manera superior al de árboles de decisión entrenado previamente. Y hemos hayado que con los parámetros: **max_depth=45, criterion='log_loss' y n_estimators=100**, se obtiene el mejor estimador. 

No obstante, la mejora obtenida es del 1% sobre la métrica f1-score, pero el modelo almacenado ocupa 10 veces más espacio. Por lo que concluimos que si bien el modelo performa mejor, no creemos que valga la pena utilizarlo en lugar del árbol de decisión ya que añade mucha complejidad, perdiendo interpretabilidad, por una performance levemente superior.