# 1. Acoustic Extinguisher Fire Dataset

In [2]:
# Importamos las librerías necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,plot_confusion_matrix,roc_auc_score, classification_report, confusion_matrix, precision_recall_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import pickle

In [3]:
# Levantamos el dataset
df = pd.read_excel('Acoustic_Extinguisher_Fire_Dataset.xlsx')

In [4]:
# Visualizamos los primeros 5 registros
df.head()

Unnamed: 0,SIZE,FUEL,DISTANCE,DESIBEL,AIRFLOW,FREQUENCY,STATUS
0,1,gasoline,10,96,0.0,75,0
1,1,gasoline,10,96,0.0,72,1
2,1,gasoline,10,96,2.6,70,1
3,1,gasoline,10,96,3.2,68,1
4,1,gasoline,10,109,4.5,67,1


### Pre-procesamiento de los datos

#### 1. Renombrar columnas

In [5]:
columns = {
    'SIZE': 'size',
    'FUEL': 'fuel',
    'DISTANCE': 'distance',
    'DESIBEL': 'desibel',
    'AIRFLOW': 'airflow',
    'FREQUENCY': 'frequency',
    'STATUS': 'status',
}

df.rename(columns=columns, inplace=True)

#### 2. División de las variables predictoras (X) y variable objetivo (y)

In [6]:
# Definición de X e Y
X = df[['size', 'fuel', 'distance', 'desibel', 'airflow', 'frequency']]
y = df[['status']]

#### 3. Transformación de datos y pipeline de pre-procesamiento

In [7]:
numeric_features = ['size', 'distance', 'desibel', 'airflow', 'frequency']
categorical_features = ['fuel']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

#### 4. Separación en datos de entrenamiento y testeo

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [9]:
print('Composición del training set:')
print(y_train.value_counts())

print('\nComposición del test set:')
print(y_test.value_counts())

Composición del training set:
status
0         6131
1         6078
dtype: int64

Composición del test set:
status
0         2628
1         2605
dtype: int64


### Preparación de la experimentación

#### 1. Definición del modelo

In [10]:
decision_tree_model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', DecisionTreeClassifier(criterion='gini', max_depth=30))])

#### 2. Definición de las métricas

In [11]:
def metric_report(y_test, y_pred, y_proba):  
    print(classification_report(y_test, y_pred))  
    print('Area bajo la curva ROC:',np.round(roc_auc_score(y_test, y_proba[:,1]), 4)) 
    precision, recall,threshold=precision_recall_curve(y_test, y_proba[:,1]);
    print('Area bajo la curva Precision-Recall:',np.round(auc(recall, precision), 4))

#### 3. Entrenamiento del modelo

In [12]:
decision_tree_model.fit(X_train, y_train)

#### 4. Análisis del resultado obtenido

In [13]:
y_pred = decision_tree_model.predict(X_test)

In [14]:
y_proba = decision_tree_model.predict_proba(X_test)

In [15]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2628
           1       0.96      0.95      0.96      2605

    accuracy                           0.96      5233
   macro avg       0.96      0.96      0.96      5233
weighted avg       0.96      0.96      0.96      5233

Area bajo la curva ROC: 0.9566
Area bajo la curva Precision-Recall: 0.9684


In [16]:
RandomForestClassifier_model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', DecisionTreeClassifier(criterion='gini', max_depth=30))])

In [17]:
RandomForestClassifier_model.fit(X_train, y_train)

In [18]:
y_pred = RandomForestClassifier_model.predict(X_test)

In [19]:
y_proba = RandomForestClassifier_model.predict_proba(X_test)

In [20]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      2628
           1       0.96      0.96      0.96      2605

    accuracy                           0.96      5233
   macro avg       0.96      0.96      0.96      5233
weighted avg       0.96      0.96      0.96      5233

Area bajo la curva ROC: 0.9585
Area bajo la curva Precision-Recall: 0.9692


In [22]:
KNeighborsClassifier_model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', KNeighborsClassifier())])


In [23]:
KNeighborsClassifier_model.fit(X_train, y_train)

  return self._fit(X, y)


In [28]:
y_pred = KNeighborsClassifier_model.predict(X_test)

In [29]:
y_proba = KNeighborsClassifier_model.predict_proba(X_test)

In [30]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2628
           1       0.96      0.95      0.96      2605

    accuracy                           0.96      5233
   macro avg       0.96      0.96      0.96      5233
weighted avg       0.96      0.96      0.96      5233

Area bajo la curva ROC: 0.9917
Area bajo la curva Precision-Recall: 0.9928


In [33]:
LogisticRegression_model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LogisticRegression ())])



In [34]:
LogisticRegression_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [35]:
y_pred = LogisticRegression_model.predict(X_test)

In [36]:
y_proba = LogisticRegression_model.predict_proba(X_test)

In [37]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      2628
           1       0.91      0.89      0.90      2605

    accuracy                           0.90      5233
   macro avg       0.90      0.90      0.90      5233
weighted avg       0.90      0.90      0.90      5233

Area bajo la curva ROC: 0.9665
Area bajo la curva Precision-Recall: 0.9678
