In [None]:
!pip install --pre pycaret

# 1. Acoustic Extinguisher Fire Dataset

In [None]:
# Importamos las librerías necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import average_precision_score, accuracy_score,plot_confusion_matrix,roc_auc_score, classification_report, confusion_matrix, precision_recall_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import pickle

# Notebook-level seed; passed to PyCaret's setup() as session_id further down.
RANDOM_STATE = 30

In [None]:
# Load the dataset from the Excel workbook
df = pd.read_excel('Acoustic_Extinguisher_Fire_Dataset.xlsx')

In [None]:
# Display the first 5 records
df.head()

Unnamed: 0,SIZE,FUEL,DISTANCE,DESIBEL,AIRFLOW,FREQUENCY,STATUS
0,1,gasoline,10,96,0.0,75,0
1,1,gasoline,10,96,0.0,72,1
2,1,gasoline,10,96,2.6,70,1
3,1,gasoline,10,96,3.2,68,1
4,1,gasoline,10,109,4.5,67,1


### Pre-procesamiento de los datos

#### 1. Renombrar columnas

In [None]:
# Mapping from the workbook's upper-case headers to lower-case column names.
columns = {
    'SIZE': 'size',
    'FUEL': 'fuel',
    'DISTANCE': 'distance',
    'DESIBEL': 'desibel',
    'AIRFLOW': 'airflow',
    'FREQUENCY': 'frequency',
    'STATUS': 'status',
}

# Rebind the renamed frame instead of mutating it with inplace=True:
# in-place mutation brings no benefit and prevents method chaining.
df = df.rename(columns=columns)

#### 2. División de las variables predictoras (X) y variable objetivo (y)

In [None]:
# Feature matrix (X) and target vector (y)
X = df[['size', 'fuel', 'distance', 'desibel', 'airflow', 'frequency']]
# Select the target with single brackets so y is a 1-D Series. A one-column
# DataFrame makes scikit-learn estimators emit DataConversionWarning
# ("A column-vector y was passed when a 1d array was expected") during fit.
y = df['status']

#### 3. Transformación de datos y pipeline de pre-procesamiento

In [None]:
# Columns routed to each branch of the preprocessor
categorical_features = ['fuel']
numeric_features = ['size', 'distance', 'desibel', 'airflow', 'frequency']

# Categorical branch: one-hot encoding, configured with handle_unknown='ignore'
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Numeric branch: median imputation followed by standard scaling
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Column-wise combination of both branches ('num' first, then 'cat')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

#### 4. Separación en datos de entrenamiento y testeo

In [None]:
# Stratified 70/30 hold-out split.
# Note: the seed is the literal 42, not the notebook-level RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Print the class counts of the training and test targets
for heading, labels in (
    ('Composición del training set:', y_train),
    ('\nComposición del test set:', y_test),
):
    print(heading)
    print(labels.value_counts())

Composición del training set:
status
0         6131
1         6078
dtype: int64

Composición del test set:
status
0         2628
1         2605
dtype: int64


### Preparación de la experimentación

#### 1. Definición del modelo

In [None]:
# Decision-tree pipeline: preprocessing followed by the classifier.
# random_state is pinned to the notebook seed so the fitted tree and the
# metrics reported below are reproducible across runs (scikit-learn permutes
# features at each split even with the default "best" splitter).
decision_tree_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(criterion='gini', max_depth=30,
                                          random_state=RANDOM_STATE)),
])

#### 2. Definición de las métricas

In [None]:
def metric_report(y_test, y_pred, y_proba):
    """Print evaluation metrics for a classifier's predictions.

    Prints, in order: the classification report, the area under the ROC
    curve and the area under the precision-recall curve (both rounded to
    4 decimals).

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    y_proba : 2-D array of scores indexed as ``y_proba[:, 1]``; column 1 is
        used as the score (presumably the positive-class probability from
        ``predict_proba`` -- confirm against the caller).

    Returns
    -------
    None
    """
    positive_scores = y_proba[:, 1]

    print(classification_report(y_test, y_pred))

    roc_area = roc_auc_score(y_test, positive_scores)
    print('Area bajo la curva ROC:', np.round(roc_area, 4))

    # Only precision and recall are needed; the thresholds are discarded.
    precision, recall, _ = precision_recall_curve(y_test, positive_scores)
    pr_area = auc(recall, precision)
    print('Area bajo la curva Precision-Recall:', np.round(pr_area, 4))

#### 3. Entrenamiento del modelo

In [None]:
# Fit the preprocessing + decision-tree pipeline on the training split
decision_tree_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

#### 4. Análisis del resultado obtenido

In [None]:
# Class predictions of the decision-tree pipeline on the test split
y_pred = decision_tree_model.predict(X_test)

In [None]:
# Probability estimates on the test split (metric_report reads column 1)
y_proba = decision_tree_model.predict_proba(X_test)

In [None]:
# Print classification report, ROC AUC and PR AUC for the decision-tree pipeline
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2628
           1       0.96      0.95      0.96      2605

    accuracy                           0.96      5233
   macro avg       0.96      0.96      0.96      5233
weighted avg       0.96      0.96      0.96      5233

Area bajo la curva ROC: 0.957
Area bajo la curva Precision-Recall: 0.969


In [None]:
# Random-forest pipeline: preprocessing followed by a RandomForestClassifier.
# The classifier step must be RandomForestClassifier; instantiating
# DecisionTreeClassifier here would only retrain the single-tree model under
# a different name. random_state is pinned to the notebook seed because the
# forest's bootstrap sampling and feature sub-sampling are random.
RandomForestClassifier_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(criterion='gini', max_depth=30,
                                          random_state=RANDOM_STATE)),
])

In [None]:
# Fit the pipeline stored in RandomForestClassifier_model on the training split
RandomForestClassifier_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [None]:
# Class predictions on the test split (overwrites the previous y_pred)
y_pred = RandomForestClassifier_model.predict(X_test)

In [None]:
# Probability estimates on the test split (overwrites the previous y_proba)
y_proba = RandomForestClassifier_model.predict_proba(X_test)

In [None]:
# Print the metrics for the pipeline stored in RandomForestClassifier_model
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2628
           1       0.96      0.95      0.96      2605

    accuracy                           0.96      5233
   macro avg       0.96      0.96      0.96      5233
weighted avg       0.96      0.96      0.96      5233

Area bajo la curva ROC: 0.9581
Area bajo la curva Precision-Recall: 0.9695


In [None]:
# K-nearest-neighbours pipeline; the classifier is built with its default arguments
KNeighborsClassifier_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier()),
    ]
)


In [None]:
# Fit the preprocessing + KNN pipeline on the training split
KNeighborsClassifier_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [None]:
# Class predictions on the test split (overwrites the previous y_pred)
y_pred = KNeighborsClassifier_model.predict(X_test)

In [None]:
# Probability estimates on the test split (overwrites the previous y_proba)
y_proba = KNeighborsClassifier_model.predict_proba(X_test)

In [None]:
# Print classification report, ROC AUC and PR AUC for the KNN pipeline
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2628
           1       0.96      0.95      0.96      2605

    accuracy                           0.96      5233
   macro avg       0.96      0.96      0.96      5233
weighted avg       0.96      0.96      0.96      5233

Area bajo la curva ROC: 0.9917
Area bajo la curva Precision-Recall: 0.9928


In [None]:
# Logistic-regression pipeline; the classifier is built with its default arguments
LogisticRegression_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)



In [None]:
# Fit the preprocessing + logistic-regression pipeline on the training split
LogisticRegression_model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [None]:
# Class predictions on the test split (overwrites the previous y_pred)
y_pred = LogisticRegression_model.predict(X_test)

In [None]:
# Probability estimates on the test split (overwrites the previous y_proba)
y_proba = LogisticRegression_model.predict_proba(X_test)

In [None]:
# Print classification report, ROC AUC and PR AUC for the logistic-regression pipeline
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      2628
           1       0.91      0.89      0.90      2605

    accuracy                           0.90      5233
   macro avg       0.90      0.90      0.90      5233
weighted avg       0.90      0.90      0.90      5233

Area bajo la curva ROC: 0.9665
Area bajo la curva Precision-Recall: 0.9678


In [None]:
from pycaret.classification import (
    add_metric,
    setup,
    compare_models,
    models,
)

In [None]:
# Initialise the PyCaret classification experiment on the full (renamed) frame.
# PyCaret performs its own split and preprocessing here, independently of the
# scikit-learn pipelines built earlier in the notebook.
clf1 = setup(
    df,
    target="status",
    train_size=0.7,  # same 70/30 ratio as the manual train_test_split
    # preprocess = False,
    normalize=True,
    normalize_method="zscore",
    pca=True,
    pca_components=0.9,
    # # remove_outliers = True,
    # # outliters_threshold = 0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    # NOTE(review): the manual split printed 6131 vs 6078 training samples per
    # class, i.e. nearly balanced -- confirm that resampling is really wanted.
    fix_imbalance=True,  # SMOTE method
    data_split_shuffle=True,
    data_split_stratify=True,
    fold_strategy="stratifiedkfold",
    fold=5,
    session_id=RANDOM_STATE,  # seed shared with the rest of the notebook
    ## silent=False,
    log_experiment=False,
    experiment_name="Fire-extinguishing",
    use_gpu=False,
)

Unnamed: 0,Description,Value
0,Session id,30
1,Target,status
2,Target type,Binary
3,Original data shape,"(17442, 7)"
4,Transformed data shape,"(17495, 6)"
5,Transformed train set shape,"(12262, 6)"
6,Transformed test set shape,"(5233, 6)"
7,Numeric features,5
8,Categorical features,1
9,Preprocess,True


In [None]:
# List the estimators PyCaret offers (ID, name, reference class, turbo flag)
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [None]:
# Register average precision as an extra comparison metric.
# Average precision summarises the precision-recall curve, so it must be
# computed from predicted probabilities. With the default target ("pred") it
# would be scored on hard class labels and understate the models.
add_metric(
    "averagePrecision",
    "Average Precision",
    average_precision_score,
    target="pred_proba",
)

Name                                                 Average Precision
Display Name                                         Average Precision
Score Function       <function average_precision_score at 0x7f2469d...
Scorer                            make_scorer(average_precision_score)
Target                                                            pred
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: averagePrecision, dtype: object

In [None]:
# Cross-validate the candidate estimators with 5 folds; the result is bound
# to best_model.
# PyCaret model IDs:
#   lr     - Logistic Regression
#   knn    - K-Nearest Neighbors
#   dt     - Decision Tree Classifier
#   svm    - SVM, linear kernel
#   rbfsvm - SVM, radial kernel
#   rf     - Random Forest Classifier
# (A custom RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE)
# instance was considered as an extra candidate but is not included.)
best_model = compare_models(
    include=["lr", "knn", "dt", "svm", "rbfsvm", "rf"],
    cross_validation=True,
    fold=5,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Average Precision,TT (Sec)
rf,Random Forest Classifier,0.9485,0.99,0.9431,0.953,0.948,0.897,0.8971,0.9271,1.576
knn,K Neighbors Classifier,0.9441,0.9821,0.939,0.9484,0.9436,0.8883,0.8884,0.9209,0.328
dt,Decision Tree Classifier,0.9314,0.9314,0.9297,0.9324,0.931,0.8629,0.8629,0.9018,0.21
rbfsvm,SVM - Radial Kernel,0.9286,0.9841,0.9246,0.9315,0.928,0.8571,0.8573,0.8988,6.622
lr,Logistic Regression,0.8719,0.9523,0.858,0.8816,0.8696,0.7438,0.7441,0.8271,0.964
svm,SVM - Linear Kernel,0.8667,0.0,0.8636,0.8684,0.8657,0.7333,0.7337,0.8176,0.148


Processing:   0%|          | 0/29 [00:00<?, ?it/s]

### Vuelvo a definir la transformación de datos y el pipeline de pre-procesamiento

In [None]:
# Rebuild a fresh, unfitted preprocessor with the same configuration as the
# one defined earlier in the notebook.
numeric_features = ['size', 'distance', 'desibel', 'airflow', 'frequency']
categorical_features = ['fuel']

# Numeric columns: median imputation, then standard scaling
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ],
)

# Categorical column: one-hot encoding with handle_unknown='ignore'
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)

In [None]:
# Recreate the stratified 70/30 hold-out split (literal seed 42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Print the class counts of each partition
for heading, labels in (
    ("Composición del training set:", y_train),
    ("\nComposición del test set:", y_test),
):
    print(heading)
    print(labels.value_counts())

Composición del training set:
status
0         6131
1         6078
dtype: int64

Composición del test set:
status
0         2628
1         2605
dtype: int64


In [None]:
## scaler = StandardScaler()
## X_train_sc = scaler.fit_transform(X_train)
## X_test_sc = scaler.transform(X_test)

In [None]:
#classifiers = [
#    [RandomForestClassifier(), "Random Forest"],
#    [KNeighborsClassifier(), "K-Nearest Neighbours"],
#    [DecisionTreeClassifier(), "Decision Tree Classifier"],
#    [LogisticRegression(), "Logistic Regression"],
#]

#for models in classifiers:
#    model = models[0]
#    model.fit(X_train_sc, y_train)

#    y_pred = model.predict(X_test_sc)
#    y_proba = model.predict_proba(X_test_sc)

#    result_modelos_clasif(models, X_test_sc, y_test, y_pred, y_proba)