In [1]:
!pip install --pre pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Acoustic Extinguisher Fire Dataset

In [3]:
# Importamos las librerías necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import average_precision_score, accuracy_score,plot_confusion_matrix,roc_auc_score, classification_report, confusion_matrix, precision_recall_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import pickle

RANDOM_STATE = 30

In [4]:
# Levantamos el dataset
df = pd.read_excel('/content/drive/MyDrive/Aprendizaje de máquina 1/TP Integrador/data/Acoustic_Extinguisher_Fire_Dataset.xlsx')

In [5]:
# Visualizamos los primeros 5 registros
df.head()

Unnamed: 0,SIZE,FUEL,DISTANCE,DESIBEL,AIRFLOW,FREQUENCY,STATUS
0,1,gasoline,10,96,0.0,75,0
1,1,gasoline,10,96,0.0,72,1
2,1,gasoline,10,96,2.6,70,1
3,1,gasoline,10,96,3.2,68,1
4,1,gasoline,10,109,4.5,67,1


### Pre-procesamiento de los datos

#### 1. Renombrar columnas

In [6]:
columns = {
    'SIZE': 'size',
    'FUEL': 'fuel',
    'DISTANCE': 'distance',
    'DESIBEL': 'desibel',
    'AIRFLOW': 'airflow',
    'FREQUENCY': 'frequency',
    'STATUS': 'status',
}

df.rename(columns=columns, inplace=True)

#### 2. División de las variables predictoras (X) y variable objetivo (y)

In [7]:
# Definición de X e Y
X = df[['size', 'fuel', 'distance', 'desibel', 'airflow', 'frequency']]
y = df[['status']]

#### 3. Transformación de datos y pipeline de pre-procesamiento

In [8]:
numeric_features = ['size', 'distance', 'desibel', 'airflow', 'frequency']
categorical_features = ['fuel']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

#### 4. Separación en datos de entrenamiento y testeo

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [10]:
print('Composición del training set:')
print(y_train.value_counts())

print('\nComposición del test set:')
print(y_test.value_counts())

Composición del training set:
status
0         6131
1         6078
dtype: int64

Composición del test set:
status
0         2628
1         2605
dtype: int64


### Preparación de la experimentación

#### 1. Definición de los modelos

In [21]:
logistic_regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', LogisticRegression ())])


random_forest_model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', RandomForestClassifier(criterion='gini', max_depth=30))])

knn_model = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', KNeighborsClassifier())])

#### 2. Definición de las métricas

In [12]:
def metric_report(y_test, y_pred, y_proba):  
    print(classification_report(y_test, y_pred))  
    print('Area bajo la curva ROC:',np.round(roc_auc_score(y_test, y_proba[:,1]), 4)) 
    precision, recall,threshold=precision_recall_curve(y_test, y_proba[:,1]);
    print('Area bajo la curva Precision-Recall:',np.round(auc(recall, precision), 4))

### Entrenamiento de y evaluación de los modelos

#### 1. Logistic Regression

In [13]:
logistic_regression_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['size', 'distance',
                                                   'desibel', 'airflow',
                                                   'frequency']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['fuel'])])),
                ('classifier', LogisticRegression())])

In [14]:
y_pred = logistic_regression_model.predict(X_test)

In [15]:
y_proba = logistic_regression_model.predict_proba(X_test)

In [16]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      2628
           1       0.91      0.89      0.90      2605

    accuracy                           0.90      5233
   macro avg       0.90      0.90      0.90      5233
weighted avg       0.90      0.90      0.90      5233

Area bajo la curva ROC: 0.9665
Area bajo la curva Precision-Recall: 0.9678


#### 2. Random Forest

In [22]:
random_forest_model.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['size', 'distance',
                                                   'desibel', 'airflow',
                                                   'frequency']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['fuel'])])),
                ('classifier', RandomForestClassifier(max_depth=30))])

In [23]:
y_pred = random_forest_model.predict(X_test)

In [24]:
y_proba = random_forest_model.predict_proba(X_test)

In [25]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2628
           1       0.97      0.96      0.97      2605

    accuracy                           0.97      5233
   macro avg       0.97      0.97      0.97      5233
weighted avg       0.97      0.97      0.97      5233

Area bajo la curva ROC: 0.9957
Area bajo la curva Precision-Recall: 0.9959


#### 3. KNN

In [26]:
knn_model.fit(X_train, y_train)

  return self._fit(X, y)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['size', 'distance',
                                                   'desibel', 'airflow',
                                                   'frequency']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['fuel'])])),
                ('classifier', KNeighborsClassifier())])

In [27]:
y_pred = knn_model.predict(X_test)

In [28]:
y_proba = knn_model.predict_proba(X_test)

In [29]:
metric_report(y_test, y_pred, y_proba)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      2628
           1       0.96      0.95      0.96      2605

    accuracy                           0.96      5233
   macro avg       0.96      0.96      0.96      5233
weighted avg       0.96      0.96      0.96      5233

Area bajo la curva ROC: 0.9917
Area bajo la curva Precision-Recall: 0.9928


## AutoML: PyCaret

In [54]:
from pycaret.classification import (
    add_metric,
    setup,
    compare_models,
    models,
    create_model,
    evaluate_model,
    predict_model,
    save_model,
)

In [31]:
clf1 = setup(
    df,
    target="status",
    train_size=0.7,
    # preprocess = False,
    normalize=True,
    normalize_method="zscore",
    pca=True,
    pca_components=0.9,
    # # remove_outliers = True,
    # # outliters_threshold = 0.05,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    fix_imbalance=True,  # SMOTE method
    data_split_shuffle=True,
    data_split_stratify=True,
    fold_strategy="stratifiedkfold",
    fold=5,
    session_id=RANDOM_STATE,
    ## silent=False,
    log_experiment=False,
    experiment_name="Fire-extinguishing",
    use_gpu=False,
)

Unnamed: 0,Description,Value
0,Session id,30
1,Target,status
2,Target type,Binary
3,Original data shape,"(17442, 7)"
4,Transformed data shape,"(17495, 6)"
5,Transformed train set shape,"(12262, 6)"
6,Transformed test set shape,"(5233, 6)"
7,Numeric features,5
8,Categorical features,1
9,Preprocess,True


In [32]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [33]:
add_metric("averagePrecision", "Average Precision", average_precision_score)

Name                                                 Average Precision
Display Name                                         Average Precision
Score Function       <function average_precision_score at 0x7f45c29...
Scorer                            make_scorer(average_precision_score)
Target                                                            pred
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: averagePrecision, dtype: object

In [34]:
best_model = compare_models(
    include=[
        "lr",  # Logistic Regression
        "knn",  # K-Nearest Neighbors
        "dt",  # Decision Tree Classifier
        "svm",  # SVM - Linear Kernel
        "rbfsvm",  # SVM - Radial Kernel
        "rf",  # Random Forest Classifier
        # RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE),
    ],
    fold=5,
    cross_validation=True,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Average Precision,TT (Sec)
rf,Random Forest Classifier,0.9486,0.9901,0.9436,0.9527,0.9481,0.8971,0.8972,0.927,1.514
knn,K Neighbors Classifier,0.9445,0.9821,0.9391,0.9489,0.9439,0.8889,0.8891,0.9214,0.522
dt,Decision Tree Classifier,0.9295,0.9295,0.9314,0.9273,0.9293,0.859,0.859,0.8978,0.342
rbfsvm,SVM - Radial Kernel,0.9292,0.9841,0.926,0.9316,0.9287,0.8585,0.8586,0.8994,6.044
lr,Logistic Regression,0.8718,0.9523,0.8578,0.8816,0.8695,0.7436,0.7439,0.827,1.32
svm,SVM - Linear Kernel,0.8676,0.0,0.8506,0.8803,0.8647,0.7352,0.7364,0.8228,0.186


Processing:   0%|          | 0/29 [00:00<?, ?it/s]

In [35]:
print(best_model)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=30, verbose=0, warm_start=False)


### Creamos el modelo

In [40]:
rf = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Average Precision
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.9525,0.9911,0.9482,0.9561,0.9521,0.905,0.905,0.9323
1,0.9476,0.988,0.9515,0.9437,0.9476,0.8952,0.8952,0.9221
2,0.9427,0.9894,0.9301,0.9536,0.9417,0.8853,0.8856,0.9218
3,0.9505,0.9913,0.949,0.9513,0.9501,0.9009,0.9009,0.9282
4,0.9496,0.9906,0.9391,0.9588,0.9489,0.8992,0.8994,0.9307
Mean,0.9486,0.9901,0.9436,0.9527,0.9481,0.8971,0.8972,0.927
Std,0.0033,0.0012,0.0079,0.0051,0.0035,0.0067,0.0066,0.0044


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [41]:
print(rf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=30, verbose=0, warm_start=False)


### Evaluamos el modelo

In [47]:
evaluate_model(rf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [48]:
predict_model(rf);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Average Precision
0,Random Forest Classifier,0.9478,0.9916,0.9432,0.9516,0.9474,0.8957,0.8957,0.9258


### Predicciones sobre el conjunto de test

In [51]:
test_df = [X_test, y_test]
test_df = pd.concat(test_df, axis=1)

In [52]:
unseen_predictions = predict_model(rf, data=test_df)
unseen_predictions.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Average Precision
0,Random Forest Classifier,0.9838,0.9982,0.9804,0.9869,0.9836,0.9675,0.9675,0.9773


Unnamed: 0,feature 3,feature 4,feature 5,feature 6,feature 7,status,Label,Score
2682,0.22148,-0.401519,0.366076,-0.062317,1.783611,1,1,0.98
1848,0.739539,-0.797999,1.878424,-0.119104,1.445883,0,0,1.0
13928,0.944001,-0.041481,-1.571161,1.645334,-0.490631,0,0,0.99
1391,0.187399,-0.760894,-1.18653,-0.001355,2.129169,1,1,1.0
5515,-0.171728,-1.251,0.617607,-1.548397,-1.092791,0,1,0.98


In [53]:
from pycaret.utils import check_metric
check_metric(unseen_predictions['status'], unseen_predictions['Label'], metric = 'Accuracy')

0.9838

### Guardamos el modelo

In [56]:
save_model(rf,'/content/drive/MyDrive/Aprendizaje de máquina 1/TP Integrador/models/pycaret_rf')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=/tmp/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None, include=['size', 'distance', 'desibel', 'airflow', 'frequency'], transformer=SimpleImputer(add_indicator=False, copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=['...
                  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                         class_weight=None, criterion='gini',
                                         max_depth=None, max_features='auto',
                                         max_leaf_nodes=None, max_samples=None,
                                         min_impurity_decrease=0.0,
                                         min_samples_leaf=1, min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                     

### Conclusiones

A partir de la experimentación en el presente notebook, notamos que el algoritmo que arroja las métricas más altas es el Random Forest. Pudimos comprobar esto experimentando manualmente con los modelos vistos en clase y además utilizando la herramienta de AutoML Pycaret. 

Cómo proximos pasos, intentaremos incrementar la performance del modelo de Random Forest, experimentando con diferentes hiper parámetros.