# Traning Classification model

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Optional: for binary mapping and balancing
from imblearn.over_sampling import SMOTE

In [4]:

# Step 1: Read the CSV file
df = pd.read_csv('Laboratory Fire Dataset containing Multi Sensor Data.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Sensor_ID,CO2_Room,CO_Room,H2_Room,Humidity_Room,PM05_Room,PM100_Room,PM10_Room,...,CO2_Room_Trend,CO_Room_Trend,H2_Room_Trend,PM05_Room_Trend,PM10_Room_Trend,PM25_Room_Trend,PM40_Room_Trend,PM100_Room_Trend,PM_Room_Typical_Size_Trend,VOC_Room_RAW_Trend
0,0,0,sensornode0005,568.4,-0.15,0.58,42.44,46,0,7,...,-0.576667,0.933333,-0.96,-0.926667,-0.773333,0.0,0.0,0.0,-0.766667,0.726667
1,1,0,sensornode0006,637.3,-0.12,0.79,42.34,48,0,7,...,-0.876667,0.92,-0.96,-0.936667,-0.756667,0.0,0.0,0.0,0.296667,0.696667
2,2,0,sensornode0007,792.2,-0.19,0.39,42.92,52,0,8,...,0.32,0.906667,-0.826667,-0.966667,-0.826667,0.0,0.0,0.0,0.14,0.583333
3,3,0,sensornode0008,660.7,0.65,0.53,44.31,50,0,7,...,-0.036667,0.96,-0.933333,-0.936667,-0.85,0.0,0.0,0.0,-0.123333,0.896667
4,4,10,sensornode0005,592.5,-0.06,0.38,42.55,48,0,7,...,-0.576667,0.933333,-0.96,-0.926667,-0.773333,0.0,0.0,0.0,-0.766667,0.726667


In [6]:
# Step 2: Define columns to include/ignore based on analysis
# Included inputs (X): Raw sensors + trends (24 columns)
input_columns = [
    'CO2_Room', 'CO_Room', 'H2_Room', 'Humidity_Room', 'PM05_Room', 'PM100_Room',
    'PM10_Room', 'PM25_Room', 'PM40_Room', 'PM_Room_Typical_Size', 'PM_Total_Room',
    'Temperature_Room', 'VOC_Room', 'VOC_Room_RAW',
    'CO2_Room_Trend', 'CO_Room_Trend', 'H2_Room_Trend', 'PM05_Room_Trend',
    'PM10_Room_Trend', 'PM25_Room_Trend', 'PM40_Room_Trend', 'PM100_Room_Trend',
    'PM_Room_Typical_Size_Trend', 'VOC_Room_RAW_Trend'
]

# Ignored: Unnamed: 0, Date, Sensor_ID, scenario_label, number_label, UV_Room, Interval_label, class
# Target (y)
target_column = 'class'

# Step 3: Prepare X and y
X = df[input_columns]  # Select only included columns
y = df[target_column]

In [8]:
X.head()

Unnamed: 0,CO2_Room,CO_Room,H2_Room,Humidity_Room,PM05_Room,PM100_Room,PM10_Room,PM25_Room,PM40_Room,PM_Room_Typical_Size,...,CO2_Room_Trend,CO_Room_Trend,H2_Room_Trend,PM05_Room_Trend,PM10_Room_Trend,PM25_Room_Trend,PM40_Room_Trend,PM100_Room_Trend,PM_Room_Typical_Size_Trend,VOC_Room_RAW_Trend
0,568.4,-0.15,0.58,42.44,46,0,7,0,0,0.49,...,-0.576667,0.933333,-0.96,-0.926667,-0.773333,0.0,0.0,0.0,-0.766667,0.726667
1,637.3,-0.12,0.79,42.34,48,0,7,0,0,0.49,...,-0.876667,0.92,-0.96,-0.936667,-0.756667,0.0,0.0,0.0,0.296667,0.696667
2,792.2,-0.19,0.39,42.92,52,0,8,0,0,0.54,...,0.32,0.906667,-0.826667,-0.966667,-0.826667,0.0,0.0,0.0,0.14,0.583333
3,660.7,0.65,0.53,44.31,50,0,7,0,0,0.43,...,-0.036667,0.96,-0.933333,-0.936667,-0.85,0.0,0.0,0.0,-0.123333,0.896667
4,592.5,-0.06,0.38,42.55,48,0,7,0,0,0.49,...,-0.576667,0.933333,-0.96,-0.926667,-0.773333,0.0,0.0,0.0,-0.766667,0.726667


In [10]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int64

In [11]:
# Check shapes and info
print("X shape:", X.shape)  # Should be (2900, 24)
print("y value counts:\n", y.value_counts())
print("X dtypes:\n", X.dtypes)
print("X nulls:\n", X.isnull().sum())  # Should be 0

X shape: (2900, 24)
y value counts:
 class
1    1651
3     458
2     452
4     339
Name: count, dtype: int64
X dtypes:
 CO2_Room                      float64
CO_Room                       float64
H2_Room                       float64
Humidity_Room                 float64
PM05_Room                       int64
PM100_Room                      int64
PM10_Room                       int64
PM25_Room                       int64
PM40_Room                       int64
PM_Room_Typical_Size          float64
PM_Total_Room                   int64
Temperature_Room              float64
VOC_Room                      float64
VOC_Room_RAW                  float64
CO2_Room_Trend                float64
CO_Room_Trend                 float64
H2_Room_Trend                 float64
PM05_Room_Trend               float64
PM10_Room_Trend               float64
PM25_Room_Trend               float64
PM40_Room_Trend               float64
PM100_Room_Trend              float64
PM_Room_Typical_Size_Trend    float64
VOC_Ro

In [12]:
# Step 4: Split and scale (for ML training)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
X_train_scaled

array([[-0.4605345 , -0.47077613, -0.44063642, ..., -0.22123121,
        -1.31497586, -1.37230314],
       [-0.37527851, -0.40908779, -0.40774704, ..., -0.3915997 ,
        -0.3794755 ,  0.52785361],
       [-0.42616175, -0.47929886, -0.42204677, ..., -0.22123121,
         0.29315699, -1.37230314],
       ...,
       [-0.19192503, -0.55032162, -0.44635632, ..., -0.22123121,
        -0.0934134 , -0.71994518],
       [-0.35567832, -0.3900131 , -0.0717033 , ..., -0.72168867,
         1.13588046, -1.37230314],
       [-0.35365071, -0.40381181, -0.3991672 , ..., -0.3915997 ,
        -0.3794755 ,  0.52785361]])

In [24]:
X_train_scaled.shape

(2320, 24)

## Use of SMOTE
- SMOTE identifies a minority class sample and finds its \(k\)-nearest neighbors from the same minority class.
- It then creates new synthetic data points along the straight line segments connecting the original sample to its neighbors.
- This process is repeated until the number of minority class samples is closer to the number of majority class samples, resulting in a more balanced dataset
- Reduces overfitting and increases accuracy on the minority classes 

In [17]:
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

In [23]:
X_train_bal.shape

(5284, 24)

## Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')

In [27]:
rf.fit(X_train_bal, y_train_bal)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [71]:
from sklearn.metrics import accuracy_score, classification_report

y_pred_rf = rf.predict(X_test_scaled)
acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = classification_report(y_test, y_pred_rf, output_dict=True)['weighted avg']['f1-score']

## Xgboost

In [32]:
!pip install -q xgboost --break-system-packages

In [54]:
import numpy as np

np.unique(y_train_bal)

array([1, 2, 3, 4])

In [67]:
import numpy as np

import numpy as np

class XGBoostRelabeler:
    """
    Stateful relabeler for XGBoost multi-class: Forward to 0-based, backward to original.
    Usage: relabeler = XGBoostRelabeler(original_classes=[1,2,3,4])
           y_train = relabeler.forward(y_original)
           y_pred_original = relabeler.backward(model.predict(X))
    """
    def __init__(self, original_classes):
        self.original_classes = np.array(original_classes)
        self.n_classes = len(original_classes)
        if not np.all(np.diff(self.original_classes) == 1):
            raise ValueError("Classes must be sequential integers.")
    
    def forward(self, labels):
        """Relabel to 0-based for training."""
        labels = np.array(labels)
        return labels - self.original_classes[0]  # Subtract min (e.g., 1 → 0)
    
    def backward(self, labels):
        """Reconvert predictions to original scale."""
        labels = np.array(labels)
        return labels + self.original_classes[0]  # Add min (e.g., 0 → 1)
    
    def roundtrip_test(self, labels):
        """Verify: forward + backward == original."""
        fwd = self.forward(labels)
        bwd = self.backward(fwd)
        matches = np.array_equal(bwd, labels)
        print(f"Round-trip matches: {matches}")
        return matches

relabeler = XGBoostRelabeler(original_classes=[1, 2, 3, 4])
y_train_forward = relabeler.forward(y_train_bal) 
np.unique(y_train_forward)

array([0, 1, 2, 3])

In [44]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    random_state=42,
    objective='multi:softprob',  # For multi-class probabilities
    num_class=4,  # Matches your 4 classes
    eval_metric='mlogloss'  # Optional: For multi-class loss tracking
)

In [45]:
xgb_model.fit(X_train_bal, y_train_forward)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [68]:
y_pred_xgb = xgb_model.predict(X_test_scaled)
np.unique(y_pred_xgb)

array([0, 1, 2, 3])

In [69]:
preds_reconverted = relabeler.backward(y_pred_xgb)
np.unique(preds_reconverted)

array([1, 2, 3, 4])

In [72]:
acc_xgb = accuracy_score(y_test, preds_reconverted)
f1_xgb = classification_report(y_test, preds_reconverted, output_dict=True)['weighted avg']['f1-score']

##  Model Comparison

In [73]:
print(f"Random Forest - Accuracy: {acc_rf:.4f}, F1-Score: {f1_rf:.4f}")
print(f"XGBoost - Accuracy: {acc_xgb:.4f}, F1-Score: {f1_xgb:.4f}")
best_model = "XGBoost" if acc_xgb > acc_rf else "Random Forest"
print(f"\nBest Model (by Accuracy): {best_model}")

Random Forest - Accuracy: 0.9707, F1-Score: 0.9708
XGBoost - Accuracy: 0.9793, F1-Score: 0.9793

Best Model (by Accuracy): XGBoost


In [75]:
print("\nRandom Forest:\n", classification_report(y_test, y_pred_rf))
print("\nXGBoost:\n", classification_report(y_test, y_pred_xgb))


Random Forest:
               precision    recall  f1-score   support

           1       0.99      0.97      0.98       330
           2       0.94      0.99      0.96        90
           3       0.95      0.96      0.95        92
           4       0.94      0.96      0.95        68

    accuracy                           0.97       580
   macro avg       0.95      0.97      0.96       580
weighted avg       0.97      0.97      0.97       580


XGBoost:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.03      0.01      0.01       330
           2       0.00      0.00      0.00        90
           3       0.01      0.01      0.01        92
           4       0.00      0.00      0.00        68

    accuracy                           0.01       580
   macro avg       0.01      0.00      0.01       580
weighted avg       0.02      0.01      0.01       580



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Feature Importances (XGBoost)

In [76]:
importances = pd.DataFrame({'feature': input_columns, 'importance': xgb_model.feature_importances_}).sort_values('importance', ascending=False)
print(importances.head(10))

               feature  importance
21    PM100_Room_Trend    0.138774
23  VOC_Room_RAW_Trend    0.111914
17     PM05_Room_Trend    0.089069
13        VOC_Room_RAW    0.079124
15       CO_Room_Trend    0.055406
16       H2_Room_Trend    0.046823
19     PM25_Room_Trend    0.044713
5           PM100_Room    0.039026
0             CO2_Room    0.037159
6            PM10_Room    0.035455
