Imports

In [41]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import os

Make sure we are in the right directory

In [42]:
# os.chdir('Documents/Deep-Learning-Chronicles/air_quality_prediction/')
# os.getcwd()

Load the data

In [43]:
df = pd.read_csv("smoke_occurence_nairobi.csv")
df.head()

Unnamed: 0,id,time_opening_windows,time_closing_windows,smoke_detected,time_sensing_smoke,duration,date,day,occassion,weather,type_of_smoke
0,1,1641,1717,,1717.0,,2025-10-03,Friday,nothing,cloudywithoutwind,stove
1,2,1252,1528,,1728.0,,2025-10-04,Saturday,,cloudywithoutwind,stove
2,3,1452,1734,,1734.0,,2025-10-05,Sunday,,windy,
3,4,1035,1759,,1759.0,,2025-10-06,,,windy,stove
4,5,1400,1813,True,1813.0,,2025-10-07,Tuesday,,,


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    20 non-null     int64  
 1   time_opening_windows  20 non-null     int64  
 2   time_closing_windows  20 non-null     int64  
 3   smoke_detected        14 non-null     object 
 4   time_sensing_smoke    8 non-null      float64
 5   duration              12 non-null     float64
 6   date                  20 non-null     object 
 7   day                   19 non-null     object 
 8   occassion             12 non-null     object 
 9   weather               16 non-null     object 
 10  type_of_smoke         6 non-null      object 
dtypes: float64(2), int64(3), object(6)
memory usage: 1.8+ KB


In [51]:
df.describe()

Unnamed: 0,id,time_opening_windows,time_closing_windows,smoke_detected,time_sensing_smoke,duration,window_start_min,window_end_min,window_length_min
count,20.0,20.0,20.0,20.0,8.0,12.0,20.0,20.0,20.0
mean,10.5,1213.75,1625.5,0.1,1652.875,0.833333,739.75,985.5,245.75
std,5.91608,175.860553,137.070669,0.307794,196.247975,2.886751,106.304367,82.64668,114.360867
min,1.0,934.0,1224.0,0.0,1224.0,0.0,574.0,744.0,25.0
25%,5.75,1089.75,1527.0,0.0,1662.75,0.0,659.75,927.0,180.0
50%,10.5,1179.5,1625.5,0.0,1731.0,0.0,719.5,985.5,263.0
75%,15.25,1318.25,1730.25,0.0,1750.75,0.0,798.25,1050.25,313.0
max,20.0,1641.0,1813.0,1.0,1813.0,10.0,1001.0,1093.0,444.0


Basic cleaning

In [45]:
df["smoke_detected"] = (
    df["smoke_detected"]
    .fillna(False)
    .astype(str)
    .str.lower()
    .map({"true": 1, "false": 0})
)

df["smoke_detected"].value_counts()

  .fillna(False)


smoke_detected
0    18
1     2
Name: count, dtype: int64

In [46]:
def time_to_minutes(t):
    if pd.isna(t) or t == "":
        return np.nan
    t = str(t).zfill(4)
    final = int(t[:2]) * 60 + int(t[2:])
    return final

df["window_start_min"] = df["time_opening_windows"].apply(time_to_minutes)
df["window_end_min"] = df["time_closing_windows"].apply(time_to_minutes)
df["window_length_min"] = df["window_end_min"] - df["window_start_min"]

df[["window_start_min", "window_end_min", "window_length_min"]].head()


Unnamed: 0,window_start_min,window_end_min,window_length_min
0,1001,1037,36
1,772,928,156
2,892,1054,162
3,635,1079,444
4,840,1093,253


Clean the categorical columns

In [47]:
categorical_cols = ["day", "weather", "occassion"]

for col in categorical_cols:
    df[col] = df[col].fillna("unknown").str.lower()


In [50]:
df[['smoke_detected']].describe()

Unnamed: 0,smoke_detected
count,20.0
mean,0.1
std,0.307794
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


#### Feature Selection
For now we do not need these fields: **id, date, time_sensing_smoke, duration, type_of_smoke**

According to my research, the above fields are too descriptive for a classification problem (version 1).

But we could surely use these fields as inputs:
- window_start_min
- window_end_min
- window_length_min
- day
- weather
- occassion

Target would be:
- smoke_detected

In [52]:
# Training pipeline

feature_cols = [
    "window_start_min",
    "window_end_min",
    "window_length_min",
    "day",
    "weather",
    "occassion"
]

X = df[feature_cols]
y = df["smoke_detected"]


In [53]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=41,
    stratify=y
)


In [54]:
# preprocessing and model pipeline 

numeric_features = [
    "window_start_min",
    "window_end_min",
    "window_length_min"
]

categorical_features = [
    "day",
    "weather",
    "occassion"
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

model = LogisticRegression(
    class_weight="balanced",
    max_iter=1000
)

pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])


In [57]:
# train the model
pipeline.fit(X_train, y_train)

In [58]:
# evaluate the model
y_pred = pipeline.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[3 2]
 [1 0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.00      0.00      0.00         1

    accuracy                           0.50         6
   macro avg       0.38      0.30      0.33         6
weighted avg       0.62      0.50      0.56         6



Findings:  
- with the current data, the model cannot confidently generalize smoke patterns yet.
(this is due to scarcity of the data, but we can fix that with time)

- Initial model performance is constrained by limited positive samples, motivating continued data collection and iterative retraining.

### What can we do next?

We can try switching from ```pipeline.predict()``` to ```pipeline.predict_proba()```

Because the former forces a **binary decision** while the latter allows:
- early warning thresholds 
- risk tuning 
- fewer missed events later 


In [69]:
X_test

Unnamed: 0,window_start_min,window_end_min,window_length_min,day,weather,occassion
13,668,957,289,monday,rainy,unknown
18,619,990,371,saturday,sunny,holiday
0,1001,1037,36,friday,cloudywithoutwind,nothing
2,892,1054,162,sunday,windy,unknown
3,635,1079,444,unknown,windy,unknown
4,840,1093,253,tuesday,unknown,unknown


In [60]:
y_proba = pipeline.predict_proba(X_test)[:, 1]

for prob, actual in zip(y_proba, y_test):
    print(f"Predicted smoke probability: {prob:.2f} | Actual: {actual}")


Predicted smoke probability: 0.00 | Actual: 0
Predicted smoke probability: 0.00 | Actual: 0
Predicted smoke probability: 1.00 | Actual: 0
Predicted smoke probability: 1.00 | Actual: 0
Predicted smoke probability: 0.00 | Actual: 0
Predicted smoke probability: 0.00 | Actual: 1


In [61]:
threshold = 0.25
y_pred_custom = (y_proba >= threshold).astype(int)

print(confusion_matrix(y_test, y_pred_custom))
print(classification_report(y_test, y_pred_custom))


[[3 2]
 [1 0]]
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.00      0.00      0.00         1

    accuracy                           0.50         6
   macro avg       0.38      0.30      0.33         6
weighted avg       0.62      0.50      0.56         6



### **Conclusion**

The system is designed to prefer false alarms, but current model confidence is limited by sparse positive data (limited positive samples).

As additional data is collected, the model’s confidence and recall are expected to improve.

## **Turning this to a deployable artifact**

For now anything the web app needs must live inside ONE pipeline object. 

Good thing, that's the case ```Pipeline(preprocess -> model)``` 

First, we save and reload the trained pipeline 

In [62]:
import joblib

joblib.dump(pipeline, "smoke_prediction_pipeline_v1.joblib")


['smoke_prediction_pipeline_v1.joblib']

In [63]:
loaded_pipeline = joblib.load("smoke_prediction_pipeline_v1.joblib")

# quick test
sample = X.iloc[[0]]
loaded_pipeline.predict(sample), loaded_pipeline.predict_proba(sample)


(array([1]), array([[0., 1.]]))

Backend will be receiving something like: 
```
{
  "window_start_min": 660,
  "window_end_min": 1080,
  "window_length_min": 420,
  "day": "tuesday",
  "weather": "windy",
  "occassion": "holiday"
}
```

In [67]:
# prediction helper 

def predict_smoke_risk(input_dict, threshold=0.25):
    """
    input_dict: dict with feature values
    returns: probability + binary alert
    """
    df_input = pd.DataFrame([input_dict])
    
    prob = pipeline.predict_proba(df_input)[0][1]
    alert = int(prob >= threshold)
    
    return {
        "smoke_probability": round(float(prob), 3),
        "alert": alert
    }

# API-ready

In [80]:
# example usage 

example_input = {
    "window_start_min": 1020,   # 5pm
    "window_end_min": 1200,     # 8pm
    "window_length_min": 180,
    "day": "friday",
    "weather": "windy",
    "occassion": "holiday"
}

predict_smoke_risk(example_input)

# output will look like this
# {
#   "smoke_probability": 0.23,
#   "alert": 0
# }



{'smoke_probability': 1.0, 'alert': 1}

In [1]:
import sklearn
print(sklearn.__version__)

1.6.1
