In [None]:
!pip install human-learn

# Machine Learning

In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [15]:
# Define the Train Dataset and Test Dataset
# train -> Pandas DataFrame
# test  -> Pandas DataFrame
# Data Attributes
# Based on the x-axis attributes try to predict whether the Room is "Occupant" or "Not Occupant"
'''
    - "date",                   -- DROP IT
    - "Temperature",            - x-axis attribute
    - "Humidity",               - x-axis attribute
    - "Light",                  - x-axis attribute
    - "CO2",                    - x-axis attribute
    - "HumidityRatio",          - x-axis attribute

    - "Occupancy"               - y-axis attribute
'''
# Load the training and test splits; the "date" column is dropped from both.
train = (
    pd.read_csv("resources/occupancy_data/datatraining.txt")
    .drop(columns=["date"])
)
test = (
    pd.read_csv("resources/occupancy_data/datatest.txt")
    .drop(columns=["date"])
)

In [None]:
# From the Train data source define the x-axis and y-axis
# x_axis = {Temperature, Humidity, Light, CO2, HumidityRatio}
# y_axis = {occupancy}  -- this is our TARGET label to predict for the dataset in x_axis with certain accuracy

# train_X (Pandas DF)--> Datasource: Train; x-axis data set        --> n-dimension
# train_Y (Pandas DF)--> Datasource: Train; y-axis data set        --> 1-d // label to predict
# Name of the label column to predict.
target = "Occupancy"

# Features are every column except the label; the label is kept as its own Series.
train_X = train.drop(columns=target)
train_Y = train[target]

test_X = test.drop(columns=target)
test_Y = test[target]


'''
(a) We will train our ML model with the train_X and train_Y dataset
(b) Then we will ask the Model to predict the Label for test_X dataset
(c) Finally, we will compare the predicted Label with the test_X's original label: test_Y
'''

In [25]:
# Quick sanity checks on the split.
# NOTE: a notebook cell only renders the value of its LAST expression, so of the
# bare expressions below only test_X.head(10) is shown; the others are
# evaluated and their results discarded.
type(train_X)
type(train_Y)

# First 10 rows of the training features and of the training labels
# (not displayed, see the note above).
train_X.head(10)
train_Y.head(10)

# First 10 rows of the test features -- this is the cell's visible output.
test_X.head(10)
#test_Y.head(10)

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
140,23.7,26.272,585.2,749.2,0.004764
141,23.718,26.29,578.4,760.4,0.004773
142,23.73,26.23,572.666667,769.666667,0.004765
143,23.7225,26.125,493.75,774.75,0.004744
144,23.754,26.2,488.6,779.0,0.004767
145,23.76,26.26,568.666667,790.0,0.004779
146,23.73,26.29,536.333333,798.0,0.004776
147,23.754,26.29,509.0,797.0,0.004783
148,23.754,26.35,476.0,803.2,0.004794
149,23.736,26.39,510.0,809.0,0.004796


In [74]:
# Using Machine Learning Model - RandomForestClassifier
# Random Forest Classifier = Collection of Decision Trees --> CART (Classification and Regression Tree) Model
# Its not a Rule Based ML solution
# Instead we will ask the Model to generate the Rule by itself with the given X-dataset and Y-dataset


# Where to Split the Tree?
# Why is the random_state parameter used?
# It seeds the estimator's random number generation, which makes training reproducible.
# Building the trees is a randomized, heuristic process, so the tree structures (and therefore the results) can vary from run to run. Passing a specific seed to <random_state> ensures the same result is generated each time you build the model.

# Split quality --> Gini impurity: how mixed the class labels are among the samples in a node


# Build the classifier with a fixed seed so repeated runs produce the same model.
forest_model = RandomForestClassifier(random_state=1)

# Train on the training features (train_X) and labels (train_Y).
forest_model.fit(train_X, train_Y)

# Predict the labels of the test features (test_X).
machine_predicted_Y = forest_model.predict(test_X)

# Print precision / recall / f1-score of the predictions against the true labels (test_Y).
print(classification_report(y_true=test_Y, y_pred=machine_predicted_Y))

# Check the f1-score - the harmonic mean of precision and recall, a single-number summary of prediction quality
'''
f1-score = 2*[(precision * recall)/(precision+recall)]

'''

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1693
           1       0.95      0.92      0.93       972

    accuracy                           0.95      2665
   macro avg       0.95      0.95      0.95      2665
weighted avg       0.95      0.95      0.95      2665



'\nf1-score = 2*[(precision * recall)/(precision+recall)]\n\n'

# Rule-Based Model
Rule 01: If Light > Threshold, then the room is occupied; otherwise it is not.

In [29]:
import plotly.express as px
import plotly.graph_objects as go

## Hypothesis: Rooms with low light have a lower probability of being occupied

In [38]:
# Task-01: BoxPlot the Training Dataset to see their Light Median
# We will be using Plotly Low Code
# boxplot using the Training dataset
# x-axis-> Occupancy  y-axis-> Light
# Column whose distribution is compared across the two occupancy classes.
feature = "Light"

fig = px.box(
    data_frame=train,
    x=target,   # one box per value of the label column
    y=feature,  # spread of the chosen feature within each class
)
fig.show()

# From the plot we can see a significant difference in the Light median between occupied rooms and empty rooms.
# The empty rooms have a Light median of almost zero,
# while the occupied rooms have a Light median of around 400-600.
# The median is the middle value of the sorted data; for an even number n of values it is the simple average of the (n/2)-th and the (n/2 + 1)-th terms.

In [76]:
import numpy as np
from hulearn.classification import  FunctionClassifier
# Define the Human Generated Rules ;
# i.e. I set the rule from my own knowledge base

# Rule 01: Light > Threshold, Room --> Occupant; Else Not
# Create the function
'''
(a) data --> of type 'Pandas DataFrame'
(b) Col --> Light
(c) Threshold --> 100
'''
def rule_light_gt_threshold(data: pd.DataFrame, col: str, threshold: float):
    """Label each row 1 if its value in ``col`` is strictly above ``threshold``, else 0.

    Args:
        data: Frame containing the column to test.
        col: Name of the column compared against the threshold.
        threshold: Cut-off value; values equal to it are labelled 0.

    Returns:
        A NumPy integer array with one 0/1 entry per row of ``data``.
    """
    exceeds = data[col] > threshold
    return np.asarray(exceeds).astype(int)
# Starting cut-off for the Light rule.
threshold = 100

# Wrap the hand-written rule in human-learn's FunctionClassifier so it can be
# used like a classifier; no ML estimator is involved here.
rule_01_model = FunctionClassifier(
    rule_light_gt_threshold,
    col=feature,
    threshold=threshold,
)

# Fit on the training features and labels.
# NOTE(review): the rule's threshold is fixed, so fit() presumably learns nothing
# from the data -- confirm against the human-learn documentation.
rule_01_model.fit(train_X, train_Y)

# Apply the rule to the test features; the result is a NumPy array of labels.
human_predicted_Y = rule_01_model.predict(test_X)

# Print precision / recall / f1-score of the rule's predictions against the true labels (test_Y).
print(classification_report(y_true=test_Y, y_pred=human_predicted_Y))

'''
see f1-score increased to 98%, means the my_rule based classier is predicting better than RansomForestClassier
'''



              precision    recall  f1-score   support

           0       1.00      0.95      0.98      1693
           1       0.93      1.00      0.96       972

    accuracy                           0.97      2665
   macro avg       0.96      0.98      0.97      2665
weighted avg       0.97      0.97      0.97      2665



'\nsee f1-score increased to 98%, means the my_rule based classier is predicting better than RansomForestClassier\n'

# Common Plot Function to Plot Threshold

In [67]:
# Plot the threshold as an overlay on the Plotly box plot
def plot_threshold(train_df: pd.DataFrame, feature: str, target: str, threshold: float):
    """Show a box plot of ``feature`` per ``target`` value with a dashed line at ``threshold``.

    Args:
        train_df: Data to plot.
        feature: Column plotted on the y-axis.
        target: Column that groups the boxes along the x-axis.
        threshold: y-value at which the horizontal marker line is drawn.

    Returns:
        None. The figure is rendered via ``show()``.
    """
    span_start, span_end = 0, 2
    marker_style = dict(dash="dash", color="firebrick", width=2)

    box_fig = px.box(data_frame=train_df, x=target, y=feature)

    # A second x-axis laid over the original one (tick labels hidden); the
    # marker line is drawn against this axis.
    box_fig.layout.xaxis2 = go.layout.XAxis(
        overlaying="x",
        range=[span_start, span_end],
        showticklabels=False,
    )

    # Two points at the same height produce a horizontal line spanning the
    # full range of the overlay axis.
    box_fig.add_scatter(
        x=[span_start, span_end],
        y=[threshold, threshold],
        mode="lines",
        xaxis="x2",
        showlegend=False,
        line=marker_style,
    )

    box_fig.show()

# Plot the Threshold in Dataset

In [77]:
# Draw the initial threshold on top of the training-data box plot.
plot_threshold(train_df=train, feature=feature, target=target, threshold=threshold)

# Let's Try to improve the result i.e. accuracy
Earlier we chose Light = 100 as our threshold. But is this the best threshold?
Let's try to find the best Threshold

In [61]:
from sklearn.model_selection import GridSearchCV
# Search for the best threshold between 250 and 750 using GridSearchCV.
# np.linspace(start, stop, num) yields `num` evenly spaced candidates from start to stop.
# NOTE(review): the saved output of this cell lists candidates starting at 150,
# so it appears to come from an earlier run with a different start value;
# re-run the notebook to refresh the stored outputs.
grid_model = GridSearchCV(
    estimator=rule_01_model,
    param_grid={"threshold": np.linspace(250, 750, 1000)},
    cv=2,
)
grid_model.fit(train_X, train_Y)

GridSearchCV(cv=2,
             estimator=FunctionClassifier(col='Light',
                                          func=<function rule_light_gt_threshold at 0x7f861407c310>,
                                          threshold=100),
             param_grid={'threshold': array([150.        , 150.6006006 , 151.2012012 , 151.8018018 ,
       152.4024024 , 153.003003  , 153.6036036 , 154.2042042 ,
       154.8048048 , 155.40540541, 156.00600601, 156.60660661,
       157.20720721, 157.80780781, 158.40840841, 159.00900901...
       733.78378378, 734.38438438, 734.98498498, 735.58558559,
       736.18618619, 736.78678679, 737.38738739, 737.98798799,
       738.58858859, 739.18918919, 739.78978979, 740.39039039,
       740.99099099, 741.59159159, 742.19219219, 742.79279279,
       743.39339339, 743.99399399, 744.59459459, 745.1951952 ,
       745.7957958 , 746.3963964 , 746.996997  , 747.5975976 ,
       748.1981982 , 748.7987988 , 749.3993994 , 750.        ])})

In [62]:
# Threshold value reported by the grid search as its best parameter.
best_threshold = grid_model.best_params_["threshold"]
# Bare expression on the last line so the value is shown as the cell output.
best_threshold

365.015015015015

In [78]:
# Predict the test labels through the fitted grid search.
grid_human_predicted_Y = grid_model.predict(test_X)

# Print precision / recall / f1-score of these predictions against the true labels (test_Y).
print(classification_report(y_true=test_Y, y_pred=grid_human_predicted_Y))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      1693
           1       0.95      1.00      0.97       972

    accuracy                           0.98      2665
   macro avg       0.97      0.98      0.98      2665
weighted avg       0.98      0.98      0.98      2665



In [79]:
# Draw the grid-selected threshold on top of the training-data box plot.
plot_threshold(train_df=train, feature=feature, target=target, threshold=best_threshold)

# Recap
So far we have derived the 'Occupancy' label using
- machine_predicted_Y --> using RandomForestClassifier (rules learned from the training data; no hand-set threshold)
- human_predicted_Y --> using the human rule-based classifier with threshold Light = 100
- grid_human_predicted_Y --> using GridSearchCV on top of the human rule-based classifier, with the best threshold selected by the grid search to improve f1-score and accuracy.


# Combining ML based and Human Rule Based Classifier together

In [106]:
def highlight_cell(row):
    """Return one CSS style string per value in ``row``.

    Values equal to 0 get a red background, every other value a green one;
    the text colour is white in both cases. Any iterable of cell values is
    accepted, so this can be handed to a pandas Styler's ``apply``.

    Args:
        row: Iterable of cell values.

    Returns:
        A list of CSS strings, in the same order as the input values.
    """
    zero_css = "background-color: red; color: white"
    nonzero_css = "background-color: green; color: white"

    styles = []
    for cell in row:
        styles.append(zero_css if cell == 0 else nonzero_css)
    return styles

In [84]:
# Collect both models' test-set predictions, keyed by model name.
# Both values are NumPy arrays returned by the models' predict() calls.
predicted_label_dict = {
    "RandomForestClassifier": machine_predicted_Y,
    "Rule-based Model": grid_human_predicted_Y,
}

# One column per model, one row per test sample.
comparison = pd.DataFrame(data=predicted_label_dict)

# type(comparison)
# comparison["RandomForestClassifier"].head(10)
# comparison["Rule-based Model"].head(10)

In [96]:
# Keep only the rows where the two models' predictions differ.
# `difference` is itself a pandas DataFrame.
difference = comparison.loc[
    comparison["RandomForestClassifier"].ne(comparison["Rule-based Model"])
]
difference.head(10)

Unnamed: 0,RandomForestClassifier,Rule-based Model
0,0,1
1,0,1
2,0,1
5,0,1
6,0,1
225,1,0
1038,0,1
1039,0,1
1040,0,1
1339,0,1


In [104]:
# Draft
# my_dict={
#     'col1': [1,2,3],
#     'col2': [1,3,4]
# }
# comp= pd.DataFrame(my_dict)
# diff =comp
# comp[comp["col1"]!=comp["col2"]]
# diff
# diff.assign(final_prediction=1)


### Reduce False Negative
---
                   Actual P(1)        Actual N(0)
                  ------------|------------------
  Predicted P(1)      TP               FP
                  ------------|------------------
  Predicted N(0)      FN               TN
                  ------------------------------

In [107]:
# Policy for rows where the two models disagree:
#   - to reduce false negatives, choose the positive label (1);
#   - to reduce false positives, choose the negative label (0).
# Here every disagreeing row gets final_prediction = 1.
reduce_false_negative = difference.assign(final_prediction=1)
# Colour the cells with highlight_cell: red for 0, green otherwise.
reduce_false_negative.style.apply(highlight_cell)


Unnamed: 0,RandomForestClassifier,Rule-based Model,final_prediction
0,0,1,1
1,0,1,1
2,0,1,1
5,0,1,1
6,0,1,1
225,1,0,1
1038,0,1,1
1039,0,1,1
1040,0,1,1
1339,0,1,1


In [108]:
# Reduce false positives: when the two models disagree, choose the negative
# label, i.e. every disagreeing row gets final_prediction = 0.
reduce_false_positives = difference.assign(final_prediction=0)
# Colour the cells with highlight_cell: red for 0, green otherwise.
reduce_false_positives.style.apply(highlight_cell)

Unnamed: 0,RandomForestClassifier,Rule-based Model,final_prediction
0,0,1,0
1,0,1,0
2,0,1,0
5,0,1,0
6,0,1,0
225,1,0,0
1038,0,1,0
1039,0,1,0
1040,0,1,0
1339,0,1,0
