In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math
from sklearn.utils import shuffle

Dataset credits: Yam Peleg, “Room Occupancy Detection”, Kaggle, 2021, https://archive-beta.ics.uci.edu/ml/datasets/occupancy+detection.

In [2]:
#Load the three files of the dataset into dataframes:
#  datatraining.txt -> df_training, datatest.txt -> df_validation, datatest2.txt -> df_test
df_training = pd.read_csv("../input/room-occupancy-detection/datatraining.txt")
df_validation = pd.read_csv("../input/room-occupancy-detection/datatest.txt")
df_test = pd.read_csv("../input/room-occupancy-detection/datatest2.txt")
#Shuffle the rows of each dataframe. A fixed seed is passed so that the row order
#(and everything computed from it further down) is the same on every Restart & Run All.
SHUFFLE_SEED = 1
df_training = shuffle(df_training, random_state=SHUFFLE_SEED)
df_validation = shuffle(df_validation, random_state=SHUFFLE_SEED)
df_test = shuffle(df_test, random_state=SHUFFLE_SEED)
#The 'date' column is not needed to predict the room occupancy
df_features = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']

In [3]:
# Split every dataframe into its feature columns (X_*) and its 'Occupancy' target (y_*).
# Training split
X_train, y_train = df_training[df_features], df_training["Occupancy"]
# Validation split
X_val, y_val = df_validation[df_features], df_validation["Occupancy"]
# Test split
X_test, y_test = df_test[df_features], df_test["Occupancy"]

Here, I have used two models, a decision tree and a random forest, for the room occupancy detection problem.

In [4]:
# Baseline for the room occupancy detection problem: a decision tree with default
# settings (only the random seed is fixed).
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

rod_model = DecisionTreeRegressor(random_state=1)

# Train on the training split; the call is left as the last expression of the cell
# so that the fitted estimator is shown as the cell output.
rod_model.fit(X=X_train, y=y_train)

Finding the depth and the number of leaf nodes of the decision tree we created gives us some insight into its structure.

In [5]:
# Read the depth and the number of leaves from the fitted model's tree_ object
fitted_tree = rod_model.tree_
tree_depth = fitted_tree.max_depth
leaf_count = fitted_tree.n_leaves
print("Depth of the decision tree = ", tree_depth)
print("Number of leaf nodes in the decision tree = ", leaf_count)

In [6]:
# Score the baseline tree on the validation split: predict, then compare the
# predictions with the true occupancy values using the mean absolute error.
y_val_predictions = rod_model.predict(X_val)
val_mae = mean_absolute_error(y_true=y_val, y_pred=y_val_predictions)
print("Mean absolute error on the validation dataset =", val_mae)

Experimenting with decision trees that have different values of max_leaf_nodes lets us pick the tree with the lowest MAE on the validation dataset.

In [7]:
# Candidate values of max_leaf_nodes: 15, 20, ..., 80
max_leaf_nodes_list = list(range(15, 85, 5))
# MAE of each candidate tree on the training split and on the validation split,
# stored in the same order as max_leaf_nodes_list
mae_list, mae_val_list = [], []
for max_leaf_nodes_num in max_leaf_nodes_list:
    rod_model_1 = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes_num, random_state=0)
    rod_model_1.fit(X_train, y_train)
    mae_list.append(mean_absolute_error(y_train, rod_model_1.predict(X_train)))
    mae_val_list.append(mean_absolute_error(y_val, rod_model_1.predict(X_val)))
# The winner is the candidate with the smallest validation MAE
# (np.argmin returns the first position when several candidates tie).
best_position = np.argmin(mae_val_list)
best_max_leaf_nodes_num = max_leaf_nodes_list[best_position]
print("The best value of max_leaf_nodes = ", best_max_leaf_nodes_num)

In [8]:
#Calculating the MAE on the test dataset with the best value of max_leaf_nodes.
#The value comes from best_max_leaf_nodes_num (selected on the validation dataset)
#instead of a hard-coded number, so this cell always evaluates the tree that the
#search actually picked.
#NOTE(review): the search loop builds its trees with random_state=0 while this model
#uses random_state=1 - confirm whether the seeds are meant to differ.
rod_model_2 = DecisionTreeRegressor(random_state=1, max_leaf_nodes=best_max_leaf_nodes_num)
rod_model_2.fit(X_train, y_train)
y_test_pred = rod_model_2.predict(X_test)
test_mae_2 = mean_absolute_error(y_test, y_test_pred)
print("MAE on the test dataset with max_leaf_nodes =", best_max_leaf_nodes_num, "is :", test_mae_2)

Training and validation MAE of the decision tree for different values of max_leaf_nodes

In [9]:
# Draw the training and validation MAE curves of the decision trees against max_leaf_nodes.
fig, ax = plt.subplots()
# The y-axis is pinned to [0, 0.11]; per the original note, the MAE of both the
# training and the validation dataset stays below 0.11.
ax.set_ylim(0, 0.11)
ax.set_xlabel('max_leaf_nodes')
ax.set_ylabel('MAE')
ax.set_title('Training and validation MAE for different values of max_leaf_nodes')
# One line per split: (MAE values, line colour, legend entry)
for errors, colour, legend_name in ((mae_list, 'r', 'Training'), (mae_val_list, 'b', 'Validation')):
    ax.plot(max_leaf_nodes_list, errors, color=colour, label=legend_name)
ax.legend()
plt.show()

In [10]:
#Solving the room occupancy detection problem using a random forest
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X_train, y_train)

# Calculate the mean absolute error of the Random Forest model on the validation data.
# The true values go first and the predictions second, which is the argument order of
# mean_absolute_error(y_true, y_pred) and the order used for the decision tree cells.
rf_val_predictions = rf_model.predict(X_val)
rf_val_mae = mean_absolute_error(y_val, rf_val_predictions)
print(" MAE on the validation dataset = ", rf_val_mae)

The default number of estimators in a random forest is 100. Creating random forests with different numbers of estimators and checking their MAE on the validation dataset tells us which number of estimators works best.

In [11]:
# Candidate forest sizes: 75, 80, ..., 140 estimators
num_estimators_list = list(range(75, 145, 5))
# MAE of each candidate forest on the training split and on the validation split,
# stored in the same order as num_estimators_list
rf_mae_list, rf_mae_val_list = [], []
for num_estimators in num_estimators_list:
    rf_model_1 = RandomForestRegressor(n_estimators=num_estimators, random_state=0)
    rf_model_1.fit(X_train, y_train)
    rf_mae_list.append(mean_absolute_error(y_train, rf_model_1.predict(X_train)))
    rf_mae_val_list.append(mean_absolute_error(y_val, rf_model_1.predict(X_val)))
# The winner is the candidate with the smallest validation MAE
# (np.argmin returns the first position when several candidates tie).
best_position = np.argmin(rf_mae_val_list)
best_num_estimators = num_estimators_list[best_position]
print("The best value of the number of estimators = ", best_num_estimators)

Training and validation MAE of the random forest for different values of n_estimators

In [12]:
# Training and validation MAE of the random forests against the number of estimators.
# Axis labels are set so that the figure can be read on its own, in the same way as
# the max_leaf_nodes plot of the decision tree.
plt.xlabel('n_estimators')
plt.ylabel('MAE')
plt.title('Training and validation MAE for different values of n_estimators')
plt.plot(num_estimators_list, rf_mae_list, color='r', label = 'Training')
plt.plot(num_estimators_list, rf_mae_val_list, color='b', label = 'Validation')
plt.legend()
plt.show()

In [13]:
#Using the best value of the number of estimators to find the MAE on the test dataset.
#The value comes from best_num_estimators (selected on the validation dataset) instead
#of a hard-coded number, so this cell always evaluates the forest size that the search
#actually picked.
#NOTE(review): the search loop builds its forests with random_state=0 while this model
#uses random_state=1 - confirm whether the seeds are meant to differ.
rf_model_2 = RandomForestRegressor(random_state=1, n_estimators=best_num_estimators)
rf_model_2.fit(X_train, y_train)

# Calculate the mean absolute error of the tuned Random Forest model on the test data
# (true values first, predictions second, as in mean_absolute_error(y_true, y_pred)).
y_test_pred_2 = rf_model_2.predict(X_test)
rf_test_mae = mean_absolute_error(y_test, y_test_pred_2)
# The previous name of this result is kept as an alias. Despite the "val" in its name,
# the value has always been computed on the test dataset.
rf_val_mae_1 = rf_test_mae
print("MAE on the test dataset with n_estimators =", best_num_estimators, ":", rf_test_mae)

In [14]:
#Rounding off the values in the predictions to their nearest integer
#(Python's round() sends a value that lies exactly halfway, e.g. 0.5, to the even integer)
y_test_pred = [round(a) for a in y_test_pred]
y_test_pred_2 = [round(a) for a in y_test_pred_2]
#The predictions are in the row order of X_test, which was shuffled after loading.
#Each prediction is therefore labelled with the index of the test row it belongs to,
#and the rows are sorted by that index, so that the saved files can be matched back
#to the test data instead of carrying a meaningless 0..n-1 counter.
df_1 = pd.DataFrame(y_test_pred, index=X_test.index).sort_index()
df_2 = pd.DataFrame(y_test_pred_2, index=X_test.index).sort_index()
#y_test_pred.csv holds the decision tree predictions, y_test_pred_2.csv the random forest ones
df_1.to_csv('y_test_pred.csv')
df_2.to_csv('y_test_pred_2.csv')