# PREDICTION MODEL

## Importing Libraries and Reading Data

In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [64]:
# to ignore all unnecasarry warnings in the future

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the data
dataset1 = pd.read_csv('../dataset_kaggle/datatest.csv') # testing, door mostly closed
dataset2 = pd.read_csv('../dataset_kaggle/datatest2.csv') # testing, door mostly open
dataset3 = pd.read_csv('../dataset_kaggle/datatraining.csv') # training

In [3]:
print("dimensions of dataset 1 :",dataset1.shape)
print("dimensions of dataset 2 :",dataset2.shape)
print("dimensions of dataset 3 :",dataset3.shape)

dimensions of dataset 1 : (2665, 8)
dimensions of dataset 2 : (9752, 8)
dimensions of dataset 3 : (8143, 8)


## Pre-processing Data


In [4]:

dataset1.head(5)

Unnamed: 0.1,Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,140,02/02/2015 14:19,23.7,26.272,585.2,749.2,0.004764,1
1,141,02/02/2015 14:19,23.718,26.29,578.4,760.4,0.004773,1
2,142,02/02/2015 14:21,23.73,26.23,572.666667,769.666667,0.004765,1
3,143,02/02/2015 14:22,23.7225,26.125,493.75,774.75,0.004744,1
4,144,02/02/2015 14:23,23.754,26.2,488.6,779.0,0.004767,1


In [5]:
# only run it once after restarting the kernal, it drops the useless "unnamed" comlumn

dataset1 = dataset1.drop(dataset1.columns[[0]], axis =1 )
dataset2 = dataset2.drop(dataset2.columns[[0]], axis =1 )
dataset3 = dataset3.drop(dataset3.columns[[0]], axis =1 )

In [6]:
dataset1.head(2)

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,02/02/2015 14:19,23.7,26.272,585.2,749.2,0.004764,1
1,02/02/2015 14:19,23.718,26.29,578.4,760.4,0.004773,1


In [7]:
d1_ts = dataset1.set_index(['date'], drop=True)
d2_ts = dataset2.set_index(['date'], drop=True)
d3_ts = dataset3.set_index(['date'], drop=True)

In [8]:
d1_ts.head()

Unnamed: 0_level_0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
02/02/2015 14:19,23.7,26.272,585.2,749.2,0.004764,1
02/02/2015 14:19,23.718,26.29,578.4,760.4,0.004773,1
02/02/2015 14:21,23.73,26.23,572.666667,769.666667,0.004765,1
02/02/2015 14:22,23.7225,26.125,493.75,774.75,0.004744,1
02/02/2015 14:23,23.754,26.2,488.6,779.0,0.004767,1


## Concatinating / Combining Datasets

In [9]:
datasets = [dataset1, dataset2, dataset3]

data = pd.concat(datasets)

In [10]:
data.shape


(20560, 7)

## Data-Analysis

In [11]:
# checking for null values in dataset

data.isnull().sum()

date             0
Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupancy        0
dtype: int64

In [12]:
# checking datatypes
data.dtypes

date              object
Temperature      float64
Humidity         float64
Light            float64
CO2              float64
HumidityRatio    float64
Occupancy          int64
dtype: object

In [13]:
data.describe()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
count,20560.0,20560.0,20560.0,20560.0,20560.0,20560.0
mean,20.906212,27.655925,130.756622,690.553276,0.004228,0.231031
std,1.055315,4.982154,210.430875,311.201281,0.000768,0.421503
min,19.0,16.745,0.0,412.75,0.002674,0.0
25%,20.2,24.5,0.0,460.0,0.003719,0.0
50%,20.7,27.29,0.0,565.416667,0.004292,0.0
75%,21.525,31.29,301.0,804.666667,0.004832,0.0
max,24.408333,39.5,1697.25,2076.5,0.006476,1.0


In [14]:
# convert date column to date data type

data['date'] = pd.to_datetime(data['date'])
data.dtypes

date             datetime64[ns]
Temperature             float64
Humidity                float64
Light                   float64
CO2                     float64
HumidityRatio           float64
Occupancy                 int64
dtype: object

In [15]:
data.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
1,2015-02-02 14:19:00,23.718,26.29,578.4,760.4,0.004773,1
2,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1
3,2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1
4,2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1


In [16]:
# sort the dataset by the date column values

data_sorted = data.sort_values(by='date')
data_sorted.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2015-02-02 14:19:00,23.7,26.272,585.2,749.2,0.004764,1
1,2015-02-02 14:19:00,23.718,26.29,578.4,760.4,0.004773,1
2,2015-02-02 14:21:00,23.73,26.23,572.666667,769.666667,0.004765,1
3,2015-02-02 14:22:00,23.7225,26.125,493.75,774.75,0.004744,1
4,2015-02-02 14:23:00,23.754,26.2,488.6,779.0,0.004767,1


In [17]:
# reset index

data_idx = data_sorted.reset_index(drop=True)

In [18]:
# checking the index values

data_idx.index

RangeIndex(start=0, stop=20560, step=1)

In [19]:
# saving the final dataset to csv

data_idx.to_csv('../dataset_kaggle/concatenated.csv')

# WE HAVE SAVED THE PROCESSED DATA NOW

In [20]:
# environmental measures average values by room occupancy

data_idx.groupby('Occupancy').mean()

Unnamed: 0_level_0,Temperature,Humidity,Light,CO2,HumidityRatio
Occupancy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,20.58483,27.529654,25.237893,604.996614,0.00412
1,21.975909,28.076209,481.96738,975.32187,0.004589


### NOTES

- in average, the temperature when the room is occupied is 1.5ºC higher than
for non-occupied conditions

- in average, the humidity when the room is occupied is 0.5% higher than
for non-occupied conditions

- in average, the light when the room is occupied is around 19x higher than
for non-occupied conditions

- in average, the C02 concentration when the room is occupied is 1.6x higher
than for non-occupied conditions



## NOW WE SPLIT THE DATA INTO TRAINING AND TESTING SET 

In [21]:
# separate the dataframe into predictor and response variables

# X has predictor variables which effect the occuopancy level
# y has occupancy rate, a variable which is to be predicted

X = data_idx.drop(['Occupancy','date'], axis =1) # features
y = data_idx['Occupancy'] # target

In [22]:
# splitting the data to train and test

from sklearn.model_selection import train_test_split

training_data_ratio = 0.8
X_train, X_test, y_train, y_test = train_test_split (X, y, train_size = training_data_ratio, random_state = 29 )

In [23]:
X_train.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
7355,22.1,34.545,429.0,1495.0,0.005694
8615,20.39,22.4725,0.0,433.75,0.003323
7080,21.675,32.7225,450.25,1382.25,0.005252
7314,22.29,34.59,439.0,1480.666667,0.005769
12194,20.39,21.29,0.0,443.666667,0.003147


In [24]:
y_train.head()

7355     1
8615     0
7080     1
7314     1
12194    0
Name: Occupancy, dtype: int64

### LOGISTIC REGRESSION

In [25]:
# load logistic regression
from sklearn.linear_model import LogisticRegression

# initialize the model
lr = LogisticRegression(solver='lbfgs')
# (there are many solver functions for logistic regression, lgfgs is one of them which is used for 
#  multiclass case, as in, multiple prediction attribures)

# fit the model to the training data
lr.fit(X_train, y_train)

In [26]:
# accuracy score for training
acc_lr = lr.score(X_train, y_train) * 100

print(f"Logistic Regression Train Accuracy {round(acc_lr, 4)}%")

Logistic Regression Train Accuracy 98.8996%


In [30]:
# predict the response for the test dataset
y_pred_lr = lr.predict(X_test)


In [31]:
# create confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

confusion_matrix(y_test, y_pred_lr)

array([[3073,   40],
       [   4,  995]])

Analyzing the results:
there are 3073 + 995 correct predictions and 4 + 40 incorrect predictions

- C0,0 - the count of true negatives
- C1,0 - false negatives
- C1,1 - true positives
- C0,1 - false positives

In [33]:
# print accuracy_score for the testing
acc_lr_test = accuracy_score(y_test, y_pred_lr) * 100

print(f"Logistic Regression Test Accuracy {round(acc_lr_test, 2)}%")

Logistic Regression Test Accuracy 98.93%


In [39]:
X_test.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio
7331,22.2,34.4475,444.0,1506.0,0.005713
18103,20.29,32.933333,0.0,460.0,0.004851
12064,20.89,19.89,0.0,467.0,0.003031
17251,21.29,33.195,479.0,1432.75,0.005203
11148,20.89,23.39,0.0,448.0,0.003568


In [62]:

def prediction_lr_model(Temperature, Humidity, Light, CO2, HumidityRatio):
    
    Temperature = float(Temperature)
    Humidity = float(Humidity)
    Light = float(Light)
    CO2 = float(CO2)
    HumidityRatio = float(HumidityRatio)
    
    prediction_score = lr.predict([[Temperature, Humidity, Light, CO2, HumidityRatio]])
    
    if prediction_score == 1:
        x = "There are people present in the room"
    else :
        x = "No one is present in the room"
    
    return x
    


In [65]:
Temperature = input()
Humidity = input()
Light = input()
CO2 = input()
HumidityRatio = input()

prediction_lr_model(Temperature, Humidity, Light, CO2, HumidityRatio)

20.89
23.390000
0.0
448.00
0.003568


'No one is present in the room'

### DECISION TREES

In [66]:
# load decision tree
from sklearn.tree import DecisionTreeClassifier

# initialize the model
dtc = DecisionTreeClassifier()

# fit the model to the training data
dtc.fit(X_train, y_train)

In [67]:
# accuracy score for training
acc_dtc = dtc.score(X_train, y_train) * 100

print(f"Decision Tree Train Accuracy {round(acc_dtc, 2)}%")

Decision Tree Train Accuracy 100.0%


In [68]:
# predict the response for the test dataset
y_pred_dtc = dtc.predict(X_test)

# create confusion matrix
confusion_matrix(y_test, y_pred_dtc)


array([[3093,   20],
       [  16,  983]])

In [69]:
# print accuracy_score for the testing
acc_dtc_test = accuracy_score(y_test, y_pred_dtc) * 100

print(f"Decision Tree Test Accuracy {round(acc_dtc_test, 2)}%")


Decision Tree Test Accuracy 99.12%


In [71]:
def prediction_dtc_model(Temperature, Humidity, Light, CO2, HumidityRatio):
    
    Temperature = float(Temperature)
    Humidity = float(Humidity)
    Light = float(Light)
    CO2 = float(CO2)
    HumidityRatio = float(HumidityRatio)
    
    prediction_score = dtc.predict([[Temperature, Humidity, Light, CO2, HumidityRatio]])
    
    if prediction_score == 1:
        x = "There are people present in the room"
    else :
        x = "No one is present in the room"
    
    return x

In [72]:
Temperature = input()
Humidity = input()
Light = input()
CO2 = input()
HumidityRatio = input()

prediction_dtc_model(Temperature, Humidity, Light, CO2, HumidityRatio)

20.89
23.390000
0.0
448.00
0.003568


'No one is present in the room'

# THE ACCURACY OF 
# DECISION TREE MODEL > LOGISTIC REGRESSION MODEL 