## **CIND 820 - Big Data Analytics Project**

In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols          # ordinary least squares

**Create a data frame of the Room Occupancy dataset. Display the names of the columns. Check the data types of the attributes. Run summary statistics on the dataset.**

In [2]:
# Load the sensor readings from the CSV file in the working directory
Occupancy = pd.read_csv(filepath_or_buffer='Occupancy_Estimation.csv')

# Preview the top of the data frame (five rows)
Occupancy.head(n=5)

Unnamed: 0,Date,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,2017/12/22,10:49:41,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,2017/12/22,10:50:12,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,2017/12/22,10:50:42,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,2017/12/22,10:51:13,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,2017/12/22,10:51:44,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


In [3]:
# List the label of every column in the data frame
column_labels = Occupancy.columns
print("Column Names:", column_labels)

Column Names: Index(['Date', 'Time', 'S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp', 'S1_Light',
       'S2_Light', 'S3_Light', 'S4_Light', 'S1_Sound', 'S2_Sound', 'S3_Sound',
       'S4_Sound', 'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR',
       'Room_Occupancy_Count'],
      dtype='object')


In [4]:
# Report the dtype pandas inferred for each column
# (heading and table are emitted by one print call, separated by a newline)
print("\nData Types of Each Column:", Occupancy.dtypes, sep="\n")


Data Types of Each Column:
Date                     object
Time                     object
S1_Temp                 float64
S2_Temp                 float64
S3_Temp                 float64
S4_Temp                 float64
S1_Light                  int64
S2_Light                  int64
S3_Light                  int64
S4_Light                  int64
S1_Sound                float64
S2_Sound                float64
S3_Sound                float64
S4_Sound                float64
S5_CO2                    int64
S5_CO2_Slope            float64
S6_PIR                    int64
S7_PIR                    int64
Room_Occupancy_Count      int64
dtype: object


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column
print("Summary Statistics:", Occupancy.describe(), sep="\n")

Summary Statistics:
            S1_Temp       S2_Temp       S3_Temp       S4_Temp      S1_Light  \
count  10129.000000  10129.000000  10129.000000  10129.000000  10129.000000   
mean      25.454012     25.546059     25.056621     25.754125     25.445059   
std        0.351351      0.586325      0.427283      0.356434     51.011264   
min       24.940000     24.750000     24.440000     24.940000      0.000000   
25%       25.190000     25.190000     24.690000     25.440000      0.000000   
50%       25.380000     25.380000     24.940000     25.750000      0.000000   
75%       25.630000     25.630000     25.380000     26.000000     12.000000   
max       26.380000     29.000000     26.190000     26.560000    165.000000   

          S2_Light      S3_Light      S4_Light      S1_Sound      S2_Sound  \
count  10129.00000  10129.000000  10129.000000  10129.000000  10129.000000   
mean      26.01629     34.248494     13.220259      0.168178      0.120066   
std       67.30417     58.400744  

**Fit a linear regression model to predict room occupancy based on the sensor data. Interpret the summary of the regression model and discuss the significance of the coefficients.**

**Temperature data**

In [6]:
# Design matrix: the four temperature sensor columns
X = Occupancy.loc[:, ['S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp']]

# Response: the Room_Occupancy_Count column
y = Occupancy.loc[:, 'Room_Occupancy_Count']

In [7]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.548
Model:                              OLS   Adj. R-squared:                  0.548
Method:                   Least Squares   F-statistic:                     3065.
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:45:25   Log-Likelihood:                -9214.4
No. Observations:                 10129   AIC:                         1.844e+04
Df Residuals:                     10124   BIC:                         1.847e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const

In [None]:
# y = B0 + B1x1 + B2x2 + B3x3 + B4x4

# y = -37.4029 + 1.6245*S1_Temp + 0.4826*S2_Temp + 0.1581*S3_Temp - 0.7703*S4_Temp

In [9]:
# The R-squared value represents the proportion of variance in the dependent variable that is explained by the independent variables.
# Using only temperature data, the R-squared value is 54.8%.

## As the temperature values are highly correlated, the p-values and coefficients are not reliable (multicollinearity).

**Light data**

In [10]:
# Design matrix: the four light sensor columns
X = Occupancy.loc[:, ['S1_Light', 'S2_Light', 'S3_Light', 'S4_Light']]

# Response: the Room_Occupancy_Count column
y = Occupancy.loc[:, 'Room_Occupancy_Count']

In [12]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.792
Model:                              OLS   Adj. R-squared:                  0.792
Method:                   Least Squares   F-statistic:                     9660.
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:46:46   Log-Likelihood:                -5270.8
No. Observations:                 10129   AIC:                         1.055e+04
Df Residuals:                     10124   BIC:                         1.059e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const

In [None]:
# y = B0 + B1x1 + B2x2 + B3x3 + B4x4

# y = 0.0410 + 0.0076*S1_Light + 0.0032*S2_Light + 0.0058*S3_Light - 0.0091*S4_Light

In [15]:
# The R-squared value represents the proportion of variance in the dependent variable that is explained by the independent variables.
# Using only light data, the R-squared value is 79.2%.

## As the light values are highly correlated, the p-values and coefficients are not reliable (multicollinearity).

**Sound data**

In [None]:
# Design matrix: the four sound sensor columns
X = Occupancy.loc[:, ['S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound']]

# Response: the Room_Occupancy_Count column
y = Occupancy.loc[:, 'Room_Occupancy_Count']

In [17]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.444
Model:                              OLS   Adj. R-squared:                  0.444
Method:                   Least Squares   F-statistic:                     2022.
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:47:17   Log-Likelihood:                -10259.
No. Observations:                 10129   AIC:                         2.053e+04
Df Residuals:                     10124   BIC:                         2.056e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const

In [None]:
# y = B0 + B1x1 + B2x2 + B3x3 + B4x4

# y = 0.0909 + 0.8700*S1_Sound + 0.9411*S2_Sound + 0.5396*S3_Sound - 0.3559*S4_Sound

In [20]:
# The R-squared value represents the proportion of variance in the dependent variable that is explained by the independent variables.
# Using only sound data, the R-squared value is 44.4%.

## Since the t-test value of S1_Sound variable is 31.714 with p-value 0.000 which is less than
## 0.05, we can conclude that the coefficient of S1_Sound is not zero. (highly significant)

## Since the t-test value of S2_Sound variable is 28.645 with p-value 0.000 which is less than
## 0.05, we can conclude that the coefficient of S2_Sound is not zero. (highly significant)

## Since the t-test value of S3_Sound variable is 23.145 with p-value 0.000 which is less than
## 0.05, we can conclude that the coefficient of S3_Sound is not zero. (highly significant)

## Since the t-test value of S4_Sound variable is -4.286 with p-value 0.000 which is less than
## 0.05, we can conclude that the coefficient of S4_Sound is not zero. (highly significant)

**CO2 data**

In [21]:
# Design matrix: the CO2 reading and its slope
X = Occupancy.loc[:, ['S5_CO2', 'S5_CO2_Slope']]

# Response: the Room_Occupancy_Count column
y = Occupancy.loc[:, 'Room_Occupancy_Count']

In [22]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.746
Model:                              OLS   Adj. R-squared:                  0.746
Method:                   Least Squares   F-statistic:                 1.485e+04
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:48:27   Log-Likelihood:                -6297.2
No. Observations:                 10129   AIC:                         1.260e+04
Df Residuals:                     10126   BIC:                         1.262e+04
Df Model:                             2                                         
Covariance Type:              nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
c

In [23]:
# y = B0 + B1x1 + B2x2

# y = -0.8794 + 0.0028*S5_CO2 + 0.4281*S5_CO2_Slope

In [24]:
# The R-squared value represents the proportion of variance in the dependent variable that is explained by the independent variables.
# Using only CO2 data, the R-squared value is 74.6%.

## As the CO2 slope data is derived from the CO2 values, the p-values and coefficients are not reliable (multicollinearity).

**CO2 Slope data**

In [25]:
# Design matrix: the CO2 slope column only (list selector keeps X a data frame)
X = Occupancy.loc[:, ['S5_CO2_Slope']]

# Response: the Room_Occupancy_Count column
y = Occupancy.loc[:, 'Room_Occupancy_Count']

In [26]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.361
Model:                              OLS   Adj. R-squared:                  0.361
Method:                   Least Squares   F-statistic:                     5729.
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:49:11   Log-Likelihood:                -10962.
No. Observations:                 10129   AIC:                         2.193e+04
Df Residuals:                     10127   BIC:                         2.194e+04
Df Model:                             1                                         
Covariance Type:              nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
c

In [27]:
# y = B0 + B1x1

# y = 0.4008 + 0.4611*S5_CO2_Slope

This duplicates the results of the Mao et al. study, which found:
- An intercept value of approximately 0.4008, indicating the expected value of Room_Occupancy_Count
when S5_CO2_Slope is zero
- Positive slope of approximately 0.4611, indicating that for each unit increase in S5_CO2_Slope we can
anticipate an increase of approximately 0.4611 in Room_Occupancy_Count 

**Motion data**

In [28]:
# Separate independent variables (features) and dependent variable (target).
# Use both motion (PIR) sensors. Selecting the same column twice would make the
# design matrix perfectly collinear and the fitted coefficients meaningless.
X = Occupancy[['S6_PIR', 'S7_PIR']]

# Guard against accidentally duplicated feature names before fitting
assert X.columns.is_unique, "duplicate feature columns make the design matrix singular"

y = Occupancy['Room_Occupancy_Count']

In [29]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.401
Model:                              OLS   Adj. R-squared:                  0.401
Method:                   Least Squares   F-statistic:                     3386.
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:49:41   Log-Likelihood:                -10639.
No. Observations:                 10129   AIC:                         2.128e+04
Df Residuals:                     10126   BIC:                         2.131e+04
Df Model:                             2                                         
Covariance Type:              nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const

In [None]:
# y = B0 + B1x1 + B2x2

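# y = 0.2181 + 6.559e+11*S6_PIR - 6.559e+11*S7_PIR
# NOTE: equal-and-opposite coefficients of this magnitude are not meaningful estimates; they indicate
# (near-)perfectly collinear regressors. Confirm that X contains both S6_PIR and S7_PIR, re-run the fit,
# and update this equation.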

In [31]:
# The R-squared value represents the proportion of variance in the dependent variable that is explained by the independent variables.
# Using only motion data, the R-squared value is 40.1%.

## As the motion values are highly correlated, the p-values and coefficients are not reliable (multicollinearity).

Based on the linear regression analysis, light data has the highest R-squared value at 79.2% followed by CO2 data at 74.6%. This is consistent with the results of the
introductory paper, which found that light features performed the best overall and that the CO2 features had promising results.

**All features**

In [32]:
# Design matrix: every sensor column (temperature, light, sound, CO2, motion)
X = Occupancy.loc[:, [
    'S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp',
    'S1_Light', 'S2_Light', 'S3_Light', 'S4_Light',
    'S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound',
    'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR',
]]

# Response: the Room_Occupancy_Count column
y = Occupancy.loc[:, 'Room_Occupancy_Count']

In [33]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.894
Model:                              OLS   Adj. R-squared:                  0.894
Method:                   Least Squares   F-statistic:                     5318.
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:50:15   Log-Likelihood:                -1877.0
No. Observations:                 10129   AIC:                             3788.
Df Residuals:                     10112   BIC:                             3911.
Df Model:                            16                                         
Covariance Type:              nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
c

In [34]:
# y = B0 + B1x1 + B2x2 + B3x3 + B4x4 + B5x5 + B6x6 + B7x7 + B8x8 + B9x9 + B10x10 + B11x11 + B12x12 + B13x13 + B14x14 + B15x15 + B16x16

# y = -8.0051 + 0.1335*S1_Temp + 0.1166*S2_Temp + 0.7861*S3_Temp - 0.6959*S4_Temp + 0.0054*S1_Light + 0.0009*S2_Light + 0.0026*S3_Light -
# 0.0040*S4_Light + 0.1009*S1_Sound + 0.1957*S2_Sound - 0.0763*S3_Sound - 0.3485*S4_Sound + 3.42e-05*S5_CO2 + 0.1896*S5_CO2_Slope +
# 0.1800*S6_PIR + 0.4139*S7_PIR

In [35]:
# The R-squared value represents the proportion of variance in the dependent variable that is explained by the independent variables.
# Using all sensor data, the R-squared value is 89.4%.

As expected, using all sensor data provided the best model fit, with an R-squared value of 89.4%.

The results suggest that the S5_CO2 value can be dropped since its p-value is greater than 0.05.

**All features except S5_CO2**

In [36]:
# Design matrix: every sensor column except the raw S5_CO2 reading
X = Occupancy.loc[:, [
    'S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp',
    'S1_Light', 'S2_Light', 'S3_Light', 'S4_Light',
    'S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound',
    'S5_CO2_Slope', 'S6_PIR', 'S7_PIR',
]]

# Response: the Room_Occupancy_Count column
y = Occupancy.loc[:, 'Room_Occupancy_Count']

In [37]:
# Add a constant column so the model estimates an intercept
X = sm.add_constant(X)

# Fit an ordinary least squares regression of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Print a blank line, the heading, then the fitted model's summary table
print("\nRegression Model Summary:", model.summary(), sep="\n")


Regression Model Summary:
                             OLS Regression Results                             
Dep. Variable:     Room_Occupancy_Count   R-squared:                       0.894
Model:                              OLS   Adj. R-squared:                  0.894
Method:                   Least Squares   F-statistic:                     5672.
Date:                  Mon, 18 Nov 2024   Prob (F-statistic):               0.00
Time:                          08:55:29   Log-Likelihood:                -1877.4
No. Observations:                 10129   AIC:                             3787.
Df Residuals:                     10113   BIC:                             3902.
Df Model:                            15                                         
Covariance Type:              nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
c