## **CIND 820 - Big Data Analytics Project**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import svm, metrics 
from sklearn.model_selection import train_test_split # to apply train-test split
from sklearn.model_selection import cross_val_score, KFold # to apply k-fold cross-validation 
from sklearn.metrics import confusion_matrix, classification_report

**Create a data frame from the Room Occupancy dataset. Display the column names, check the data type of each attribute, and compute summary statistics for the dataset.**

In [2]:
# Load the occupancy CSV into the Occupancy data frame
Occupancy = pd.read_csv('Occupancy_Estimation.csv')

# Preview the first five rows
Occupancy.head(5)

Unnamed: 0,Date,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
0,2017/12/22,10:49:41,24.94,24.75,24.56,25.38,121,34,53,40,0.08,0.19,0.06,0.06,390,0.769231,0,0,1
1,2017/12/22,10:50:12,24.94,24.75,24.56,25.44,121,33,53,40,0.93,0.05,0.06,0.06,390,0.646154,0,0,1
2,2017/12/22,10:50:42,25.0,24.75,24.5,25.44,121,34,53,40,0.43,0.11,0.08,0.06,390,0.519231,0,0,1
3,2017/12/22,10:51:13,25.0,24.75,24.56,25.44,121,34,53,40,0.41,0.1,0.1,0.09,390,0.388462,0,0,1
4,2017/12/22,10:51:44,25.0,24.75,24.56,25.44,121,34,54,40,0.18,0.06,0.06,0.06,390,0.253846,0,0,1


In [3]:
# Show the column index of the Occupancy data frame
print("Column Names:", Occupancy.columns, sep=" ")

Column Names: Index(['Date', 'Time', 'S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp', 'S1_Light',
       'S2_Light', 'S3_Light', 'S4_Light', 'S1_Sound', 'S2_Sound', 'S3_Sound',
       'S4_Sound', 'S5_CO2', 'S5_CO2_Slope', 'S6_PIR', 'S7_PIR',
       'Room_Occupancy_Count'],
      dtype='object')


In [4]:
# Show the dtype of every column, preceded by a blank line and a heading
print("\nData Types of Each Column:", Occupancy.dtypes, sep="\n")


Data Types of Each Column:
Date                     object
Time                     object
S1_Temp                 float64
S2_Temp                 float64
S3_Temp                 float64
S4_Temp                 float64
S1_Light                  int64
S2_Light                  int64
S3_Light                  int64
S4_Light                  int64
S1_Sound                float64
S2_Sound                float64
S3_Sound                float64
S4_Sound                float64
S5_CO2                    int64
S5_CO2_Slope            float64
S6_PIR                    int64
S7_PIR                    int64
Room_Occupancy_Count      int64
dtype: object


In [5]:
# Show descriptive statistics for the numeric columns under a heading
print("Summary Statistics:", Occupancy.describe(), sep="\n")

Summary Statistics:
            S1_Temp       S2_Temp       S3_Temp       S4_Temp      S1_Light  \
count  10129.000000  10129.000000  10129.000000  10129.000000  10129.000000   
mean      25.454012     25.546059     25.056621     25.754125     25.445059   
std        0.351351      0.586325      0.427283      0.356434     51.011264   
min       24.940000     24.750000     24.440000     24.940000      0.000000   
25%       25.190000     25.190000     24.690000     25.440000      0.000000   
50%       25.380000     25.380000     24.940000     25.750000      0.000000   
75%       25.630000     25.630000     25.380000     26.000000     12.000000   
max       26.380000     29.000000     26.190000     26.560000    165.000000   

          S2_Light      S3_Light      S4_Light      S1_Sound      S2_Sound  \
count  10129.00000  10129.000000  10129.000000  10129.000000  10129.000000   
mean      26.01629     34.248494     13.220259      0.168178      0.120066   
std       67.30417     58.400744  

**Scale the numeric features**

In [7]:
# Continuous sensor readings that get standardised.
# Selecting them by name (instead of the positional slice 2:16) keeps the
# feature matrix correct even if the CSV column order changes.
numeric_features = ['S1_Temp', 'S2_Temp', 'S3_Temp', 'S4_Temp',
                    'S1_Light', 'S2_Light', 'S3_Light', 'S4_Light',
                    'S1_Sound', 'S2_Sound', 'S3_Sound', 'S4_Sound',
                    'S5_CO2', 'S5_CO2_Slope']

# Binary motion (PIR) flags are passed through unscaled
binary_features = ['S6_PIR', 'S7_PIR']

# Create the scaler
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on every row of Occupancy, i.e. before the
# train/test split performed later in this notebook, so test-set statistics
# leak into the scaling. Consider fitting on the training rows only (or
# wrapping the scaler and the classifier in a Pipeline) -- TODO confirm intent.

# Transform the numeric data, keeping the original column names and row index
X = pd.DataFrame(
    scaler.fit_transform(Occupancy[numeric_features]),
    columns=numeric_features,
    index=Occupancy.index,
).join(Occupancy[binary_features])  # append the binary motion data

# Define the target variable
y = Occupancy['Room_Occupancy_Count']

In [38]:
# Descriptive statistics of the feature matrix after scaling
# (X is already a DataFrame, so describe() can be called on it directly)
scaled_df_summary = X.describe()
scaled_df_summary

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR
count,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0
mean,-3.434512e-15,5.53338e-15,4.781379e-15,-6.779233e-15,8.979116000000001e-17,-1.1223900000000001e-17,-4.4895580000000007e-17,-1.010151e-16,-2.2447790000000003e-17,-5.611948000000001e-17,-8.417922000000001e-17,-1.09433e-16,-8.979116000000001e-17,-3.0865710000000005e-17,0.090137,0.079574
std,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,1.000049,0.286392,0.270645
min,-1.463033,-1.357775,-1.443193,-2.284195,-0.4988372,-0.386567,-0.5864683,-0.6744599,-0.3415843,-0.3004478,-0.285577,-0.446154,-0.5794322,-5.400593,0.0,0.0
25%,-0.7514583,-0.6073017,-0.8580709,-0.8813419,-0.4988372,-0.386567,-0.5864683,-0.6744599,-0.310008,-0.2629229,-0.237223,-0.3632881,-0.5294209,-0.03547318,0.0,0.0
50%,-0.2106611,-0.2832335,-0.272949,-0.01157295,-0.4988372,-0.386567,-0.5864683,-0.6744599,-0.2784317,-0.2629229,-0.237223,-0.1975562,-0.5044153,0.004146165,0.0,0.0
75%,0.5009141,0.1431719,0.7568655,0.6898537,-0.2635834,-0.1785459,0.2697274,0.4479174,-0.2784317,-0.2253979,-0.213046,-0.03182432,0.02070265,0.004146165,0.0,0.0
max,2.63564,5.891117,2.65266,2.261049,2.735902,3.446966,4.208228,3.100809,11.72055,12.45804,8.490675,27.31394,4.046607,7.71341,1.0,1.0


**Split the data into training and testing sets**

In [8]:
# Hold out 30% of the rows for testing and keep 70% for training;
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): the classification report recorded further down lists 2026
# test rows (about 20% of the 10129 records), not the ~30% requested here.
# The saved outputs may come from a run with a different test_size --
# restart the kernel and run all cells to confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [36]:
# Preview the first five rows of the training features
X_train.head(5)

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR
8582,0.301673,-0.078559,0.452602,-0.348258,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,-0.262923,-0.237223,-0.11469,0.120725,-1.273578,0,0
1166,0.671692,0.245509,0.593031,0.353169,-0.498837,-0.386567,-0.586468,-0.67446,-0.278432,-0.262923,-0.2614,-0.363288,0.795877,-1.83155,0,0
254,1.554045,3.656752,1.763275,1.222938,2.696693,3.357814,3.985617,2.896741,-0.278432,0.037277,-0.067984,-0.280422,2.096169,1.073868,0,0
415,1.554045,0.995983,1.763275,1.559623,-0.185165,-0.089394,1.006056,2.233518,-0.310008,-0.262923,-0.237223,-0.363288,1.471028,-3.14229,0,0
3334,1.724823,2.155805,1.178153,1.559623,1.814492,-0.059677,-0.021379,0.498935,0.037331,-0.262923,-0.213046,-0.280422,0.070714,0.786628,0,0


**Use SVM for modeling**

In [24]:
# Support vector classifier with a radial basis function (RBF) kernel
model = svm.SVC(kernel="rbf")

**Apply cross-validation on the training set**

In [25]:
# Configure 10-fold cross-validation; rows are shuffled with a fixed seed
# so the fold assignment is reproducible
num_folds = 10
kf = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=1,
)

In [26]:
# Score the classifier by accuracy on each fold of the training set
# (n_jobs=-1 lets scikit-learn use all available CPU cores)
scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
)

In [27]:
# Report the mean and standard deviation of the per-fold accuracies
print('Accuracy: %.3f ,\nStandard Deviations :%.3f' %
      (scores.mean(), scores.std()))

Accuracy: 0.994 ,
Standard Deviations :0.003


In [225]:
# Show the accuracy of every fold followed by the overall mean
print(
    f'Cross-Validation Results (Accuracy): {scores}',
    f'Mean Accuracy: {scores.mean()}',
    sep='\n',
)

Cross-Validation Results (Accuracy): [0.99630086 0.99506782 0.99383477 0.98765432 0.9962963  0.99259259
 0.99012346 0.9962963  0.98888889 0.9962963 ]
Mean Accuracy: 0.9933351600675892


**Train the model on the full training set and evaluate on the test set** 

In [226]:
# Fit the classifier on the complete training split
model.fit(X=X_train, y=y_train)

In [227]:
# Predict occupancy counts for the held-out test features
y_pred = model.predict(X=X_test)

# Show the first ten predictions as a sanity check
print(y_pred[0:10])

[0 0 0 2 0 0 0 0 2 3]


In [228]:
# Per-class precision, recall and F1-score of the SVM classifier on the test set
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1679
           1       1.00      0.99      0.99        81
           2       0.98      0.98      0.98       127
           3       0.97      0.98      0.97       139

    accuracy                           1.00      2026
   macro avg       0.99      0.99      0.99      2026
weighted avg       1.00      1.00      1.00      2026



In [216]:
# Confusion matrix: rows are the true classes, columns the predicted classes
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cnf_matrix)

[[1679    0    0    0]
 [   0   80    0    1]
 [   0    0  124    3]
 [   0    0    3  136]]
