In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as datetime

In [5]:
# Load the .npz file
loaded_data = np.load('../datasets/windows.npz', allow_pickle=True)

# Extract the column names
column_names = loaded_data['column_names']

# Convert the loaded data back to a dictionary of lists of DataFrames, using the column names
windows_df = {label: [pd.DataFrame(array, columns=column_names) for array in arrays_list] 
              for label, arrays_list in loaded_data.items() if label != 'column_names'}

# Loop through windows_df and set 'Datetime' as the index and drop unwanted columns
for label, windows_list in windows_df.items():
    for i, window in enumerate(windows_list):
        # Convert 'Datetime' to a datetime object
        window['Datetime'] = pd.to_datetime(window['Datetime'])

        # Set 'Datetime' as the index
        window.set_index('Datetime', inplace=True)

        # Drop 'rhumid' and 'atmpr' columns
        # window.drop(['rhumid', 'atmpr'], axis=1, inplace=True)

        # Assign the modified window back to the list
        windows_df[label][i] = window

        
windows_df['Song'][0].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 76 entries, 2024-04-02 12:15:00 to 2024-04-02 13:30:00
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   co2            76 non-null     object
 1   tempF          76 non-null     object
 2   rhumid         76 non-null     object
 3   atmpr          76 non-null     object
 4   door1          76 non-null     object
 5   door2          76 non-null     object
 6   hvac           76 non-null     object
 7   subject_count  76 non-null     object
 8   lecturer       76 non-null     object
dtypes: object(9)
memory usage: 5.9+ KB


In [6]:
windows_df['Song'][0].head()

Unnamed: 0_level_0,co2,tempF,rhumid,atmpr,door1,door2,hvac,subject_count,lecturer
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-04-02 12:15:00,460.0,84.7,50.0,1009.2,1,1,0,3.0,1
2024-04-02 12:16:00,437.0,84.9,50.0,1009.1,1,1,0,3.0,1
2024-04-02 12:17:00,452.0,85.0,50.0,1009.1,1,1,0,3.0,1
2024-04-02 12:18:00,447.0,85.1,48.0,1009.0,1,1,0,3.0,1
2024-04-02 12:19:00,452.0,85.1,49.0,1008.9,1,1,0,3.0,1


- For `door1` and `door2` we have `1 = open` and `0 = closed`.
- For `hvac` we have `1 = on` and `0 = off`
- For `lecturer` we have `0 = Chen` and `1 = Song`.

In [9]:

# Concatenate all the DataFrames from each label into a single DataFrame
all_windows = pd.concat([window for sublist in windows_df.values() for window in sublist])

# Sort the DataFrame by the Datetime index
lectures_df = all_windows.sort_index()

# Display the information about the concatenated DataFrame
lectures_df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2052 entries, 2024-03-14 08:00:00 to 2024-04-17 13:30:00
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   co2            2052 non-null   object
 1   tempF          2052 non-null   object
 2   rhumid         2052 non-null   object
 3   atmpr          2052 non-null   object
 4   door1          2052 non-null   object
 5   door2          2052 non-null   object
 6   hvac           2052 non-null   object
 7   subject_count  2052 non-null   object
 8   lecturer       2052 non-null   object
dtypes: object(9)
memory usage: 160.3+ KB


# Questions

1. What is the distribution of our above data types across lecturer (corresponding to morning, `Chen`, versus the noon, `Song`, lectures)
2. What is the max subject count across lectures? What does this say about attendance on observed days? (minus 1 for the lecturer)
3. How many lectures saw no change in state for observed features such as `door`, `hvac`, `subject_count`.
4. How long do we observe certain states such as doors being both open/closed? Hvac being on/off? When the hvac is on, for how long does it remain in said state on average?
5. Plot `co2` versus `tempF`, `relH`, `atmpr` and try labeling the points as other categories. (Note we might need to normalize/standardize our observations.)
    - Do we find any patterns that might be worth exploring with algorithms such as `KNN`?