# Exploratory Analysis 09/20/22

In [1]:
# goals 
""" 
bring in data sets as data frames
based on htimes json file, split into 2 dataframes as discussed in the 9/20 note 
"""

' \nbring in data sets as data frames\nbased on htimes json file, split into 2 dataframes as discussed in the 9/20 note \n'

In [8]:
import datetime
import json
import math 
import os
import sys
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# Base directory of the project. NOTE: this is an absolute path on the author's machine,
# so it has to be edited before the notebook can run anywhere else.
root = "/Users/julietnwagwuume-ezeoke/Library/CloudStorage/GoogleDrive-jnwagwu@stanford.edu/My Drive/UIL/windows/"

# Put the project's script folder at the front of the import search path
# (presumably where the temprh_analysis module imported below lives).
scripts_dir = os.path.join(root, 'analysis/scripts')
sys.path.insert(0, scripts_dir)

In [3]:
# quiet np warning
# `np.warnings` was never a documented NumPy attribute (it was the stdlib module leaking
# through) and is missing from newer NumPy releases, so call the stdlib `warnings` module
# directly. The warning class is looked up in `np.exceptions` when that submodule exists
# and in the top-level namespace otherwise, so the cell runs on old and new NumPy alike.
warnings.filterwarnings(
    'ignore',
    category=getattr(np, 'exceptions', np).VisibleDeprecationWarning,
)

In [9]:
from temprh_analysis import Temp_RH_Analysis


# Functions

In [13]:
def str2dt(date):
    """Parse a 'YYYY, MM, DD, HH, MM' string (year, month, day, hour, minute) with pandas."""
    htime_format = '%Y, %m, %d, %H, %M'
    timestamp = pd.to_datetime(date, format=htime_format)
    return timestamp

In [14]:
# build the list of (open, close) timestamp pairs recorded for a dataset
def get_htimes(df, date):
    """
    Read the (open, close) time windows stored for one dataset.

    df: not used by this function; kept in the signature so existing calls keep working
    date: key of the dataset inside '../constants/htimes.json' (path is relative to the
        current working directory)
    returns
        htimes_arr: list of (open, close) tuples, each entry converted with str2dt
    """
    with open('../constants/htimes.json') as htimes_file:
        all_htimes = json.load(htimes_file)

    htimes_arr = []
    for window in all_htimes[date]:
        opened = str2dt(window["open"])
        closed = str2dt(window["close"])
        htimes_arr.append((opened, closed))

    return htimes_arr

# split df into groups based on the open to close time (open temps, open rh)

In [15]:
def get_all_hdata(datasets):
    """
    Collate the "open" time windows, temperatures and relative humidities of rooms a and b.

    datasets: list of datasets returned by the temprh analysis class. Each one is read
        through its `trh_422a` / `trh_422b` dataframes (columns "DateTime", "Temp C" and
        "RH %") and its `date`, which is the key handed to get_htimes.
    returns
        final data: dictionary with one entry per room ("room_a", "room_b"); each entry is a
        dictionary with all the datasets (opening times, temp, and rh) collated:
            open_times: object array of shape (n, 2) with the (open, close) timestamps
            open_temps: object array of length n, one "Temp C" series per window
            open_rh:    object array of length n, one "RH %" series per window
        Position i of the three arrays refers to the same window; windows are ordered
        dataset by dataset, in the order the datasets were given.
    # TODO make this work for closing times! -> should have 6 entries for each room, right now, have 3

    structure
        final_data = {
            room_a : {
                open_times[(t_open, t_close), (t_open, t_close), ...],
                open_temps[[T1, T2, T3...], [T1, T2, T3...], ... [T1, T2, T3...]],
                open_rh[[], [], ... []]
            }
            room_b : {open_times, open_temps, open_rh}
        }
    """
    # perform operations for rooms 422 a and b; value is the dataset attribute with that room's dataframe
    room_frames = {"room_a": "trh_422a", "room_b": "trh_422b"}
    final_data = {}

    for room, frame_attr in room_frames.items():
        # plain lists while collecting; converted to arrays once per room below
        open_times = []
        open_temps = []
        open_rh = []

        # loop over all the datasets of interest
        for dataset in datasets:
            df = getattr(dataset, frame_attr)

            # for each open window, slice out the temp and the rh recorded inside it
            for start, end in get_htimes(df, dataset.date):
                in_window = (df['DateTime'] >= start) & (df['DateTime'] <= end)
                sub_df = df[in_window]

                open_times.append((start, end))
                open_temps.append(sub_df["Temp C"].reset_index(drop=True))
                # drop=True (as for the temperatures) keeps this a plain series instead of a
                # dataframe that carries the old index along as an extra column
                open_rh.append(sub_df["RH %"].reset_index(drop=True))

        # assigned once per room, so both rooms are present even when `datasets` is empty
        final_data[room] = {
            "open_times": _pack_windows(open_times),
            "open_temps": _pack_series(open_temps),
            "open_rh": _pack_series(open_rh),
        }

    return final_data


def _pack_windows(windows):
    """Store (open, close) pairs in an (n, 2) object array, one row per window."""
    packed = np.empty((len(windows), 2), dtype=object)
    for row, (start, end) in enumerate(windows):
        packed[row, 0] = start
        packed[row, 1] = end
    return packed


def _pack_series(series_list):
    """Store series of possibly different lengths in a 1-D object array, one per slot."""
    # filled slot by slot: letting numpy convert the list itself needs ragged-array
    # creation (an error on newer NumPy) and would silently build a 2-D array whenever
    # every series happens to have the same length
    packed = np.empty(len(series_list), dtype=object)
    for slot, series in enumerate(series_list):
        packed[slot] = series
    return packed

# Import Data

In [16]:
# read in all datasets as dataframes
# arguments: project root, dataset date key, and a list of labels ordered [422a, 422b]
# (presumably the window state of each room during that dataset -- confirm in temprh_analysis)
a = Temp_RH_Analysis(root, "072522", ["Open", "Sometimes Open"] )
b = Temp_RH_Analysis(root, "081622", ["Sometimes Closed", "Closed"])
# further datasets, currently left out of the analysis:
# c = Temp_RH_Analysis(root, "082022", ["Open", "Closed"])
# d = Temp_RH_Analysis(root, "092022", ["Sometimes Closed", "Closed"])

In [17]:
# determine which datasets to include 
datasets = [a,b]
# collate the open-window times, temperatures and rh of both rooms across those datasets
all_data = get_all_hdata(datasets) 
# all_data


In [None]:
# a_data["open_times"] = get_htimes(a.trh_422a, a.date)
# a_data["open_times"]

In [19]:
all_data

{'room_a': {'open_times': array([[Timestamp('2022-07-19 12:01:00'),
          Timestamp('2022-07-19 20:00:00')],
         [Timestamp('2022-07-20 07:50:00'),
          Timestamp('2022-07-20 12:30:00')],
         [Timestamp('2022-07-20 22:28:00'),
          Timestamp('2022-07-21 13:11:00')],
         [Timestamp('2022-07-21 15:06:00'),
          Timestamp('2022-07-21 17:28:00')],
         [Timestamp('2022-07-21 22:45:00'),
          Timestamp('2022-07-22 07:50:00')],
         [Timestamp('2022-07-22 11:07:00'),
          Timestamp('2022-07-22 16:00:00')],
         [Timestamp('2022-07-22 23:02:00'),
          Timestamp('2022-07-23 19:50:00')],
         [Timestamp('2022-07-24 13:22:00'),
          Timestamp('2022-07-24 19:33:00')],
         [Timestamp('2022-07-27 09:00:00'),
          Timestamp('2022-08-01 19:30:00')],
         [Timestamp('2022-08-01 20:13:00'),
          Timestamp('2022-08-02 12:25:00')],
         [Timestamp('2022-08-02 18:13:00'),
          Timestamp('2022-08-10 07:52:00')

# Analyze Data

In [None]:
# greatest rmse between rooms a and b..
# how to account for uncertainty between the rooms? 
# does not include "closed"* data yet 
# *whether or not this is actually closed data depends on the dataset!


In [190]:
def calc_rmse(arr1, arr2):
    """Return the root-mean-square error between arr1 and arr2 (square root of sklearn's MSE)."""
    return math.sqrt(mean_squared_error(arr1, arr2))

In [191]:
# RMSE between the two rooms' temperatures, one value per open window (windows paired by position)
temps_room_a = all_data["room_a"]["open_temps"]
temps_room_b = all_data["room_b"]["open_temps"]
open_rmses = [
    calc_rmse(temps_a, temps_b)
    for temps_a, temps_b in zip(temps_room_a, temps_room_b)
]
open_rmses
# now we have a complication because we need to distinguish whether room a was open or closed, which changed from dataset to dataset :(
    # basically, need the times to be ordered according to the dataset they came from

[0.2859888047715159,
 0.12079498992101366,
 0.718796865847375,
 0.37273292258525603,
 1.346876409788489,
 0.2523493765327288,
 0.8491970746569534,
 0.22237580501058324,
 0.785460798544028,
 0.4758808371937521,
 0.7022101741470479,
 0.4567469887855917,
 0.29199999999999804]

In [192]:
# sort from smallest to greatest (np.sort returns ascending order)
np.sort(open_rmses)

array([0.12079499, 0.22237581, 0.25234938, 0.2859888 , 0.292     ,
       0.37273292, 0.45674699, 0.47588084, 0.70221017, 0.71879687,
       0.7854608 , 0.84919707, 1.34687641])

In [196]:
# indices that order open_rmses from smallest to greatest
idx_srmse = np.argsort(open_rmses)

In [197]:
# room a's open windows, reordered from smallest to greatest rmse
all_data["room_a"]["open_times"][idx_srmse]
# can categorize after the fact based on 
    # which room was open / closed at all times, i.e. the control 
    # times of day encompassed...

array([[Timestamp('2022-07-20 07:50:00'),
        Timestamp('2022-07-20 12:30:00')],
       [Timestamp('2022-07-24 13:22:00'),
        Timestamp('2022-07-24 19:33:00')],
       [Timestamp('2022-07-22 11:07:00'),
        Timestamp('2022-07-22 16:00:00')],
       [Timestamp('2022-07-19 12:01:00'),
        Timestamp('2022-07-19 20:00:00')],
       [Timestamp('2022-08-10 17:24:00'),
        Timestamp('2022-08-10 17:24:00')],
       [Timestamp('2022-07-21 15:06:00'),
        Timestamp('2022-07-21 17:28:00')],
       [Timestamp('2022-08-10 11:58:00'),
        Timestamp('2022-08-10 12:40:00')],
       [Timestamp('2022-08-01 20:13:00'),
        Timestamp('2022-08-02 12:25:00')],
       [Timestamp('2022-08-02 18:13:00'),
        Timestamp('2022-08-10 07:52:00')],
       [Timestamp('2022-07-20 22:28:00'),
        Timestamp('2022-07-21 13:11:00')],
       [Timestamp('2022-07-27 09:00:00'),
        Timestamp('2022-08-01 19:30:00')],
       [Timestamp('2022-07-22 23:02:00'),
        Timestamp('2022