# Tidy Data 10/20/22

Goal: Put data into "tidy" (long) format

Target columns: Timestamp | Room | Window Open | Temperature | RH

In [1]:
import json
import datetime
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import math 
import sys
import os

# Overall project directory. Defaults to the original author's Google Drive
# location; set the WINDOWS_ROOT environment variable to run this notebook on
# another machine without editing the cell.
root = os.environ.get(
    "WINDOWS_ROOT",
    "/Users/julietnwagwuume-ezeoke/Library/CloudStorage/GoogleDrive-jnwagwu@stanford.edu/My Drive/UIL/windows/",
)
# Add the project's analysis scripts to the import path. The membership check
# keeps re-runs of this cell from stacking duplicate entries on sys.path.
scripts_dir = os.path.join(root, 'analysis/scripts')
if scripts_dir not in sys.path:
    sys.path.insert(0, scripts_dir)

In [2]:
from temprh_analysis import Temp_RH_Analysis

In [80]:
# TODO: move these functions into temprh_analysis.py

def str2dt(date):
    """Parse a 'year, month, day, hour, minute' string into a pandas Timestamp."""
    # Fields are separated by a comma and a space, e.g. "2022, 07, 19, 12, 01".
    timestamp_format = '%Y, %m, %d, %H, %M'
    return pd.to_datetime(date, format=timestamp_format)

def get_htimes(df, date, htimes_path='../constants/htimes.json'):
    """Load the recorded (open, close) intervals for one date key.

    Parameters
    ----------
    df : object
        Unused. Kept only so existing ``get_htimes(df, date)`` calls keep working.
    date : str
        Key to look up in the JSON file (this notebook uses keys such as "072522").
    htimes_path : str, optional
        Location of the JSON file. Defaults to the path that was previously
        hard-coded, so existing callers behave exactly as before.

    Returns
    -------
    list of (pandas.Timestamp, pandas.Timestamp)
        One ``(open, close)`` pair per entry stored under ``date``.

    Raises
    ------
    KeyError
        If ``date`` is not a key in the file, or an entry lacks "open"/"close".
    """
    with open(htimes_path) as f:
        htimes = json.load(f)

    # Each entry is a mapping holding "open" and "close" timestamp strings.
    return [(str2dt(entry["open"]), str2dt(entry["close"])) for entry in htimes[date]]

# Single DataFrame

In [3]:
# Build the analysis object for the "072522" data set.
# NOTE(review): the meaning of the ["Open", "Sometimes Open"] argument is defined
# in temprh_analysis.py, which is not shown here — confirm there.
a = Temp_RH_Analysis(root, "072522", ["Open", "Sometimes Open"] )
# TODO: create a file in constants/ that holds all times and variables

In [5]:
# Display the trh_422a frame held by the analysis object.
a.trh_422a

Unnamed: 0,DateTime,Temp C,RH %
0,2022-07-18 22:30:00,26.090,50.099
1,2022-07-18 22:30:30,26.066,50.127
2,2022-07-18 22:31:00,26.042,50.571
3,2022-07-18 22:31:30,26.017,50.185
4,2022-07-18 22:32:00,25.993,50.309
...,...,...,...
18348,2022-07-25 07:24:00,22.901,53.616
18349,2022-07-25 07:24:30,22.925,53.557
18350,2022-07-25 07:25:00,22.901,53.491
18351,2022-07-25 07:25:30,22.925,53.557


* Based on this, we just need to add a column indicating whether the window was open or closed (derived from `htimes`), and then combine everything into one dataset.

In [13]:
# Look up the (open, close) intervals stored under this data set's date key.
# get_htimes never reads its first argument, so passing the analysis object
# `a` instead of a DataFrame has no effect on the result.
htimes = get_htimes(a, a.date)
htimes

[(Timestamp('2022-07-19 12:01:00'), Timestamp('2022-07-19 20:00:00')),
 (Timestamp('2022-07-20 07:50:00'), Timestamp('2022-07-20 12:30:00')),
 (Timestamp('2022-07-20 22:28:00'), Timestamp('2022-07-21 13:11:00')),
 (Timestamp('2022-07-21 15:06:00'), Timestamp('2022-07-21 17:28:00')),
 (Timestamp('2022-07-21 22:45:00'), Timestamp('2022-07-22 07:50:00')),
 (Timestamp('2022-07-22 11:07:00'), Timestamp('2022-07-22 16:00:00')),
 (Timestamp('2022-07-22 23:02:00'), Timestamp('2022-07-23 19:50:00')),
 (Timestamp('2022-07-24 13:22:00'), Timestamp('2022-07-24 19:33:00'))]

In [31]:
# Flag every sample of the 422a frame: 1 while the window was open, 0 otherwise.
# `df` is the same object as a.trh_422a, so the new column is added in place.
df = a.trh_422a
timestamps = df['DateTime']
is_open = np.zeros(len(df), dtype=bool)
# A sample counts as "open" when it falls in (open, close]: strictly after the
# opening time and up to and including the closing time.
for opened_at, closed_at in htimes:
    is_open |= ((timestamps > opened_at) & (timestamps <= closed_at)).values
df["Window Open"] = is_open.astype("int64")

In [None]:
def add_window_open_values(df, date):
    """Add a 0/1 "Window Open" column to ``df`` and return the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'DateTime' column. It is modified in place.
    date : str
        Date key passed to ``get_htimes`` to fetch the (open, close) intervals.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be used in an
        assignment (as the later cells of this notebook do).
    """
    htimes = get_htimes(df, date)
    # Start with every sample marked closed (0).
    df["Window Open"] = 0
    # Mark samples inside any (open, close] interval as open (1).
    for opened_at, closed_at in htimes:
        mask = (df['DateTime'] > opened_at) & (df['DateTime'] <= closed_at)
        df.loc[mask, "Window Open"] = 1
    return df

In [34]:
# "Window Open" holds only 0/1, so its mean is the fraction of samples flagged open.
df["Window Open"].mean()

0.462158775132131

# All DataFrames

In [46]:
def add_window_open_values(df, date):
    """Tag each row of ``df`` with a 0/1 "Window Open" flag and return ``df``.

    The (open, close) intervals for ``date`` come from ``get_htimes``. A row is
    flagged 1 when its 'DateTime' is strictly after an opening time and no
    later than the matching closing time; all other rows are flagged 0.
    The column is written onto ``df`` itself (no copy is made).
    """
    intervals = get_htimes(df, date)
    timestamps = df['DateTime']
    is_open = np.zeros(len(df), dtype=bool)
    # Accumulate the union of all (open, close] intervals.
    for opened_at, closed_at in intervals:
        is_open |= ((timestamps > opened_at) & (timestamps <= closed_at)).values
    df["Window Open"] = is_open.astype("int64")
    return df

In [42]:
# Plan: take the list of timestamps for every date, repeat the tagging for
# rooms A and B, and add a column indicating which room each row came from.

# Parse the whole htimes JSON file; the result is kept in `htimes_file`.
with open('../constants/htimes.json') as f:    
    htimes_file = json.load(f)

In [44]:
# Date keys of every data set listed in the parsed htimes JSON.
# This must read `htimes_file` (the dict loaded from the JSON file): `htimes`
# is the list of interval tuples built earlier for a single date, and a list
# has no .keys(), so the original line fails on a fresh top-to-bottom run.
htimes_dates = htimes_file.keys()

In [75]:
# Collect one tagged frame per (date, room); they are concatenated in a later cell.
all_data_list = []

# (attribute on the analysis object, integer stored in the "Room" column)
# Room A -> 0, Room B -> 1
room_codes = (("trh_422a", 0), ("trh_422b", 1))

for date in htimes_dates:
    # load the measurements recorded for this date
    date_data = Temp_RH_Analysis(root, date, ["Open", "Sometimes Open"])

    for frame_attr, room_code in room_codes:
        # flag window-open samples, then label the room the frame came from
        room_df = add_window_open_values(getattr(date_data, frame_attr), date)
        room_df["Room"] = room_code
        all_data_list.append(room_df)


    
    

In [77]:
# Stack every per-date, per-room frame into one long DataFrame.
# NOTE(review): ignore_index is not set, so each frame keeps its own index
# labels and the combined index is not unique — confirm this is intended.
all_data_df = pd.concat(all_data_list)

In [78]:
# Display the combined frame (Jupyter shows a truncated preview).
all_data_df

Unnamed: 0,DateTime,Temp C,RH %,Window Open,Room
0,2022-07-18 22:30:00,26.090,50.099,0,0
1,2022-07-18 22:30:30,26.066,50.127,0,0
2,2022-07-18 22:31:00,26.042,50.571,0,0
3,2022-07-18 22:31:30,26.017,50.185,0,0
4,2022-07-18 22:32:00,25.993,50.309,0,0
...,...,...,...,...,...
34912,2022-09-20 10:41:00,25.264,51.358,0,1
34913,2022-09-20 10:41:30,25.264,51.358,0,1
34914,2022-09-20 10:42:00,25.264,51.358,0,1
34915,2022-09-20 10:42:30,25.385,52.389,0,1


In [79]:
all_data_df["Room"].mean() # expect ~0.5: Room is 0 for A and 1 for B, so the mean is the share of Room B rows

0.5000209233473171

## Save as CSV

In [None]:
# Write the combined tidy frame to disk.
# NOTE(review): with default arguments to_csv also writes the index as an
# unnamed first column — pass index=False if that column is not wanted.
all_data_df.to_csv("../constants/tidydata_102022.csv")