In [61]:
import glob as glob
import os
import re
from functools import reduce

import numpy as np
import pandas as pd

In [70]:
# Available names = may, june, july
def get_dir_name(month, year=2020):
    """Return the relative directory path holding one month of room data.

    Args:
        month: Month name used in the directory name (e.g. "may").
        year: Year used in the directory name; defaults to 2020.

    Returns:
        A string of the form "data/<month>_<year>/", including the trailing slash.
    """
    directory = f"data/{month}_{year}/"
    return directory

def get_dfs(data_dir):
    """Load every ``Room*.csv`` file in ``data_dir`` into a DataFrame keyed by room number.

    Each CSV must contain a ``Timestamp`` column. It is parsed to datetimes and
    used as the index, while also being kept as a regular column.

    Args:
        data_dir: Directory containing the room CSV files. A trailing path
            separator is optional.

    Returns:
        dict mapping the three-digit room number (as a string, taken from the
        file name) to that room's DataFrame, in sorted file-name order.
        Files whose name contains no ``Room <3 digits>`` are skipped.
    """
    room_pattern = re.compile(r'Room (\d{3})')

    def get_room_number(filename):
        """Return the room number found in the file's base name, or None."""
        # Match against the base name only so directory names cannot interfere.
        match = room_pattern.search(os.path.basename(filename))
        return match.group(1) if match else None

    # glob returns files in arbitrary, filesystem-dependent order; sorting makes
    # the room order (and every array derived from it) reproducible.
    csv_files = sorted(glob.glob(os.path.join(data_dir, 'Room*.csv')))
    print(csv_files)
    room_numbers = [get_room_number(filename) for filename in csv_files]
    print(room_numbers)
    print("--- Dataframe processing ---")
    dataframes = dict()
    for file, room_number in zip(csv_files, room_numbers):
        if room_number is None:
            # Without this guard all such files would be stored under the key
            # None and silently overwrite one another.
            print(f"Skipping {file}: no room number in file name")
            continue
        df = pd.read_csv(file)
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        # drop=False keeps 'Timestamp' available as a column as well as the index.
        df = df.set_index('Timestamp', drop=False)
        dataframes[room_number] = df
    return dataframes

In [71]:
# The metadata file sits in the same monthly directory as the room CSVs, so its
# path is derived from get_dir_name instead of repeating the literal directory.
meta_data = pd.read_csv(get_dir_name("may") + "meta.csv")
print(meta_data)

# One DataFrame per room for the same month, keyed by room number.
dfs = get_dfs(get_dir_name("may"))

# Export these results into a file and prepare for import from another system
# GNN - End Result: Rooms x Temperature x Timestamps
# INN - Features : Rooms x Features x Timestamps

  building_name           start_time             end_time  total_rooms   
0        NAE-01  2020-05-01 00:00:00  2020-05-31 00:00:00           81  \

  features  line_limit  
0        h          10  
['data/may_2020\\Room 104.csv', 'data/may_2020\\Room 105.csv', 'data/may_2020\\Room 107.csv', 'data/may_2020\\Room 108.csv', 'data/may_2020\\Room 110.csv', 'data/may_2020\\Room 114.csv', 'data/may_2020\\Room 120.csv', 'data/may_2020\\Room 121.csv', 'data/may_2020\\Room 122.csv', 'data/may_2020\\Room 208.csv', 'data/may_2020\\Room 213.csv', 'data/may_2020\\Room 216.csv', 'data/may_2020\\Room 217.csv', 'data/may_2020\\Room 221.csv', 'data/may_2020\\Room 223.csv', 'data/may_2020\\Room 227.csv', 'data/may_2020\\Room 229.csv', 'data/may_2020\\Room 240.csv', 'data/may_2020\\Room 245.csv', 'data/may_2020\\Room 247.csv', 'data/may_2020\\Room 248.csv', 'data/may_2020\\Room 249.csv', 'data/may_2020\\Room 252.csv', 'data/may_2020\\Room 254.csv', 'data/may_2020\\Room 261.csv', 'data/may_2020\\Room 264.

In [84]:
def filter_dfs(dfs_list, features, resample_interval='30min'):
    """Clean, down-select and resample a sequence of time-indexed DataFrames.

    For every frame: rows containing a NaN in *any* column are dropped (not
    only in ``features``), the ``features`` columns are selected, and the
    result is resampled to ``resample_interval`` using the mean of each bin.

    Args:
        dfs_list: Iterable of DataFrames with a datetime-like index.
        features: Column label(s) to keep, e.g. a list or a pandas Index.
        resample_interval: pandas offset alias for the bin width. The default
            '30min' is the non-deprecated spelling of the former '30T'.

    Returns:
        list with one resampled DataFrame per input frame, in input order.
    """
    def resample_clearNan(df, resample_interval):
        # Select the requested columns before aggregating so that unrelated
        # (possibly non-numeric) columns are never averaged.
        return df.dropna()[features].resample(resample_interval).mean()
    return [resample_clearNan(x, resample_interval) for x in dfs_list]
# Split the {room number: DataFrame} mapping into two aligned tuples.
rooms_list, dfs_list = zip(*dfs.items())
temps_x_time_dfs_list = filter_dfs(dfs_list, ['Zone Temperature'])

# Build one wide table: one 'Zone Temperature_<room>' column per room, keeping
# only the timestamps present for every room (inner join on the index).
# Each column is renamed explicitly before joining. Relying on pd.merge's
# `suffixes` is fragile here because suffixes are applied only when column
# names collide, which happens on every second merge only, so the last column
# would stay unlabelled for an even number of rooms.
merged_df = pd.concat(
    [
        room_temps.rename(columns={'Zone Temperature': f'Zone Temperature_{room}'})
        for room, room_temps in zip(rooms_list, temps_x_time_dfs_list)
    ],
    axis=1,
    join='inner',
)
merged_df

Unnamed: 0_level_0,Zone Temperature_104,Zone Temperature_105,Zone Temperature_107,Zone Temperature_108,Zone Temperature_110,Zone Temperature_114,Zone Temperature_120,Zone Temperature_121,Zone Temperature_122,Zone Temperature_208,...,Zone Temperature_448,Zone Temperature_450,Zone Temperature_453,Zone Temperature_461,Zone Temperature_462,Zone Temperature_463,Zone Temperature_469,Zone Temperature_470,Zone Temperature_484,Zone Temperature_490
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-03 05:30:00,75.906916,71.458824,71.458824,71.458824,73.776520,73.776520,70.597481,72.418427,72.767639,73.328979,...,75.234554,74.019829,73.263008,74.998314,74.998314,75.995506,76.241470,77.127472,71.511894,71.511894
2020-05-03 06:00:00,75.525536,71.458824,71.458824,71.458824,73.776520,73.776520,70.597481,72.418427,72.767639,72.827950,...,75.106911,74.019829,73.263008,74.998314,74.998314,75.995506,75.739075,77.127472,71.511894,71.511894
2020-05-03 06:30:00,75.525536,71.458824,71.458824,71.458824,73.776520,73.776520,70.597481,72.418427,72.767639,72.827950,...,75.106911,73.767708,73.263008,74.494423,74.494423,75.995506,75.739075,77.127472,71.511894,71.511894
2020-05-03 07:00:00,75.358177,71.207047,71.207047,71.207047,73.776520,73.776520,70.597481,72.418427,72.767639,72.827950,...,74.606201,73.515587,73.178130,74.494423,74.494423,75.995506,75.739075,77.127472,71.511894,71.511894
2020-05-03 07:30:00,75.023460,70.955269,70.955269,70.955269,73.776520,73.776520,70.597481,72.418427,72.767639,72.827950,...,74.606201,73.515587,72.753738,74.494423,74.494423,75.995506,75.739075,77.127472,71.511894,71.511894
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-30 21:30:00,74.553978,71.264083,71.264083,71.264083,73.673279,73.673279,67.506889,71.805771,71.610802,71.270607,...,72.647141,73.546379,71.377869,74.963806,74.963806,76.133278,75.984581,76.020363,72.939209,72.939209
2020-05-30 22:00:00,75.061534,71.772278,71.772278,71.772278,73.673279,73.673279,67.506889,71.805771,71.610802,71.270607,...,72.647141,73.546379,71.377869,74.963806,74.963806,76.133278,75.984581,76.187653,72.939209,72.939209
2020-05-30 22:30:00,75.647578,71.772278,71.772278,71.772278,73.921323,73.921323,67.506889,71.805771,71.610802,71.270607,...,73.148239,73.965506,71.544668,74.963806,74.963806,76.551992,76.488388,76.522232,72.939209,72.939209
2020-05-30 23:00:00,76.235500,71.772278,71.772278,71.772278,73.970932,73.970932,67.506889,71.805771,71.610802,71.270607,...,73.148239,74.049332,71.878265,74.963806,74.963806,76.635735,76.488388,76.605932,72.939209,72.939209


In [137]:
# Use every column of the first room's frame except 'S.No' as a feature.
# NOTE(review): the first room's columns are applied to all rooms — presumably
# every room CSV shares the same schema; confirm.
list_of_features = dfs_list[0].columns.drop('S.No')
print(list_of_features)
# One cleaned, resampled frame per room, restricted to the feature columns.
features_rooms = filter_dfs(dfs_list, list_of_features)

def series_convert_time_to_num(timestamp):
    """Convert a datetime Series to fractional hours since midnight.

    Args:
        timestamp: pandas Series of datetimes (accessed through ``.dt``).

    Returns:
        Series of floats: hour + minute / 60 + second / 3600.
    """
    clock = timestamp.dt
    whole_hours = clock.hour
    minute_part = clock.minute / 60
    second_part = clock.second / 3600
    return whole_hours + minute_part + second_part

def away_from_time(num, time=2):
    """Return how many hours ``num`` is away from ``time`` on a 24-hour clock.

    The distance wraps around midnight, so for the default ``time=2`` both
    20.0 and 8.0 are 6 hours away.

    Args:
        num: Hour of day as a number.
        time: Reference hour; must lie in [0, 12] (checked with an assert).

    Returns:
        The wrapped distance in hours.
    """
    assert 0 <= time <= 12
    wraps_past_midnight = num > time + 12
    if not wraps_past_midnight:
        # Same side of the clock: plain absolute difference.
        return np.abs(num - time)
    # More than 12 hours ahead of the reference: go the other way round.
    return 24 - num + time
# Expected away_from_time results for time=2 (clock time | hours away):
# 00:00 | 2
# 02:00 | 0
# 08:00 | 6
# 12:00 | 10
# 14:00 | 12
# 24:00 | 2
# 20:00 | 6

# Replace each room's Timestamp column with its distance in hours from 02:00 on
# a 24-hour clock. New frames are built with .assign instead of mutating the
# existing ones in place.
features_rooms = [
    room.assign(
        Timestamp=series_convert_time_to_num(room['Timestamp']).apply(lambda x: away_from_time(x, 2))
    )
    for room in features_rooms
]

# Targets: each room's Zone Temperature without its first row.
df_temperatures_y = [room['Zone Temperature'].iloc[1:] for room in features_rooms]

# Inputs: each room's features without the last row, so that input row i lines
# up with target row i + 1 (the next reading).
features_rooms_dropped = [room.iloc[:-1] for room in features_rooms]



Index(['Timestamp', 'Zone Temperature', 'Warm/Cool Adjust', 'Supply Vol Press',
       'Actual Cooling Setpt', 'Actual Heating Setpt',
       'Actual Damper Position', 'Actual Supply Flow', 'Reheat Valve Command',
       'Common Setpoint', 'Damper Command', 'Actual Sup Flow SP',
       'Cooling Max Flow', 'Occupied Clg Min', 'Occupied Htg Flow',
       'Cooling Command', 'Heating Command', 'Occupied Command',
       'Occupied Status'],
      dtype='object')


In [138]:
# There must be exactly one target series per room. The bare len(...) that was
# here was never displayed, because only a cell's last expression is shown.
assert len(df_temperatures_y) == len(rooms_list)
np.array(df_temperatures_y).shape

(81, 1332)

In [139]:

# Convert the prepared per-room structures to NumPy arrays for export.
array_features_rooms = np.array(features_rooms_dropped)
array_features_temperature_y = np.array(df_temperatures_y)
array_room = np.array(rooms_list)
array_temps = np.array(merged_df)

# np.save does not create missing directories, so make sure the output folder
# exists before writing (needed on a fresh checkout).
os.makedirs("preprocessing_output", exist_ok=True)
np.save("preprocessing_output/merged_rooms_list.npy", array_room)
np.save("preprocessing_output/merged_features_rooms.npy", array_features_rooms)
np.save("preprocessing_output/merged_temps_time.npy", array_temps)
# NOTE(review): array_features_temperature_y is built above but never written
# to disk — confirm whether the downstream consumer needs it exported too.

In [140]:
# Show the dimensions of every exported/prepared array on a single line, in
# the order: temperatures table, room list, feature tensor, target matrix.
print(*(
    arr.shape
    for arr in (
        array_temps,
        array_room,
        array_features_rooms,
        array_features_temperature_y,
    )
))

(1333, 81) (81,) (81, 1332, 19) (81, 1332)
