In [1]:
import pandas as pd
import numpy as np
import glob as glob
import os
import fnmatch
import re
from sklearn.model_selection import train_test_split

In [2]:
# Available names = may, june, july
def get_dir_name(month, year=2020):
    """Return the relative data directory for one month of exports.

    Args:
        month: Month name used in the folder name (e.g. "may").
        year: Year used in the folder name; defaults to 2020.

    Returns:
        A string of the form ``data/<month>_<year>/`` with a trailing slash,
        ready to be concatenated with a file name or glob pattern.
    """
    dir_name = f"data/{month}_{year}/"
    return dir_name

def get_dfs(data_dir):
    """Load every ``Room*.csv`` file in ``data_dir`` into a dict of DataFrames.

    Files whose name matches ``*271*`` are skipped, as are files whose name
    does not contain a ``Room ddd`` number. Each remaining CSV is read, its
    ``Timestamp`` column is parsed to datetimes and used as the index (the
    column itself is kept as well).

    Args:
        data_dir: Directory prefix, including the trailing path separator,
            that is concatenated with the ``Room*.csv`` glob pattern.

    Returns:
        dict mapping the three-digit room number (as a string) to that room's
        DataFrame, in sorted file-name order.
    """
    room_pattern = re.compile(r'Room (\d{3})')
    exclude_pattern = "*271*"

    def get_room_number(filename):
        """Return the three-digit room number in the file name, or None."""
        match = room_pattern.search(os.path.basename(filename))
        return match.group(1) if match else None

    # Sort so the room order (and every array derived from it) does not depend
    # on the arbitrary order in which glob lists directory entries.
    csv_files = sorted(glob.glob(data_dir + 'Room*.csv'))

    # Compare the exclusion pattern against the file name only; matching the
    # full path would drop every file if a parent directory contained "271".
    filtered_csv_files = [
        file for file in csv_files
        if not fnmatch.fnmatch(os.path.basename(file), exclude_pattern)
    ]
    print(filtered_csv_files)

    room_numbers = [get_room_number(file) for file in filtered_csv_files]
    print(room_numbers)

    print("--- Dataframe processing ---")
    dataframes = dict()
    for file, room_number in zip(filtered_csv_files, room_numbers):
        if room_number is None:
            # Without a room number the frame would be stored under the key
            # None and could overwrite another unnamed file.
            continue
        df = pd.read_csv(file)
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        # drop=False keeps 'Timestamp' available as a regular column too.
        dataframes[room_number] = df.set_index('Timestamp', drop=False)
    return dataframes

In [3]:
# Building-level metadata that ships with the May export.
meta_data = pd.read_csv("data/may_2020/meta.csv")
print(meta_data)

# One {room number: DataFrame} dict per month, loaded in calendar order.
may_df, june_df, july_df = (
    get_dfs(get_dir_name(month_name)) for month_name in ("may", "june", "july")
)

  building_name           start_time             end_time  total_rooms   
0        NAE-01  2020-05-01 00:00:00  2020-05-31 00:00:00           81  \

  features  line_limit  
0        h          10  
['data/may_2020\\Room 104.csv', 'data/may_2020\\Room 105.csv', 'data/may_2020\\Room 107.csv', 'data/may_2020\\Room 108.csv', 'data/may_2020\\Room 110.csv', 'data/may_2020\\Room 114.csv', 'data/may_2020\\Room 120.csv', 'data/may_2020\\Room 121.csv', 'data/may_2020\\Room 122.csv', 'data/may_2020\\Room 208.csv', 'data/may_2020\\Room 213.csv', 'data/may_2020\\Room 216.csv', 'data/may_2020\\Room 217.csv', 'data/may_2020\\Room 221.csv', 'data/may_2020\\Room 223.csv', 'data/may_2020\\Room 227.csv', 'data/may_2020\\Room 229.csv', 'data/may_2020\\Room 240.csv', 'data/may_2020\\Room 245.csv', 'data/may_2020\\Room 247.csv', 'data/may_2020\\Room 248.csv', 'data/may_2020\\Room 249.csv', 'data/may_2020\\Room 252.csv', 'data/may_2020\\Room 254.csv', 'data/may_2020\\Room 261.csv', 'data/may_2020\\Room 264.

In [4]:
# Preview ten rows (positions 1000-1009) of room 104's May data.
may_df["104"].iloc[1000:1010]

Unnamed: 0_level_0,S.No,Timestamp,Zone Temperature,Warm/Cool Adjust,Supply Vol Press,Actual Cooling Setpt,Actual Heating Setpt,Actual Damper Position,Actual Supply Flow,Reheat Valve Command,Common Setpoint,Damper Command,Actual Sup Flow SP,Cooling Max Flow,Occupied Clg Min,Occupied Htg Flow,Cooling Command,Heating Command,Occupied Command,Occupied Status
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-05-04 11:25:00,1000,2020-05-04 11:25:00,74.013863,2.47019,0.001355,80.498283,68.498283,85.091705,63.800301,0.425049,72.0,-0.49911,0.0,1170.0,351.0,351.0,0.0,0.0,1.0,1.0
2020-05-04 11:30:00,1001,2020-05-04 11:30:00,74.013863,2.47019,0.001417,80.507652,68.507652,12.069869,75.489189,0.425049,72.0,-0.49911,0.0,1170.0,351.0,351.0,0.0,0.0,1.0,1.0
2020-05-04 11:35:00,1002,2020-05-04 11:35:00,74.013863,2.47019,0.001417,80.507652,68.507652,-112.401749,80.70163,0.425049,72.0,-0.49911,0.0,1170.0,351.0,351.0,0.0,0.0,1.0,1.0
2020-05-04 11:40:00,1003,2020-05-04 11:40:00,74.013863,2.47019,0.001417,80.507652,68.507652,98.572052,57.063805,0.425049,72.0,-0.49911,0.0,1170.0,351.0,351.0,0.0,0.0,1.0,1.0
2020-05-04 11:45:00,1004,2020-05-04 11:45:00,74.013863,2.47019,0.000493,80.499619,68.499619,-19.751091,63.800301,0.425049,72.0,-0.49911,0.0,1170.0,351.0,351.0,0.0,0.0,1.0,1.0
2020-05-04 11:50:00,1005,2020-05-04 11:50:00,74.013863,2.47019,0.00154,80.502296,68.502296,-84.563324,61.63702,0.425049,72.0,-0.49911,0.0,1170.0,351.0,351.0,0.0,0.0,1.0,1.0
2020-05-04 11:55:00,1006,2020-05-04 11:55:00,74.013863,2.47019,0.00154,80.499619,68.499619,85.218338,61.63702,0.425049,72.0,-0.49911,0.0,1170.0,351.0,351.0,0.0,0.0,1.0,1.0
2020-05-04 12:00:00,1007,2020-05-04 12:00:00,74.013863,2.47019,0.002341,76.507652,72.499619,106.244537,91.718666,0.425049,72.0,1.302269,351.0,1170.0,351.0,351.0,0.0,0.0,3.0,3.0
2020-05-04 12:05:00,1008,2020-05-04 12:05:00,74.013863,2.47019,0.002341,76.498283,72.498283,22.213974,87.167534,0.425049,72.0,1.302269,351.0,1170.0,351.0,351.0,0.0,0.0,3.0,3.0
2020-05-04 12:10:00,1009,2020-05-04 12:10:00,74.013863,2.47019,0.002341,76.507652,72.507652,-106.969429,85.596481,0.425049,72.0,1.302269,351.0,1170.0,351.0,351.0,0.0,0.0,3.0,3.0


In [5]:
# Stitch the May, June and July frames of each room into one frame per room.
# Rooms are taken from the May dict; a room missing from June or July raises
# KeyError.
dfs = {
    room: pd.concat([monthly[room] for monthly in (may_df, june_df, july_df)])
    for room in may_df
}

In [6]:
def filter_dfs(dfs_list, features, resample_interval='30T', max_zone_temp=100):
    """Clean and resample each room frame, keeping only ``features``.

    For every frame: rows whose 'Zone Temperature' exceeds ``max_zone_temp``
    are discarded, rows containing any NaN are dropped, and the remainder is
    resampled to ``resample_interval`` using the mean of each bin.

    Args:
        dfs_list: Iterable of DataFrames with a datetime index and a
            'Zone Temperature' column.
        features: Column label(s) to keep from each resampled frame.
        resample_interval: pandas offset alias. The legacy minute alias
            (e.g. '30T') is still accepted and translated to '30min'.
        max_zone_temp: Readings strictly above this value are treated as
            sensor glitches and removed before resampling.

    Returns:
        list of resampled DataFrames, one per input frame, in input order.
    """
    # The 'T' minute alias is deprecated since pandas 2.2 in favour of 'min'.
    # Translate it here so callers (and the output paths built from the
    # interval string elsewhere) can keep using the old spelling.
    rule = resample_interval
    if isinstance(resample_interval, str):
        legacy_minutes = re.fullmatch(r'(\d*)T', resample_interval)
        if legacy_minutes:
            rule = f"{legacy_minutes.group(1)}min"

    def resample_clearNan(df):
        # `~(x > limit)` rather than `x <= limit` so that NaN temperatures are
        # left for dropna() to handle together with NaNs in other columns.
        plausible = df[~(df['Zone Temperature'] > max_zone_temp)]
        return plausible.dropna().resample(rule).mean()

    return [resample_clearNan(df)[features] for df in dfs_list]
# Split the {room: DataFrame} dict into two parallel tuples: room keys and frames.
rooms_list, dfs_list = zip(*dfs.items())

In [7]:
def get_temps(dfs_list, resample_interval='30T', room_ids=None):
    """Build one table of resampled zone temperatures with a column per room.

    Each frame is cleaned and resampled via ``filter_dfs``, its
    'Zone Temperature' column is renamed to ``Zone Temperature_<room>``, and
    the frames are inner-joined on their timestamp index. Rows with any NaN
    are dropped from the result.

    Args:
        dfs_list: Sequence of per-room DataFrames.
        resample_interval: Offset alias forwarded to ``filter_dfs``.
        room_ids: Room labels parallel to ``dfs_list``. When omitted, the
            notebook-level ``rooms_list`` is used.

    Returns:
        DataFrame indexed by timestamp with one temperature column per room.

    Raises:
        ValueError: If ``dfs_list`` is empty or there are fewer room labels
            than frames.
    """
    if room_ids is None:
        room_ids = rooms_list  # notebook-level global, parallel to dfs_list
    if len(dfs_list) == 0:
        raise ValueError("dfs_list is empty; nothing to merge")
    if len(room_ids) < len(dfs_list):
        raise ValueError(
            f"got {len(dfs_list)} frames but only {len(room_ids)} room labels"
        )

    temps_x_time_dfs_list = filter_dfs(dfs_list, ['Zone Temperature'], resample_interval)

    # Label every column with its own room before joining. Merge suffixes are
    # only applied when column names collide, so relying on them can leave a
    # column named plain 'Zone Temperature'.
    labelled = [
        temps.rename(columns={'Zone Temperature': f'Zone Temperature_{room}'})
        for room, temps in zip(room_ids, temps_x_time_dfs_list)
    ]

    # join='inner' keeps only timestamps present for every room.
    merged_df = pd.concat(labelled, axis=1, join='inner')
    return merged_df.dropna()

def get_room_features(dfs_list, resample_interval='30T', reference_hour=2):
    """Return per-room feature frames with 'Timestamp' encoded as a number.

    All columns of the first frame except 'S.No' are used as features. Each
    frame is cleaned and resampled via ``filter_dfs``; its 'Timestamp' column
    is then replaced by the circular distance, in hours on a 24-hour clock,
    between the time of day and ``reference_hour``. Rows containing NaN are
    dropped.

    Args:
        dfs_list: Sequence of per-room DataFrames that share the columns of
            ``dfs_list[0]``.
        resample_interval: Offset alias forwarded to ``filter_dfs``.
        reference_hour: Hour of day (0-12 inclusive) that distances are
            measured from; defaults to 2.

    Returns:
        list of DataFrames, one per room, in input order.

    Raises:
        ValueError: If ``reference_hour`` is outside 0-12.
    """
    if not 0 <= reference_hour <= 12:
        raise ValueError("reference_hour must be between 0 and 12 inclusive")

    list_of_features = dfs_list[0].columns.drop('S.No')
    features_rooms = filter_dfs(dfs_list, list_of_features, resample_interval)

    def hours_from_reference(timestamps):
        """Vectorised circular distance (hours) from ``reference_hour``."""
        hour_of_day = (
            timestamps.dt.hour
            + timestamps.dt.minute / 60
            + timestamps.dt.second / 3600
        )
        direct = (hour_of_day - reference_hour).abs()
        # More than 12 h past the reference: the shorter way round the clock
        # goes through midnight.
        wrapped = 24 - hour_of_day + reference_hour
        return direct.mask(hour_of_day > 12 + reference_hour, wrapped)

    # assign() builds a new frame instead of writing into the column-selected
    # frames returned by filter_dfs, avoiding chained-assignment warnings.
    return [
        room.assign(Timestamp=hours_from_reference(room['Timestamp'])).dropna()
        for room in features_rooms
    ]

In [8]:
def get_data_output(rooms_list, resample_interval='30T', test_size=0.2, random_state=42):
    """Build feature/target arrays, split them, and save them as ``.npy`` files.

    Features come from ``get_room_features`` and targets from ``get_temps``;
    both read the notebook-level ``dfs_list``. The feature array is rearranged
    from (rooms, time, features) to (time, rooms, features) so that its first
    axis lines up with the rows of the temperature array.

    Files written:
        preprocessing_output/merged_rooms_list.npy
        preprocessing_output/<interval>/features_rooms_{training,testing}_<interval>.npy
        preprocessing_output/<interval>/temps_{training,testing}_<interval>.npy

    Args:
        rooms_list: Room labels, saved alongside the arrays.
        resample_interval: Offset alias used for resampling and in the output
            directory and file names.
        test_size: Fraction of samples held out for testing.
        random_state: Seed for the shuffle so that the split is reproducible;
            pass None for a different split on every call.

    Returns:
        Tuple ``(features_training, features_testing, temps_training,
        temps_testing)``.

    Raises:
        ValueError: Raised by ``train_test_split`` if the feature and
            temperature arrays do not have the same number of time steps.
    """
    array_room = np.array(rooms_list)
    rooms_and_features = get_room_features(dfs_list, resample_interval)
    array_features_rooms = np.array(rooms_and_features)
    print(array_features_rooms.shape)
    # (rooms, time, features) -> (time, rooms, features)
    array_features_rooms = np.swapaxes(array_features_rooms, 0, 1)
    array_temps = np.array(get_temps(dfs_list, resample_interval))

    dir_path = f"preprocessing_output/{resample_interval}"
    os.makedirs(dir_path, exist_ok=True)
    np.save("preprocessing_output/merged_rooms_list.npy", array_room)

    # Split features and targets in ONE call so both receive the same shuffled
    # row selection. Two separate calls shuffle independently and break the
    # correspondence between a feature row and its temperature row.
    (
        features_rooms_training,
        features_room_testing,
        temps_training,
        temps_testing,
    ) = train_test_split(
        array_features_rooms,
        array_temps,
        test_size=test_size,
        random_state=random_state,
    )

    np.save(f"{dir_path}/features_rooms_training_{resample_interval}.npy", features_rooms_training)
    np.save(f"{dir_path}/features_rooms_testing_{resample_interval}.npy", features_room_testing)
    np.save(f"{dir_path}/temps_training_{resample_interval}.npy", temps_training)
    np.save(f"{dir_path}/temps_testing_{resample_interval}.npy", temps_testing)

    print(f"array_features_size: {array_features_rooms.shape}, array_temps_size: {array_temps.shape}")

    return features_rooms_training, features_room_testing, temps_training, temps_testing

In [10]:
# Generate and save the datasets at 30-, 60- and 120-minute resolution.
# Each call returns (features_training, features_testing, temps_training,
# temps_testing) and writes the arrays under preprocessing_output/<interval>/.
a,b,c,d = get_data_output(rooms_list=rooms_list, resample_interval='30T')
e,f,g,h = get_data_output(rooms_list=rooms_list, resample_interval='60T')
i,j,k,l = get_data_output(rooms_list=rooms_list, resample_interval='120T')

(80, 4148, 19)
array_features_size: (4148, 80, 19), array_temps_size: (4148, 80)
(80, 2077, 19)
array_features_size: (2077, 80, 19), array_temps_size: (2077, 80)
(80, 1040, 19)
array_features_size: (1040, 80, 19), array_temps_size: (1040, 80)
