In [1]:
import pandas as pd
import numpy as np
import glob as glob
import fnmatch
import re
from functools import reduce

In [2]:
# Available names = may, june, july
def get_dir_name(month, year=2020):
    return f"data/{month}_{year}/"

def get_dfs(data_dir):
    csv_files = glob.glob(data_dir + 'Room*.csv')

    def get_room_number(filename):
        pattern = r'Room (\d{3})'
        match = re.search(pattern, filename)
        if match:
            number = match.group(1)
            return number
    exclude_pattern = "*271*"
    filtered_csv_files = [file for file in csv_files if not fnmatch.fnmatch(file, exclude_pattern)]
    print(filtered_csv_files)
    room_numbers = []
    for filename in filtered_csv_files:
        room_numbers.append(get_room_number(filename))
    print(room_numbers)
    print("--- Dataframe processing ---")
    dataframes = dict()
    for file in filtered_csv_files:
        df = pd.read_csv(file)
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df.set_index('Timestamp', inplace=True, drop=False)
        dataframes[get_room_number(file)] = df
    return dataframes

In [3]:
meta_data = pd.read_csv("data/may_2020/meta.csv")
print(meta_data)

may_df = get_dfs(get_dir_name("may"))
june_df = get_dfs(get_dir_name("june"))
july_df = get_dfs(get_dir_name("july"))

  building_name           start_time             end_time  total_rooms   
0        NAE-01  2020-05-01 00:00:00  2020-05-31 00:00:00           81  \

  features  line_limit  
0        h          10  
['data/may_2020\\Room 104.csv', 'data/may_2020\\Room 105.csv', 'data/may_2020\\Room 107.csv', 'data/may_2020\\Room 108.csv', 'data/may_2020\\Room 110.csv', 'data/may_2020\\Room 114.csv', 'data/may_2020\\Room 120.csv', 'data/may_2020\\Room 121.csv', 'data/may_2020\\Room 122.csv', 'data/may_2020\\Room 208.csv', 'data/may_2020\\Room 213.csv', 'data/may_2020\\Room 216.csv', 'data/may_2020\\Room 217.csv', 'data/may_2020\\Room 221.csv', 'data/may_2020\\Room 223.csv', 'data/may_2020\\Room 227.csv', 'data/may_2020\\Room 229.csv', 'data/may_2020\\Room 240.csv', 'data/may_2020\\Room 245.csv', 'data/may_2020\\Room 247.csv', 'data/may_2020\\Room 248.csv', 'data/may_2020\\Room 249.csv', 'data/may_2020\\Room 252.csv', 'data/may_2020\\Room 254.csv', 'data/may_2020\\Room 261.csv', 'data/may_2020\\Room 264.

In [4]:
# concatenate_dict
dfs = {key: pd.concat([may_df[key], june_df[key], july_df[key]]) for key in may_df}

In [5]:
def filter_dfs(dfs_list, features, resample_interval='30T'):
    def resample_clearNan(df, resample_interval):
        filtered_df = df[~(df['Zone Temperature'] > 100)]
        return filtered_df.dropna().resample(resample_interval).mean()
    filtered_dfs = [resample_clearNan(x, resample_interval)[features] for x in dfs_list]
    return filtered_dfs
rooms_list, dfs_list = zip(*dfs.items())

In [6]:
def get_temps(dfs_list, resample_interval='30T'):
    list_of_features = ['Zone Temperature']
    temps_x_time_dfs_list = filter_dfs(dfs_list, list_of_features, resample_interval)
    merged_df = temps_x_time_dfs_list[0]
    merged_df = merged_df.rename(columns={'Zone Temperature':f'Zone Temperature_{rooms_list[0]}'})
    for i in range(1, len(temps_x_time_dfs_list)):
        merged_df = pd.merge(merged_df, temps_x_time_dfs_list[i], left_index=True, right_index=True, suffixes=(f"_{rooms_list[i-1]}", f"_{rooms_list[i]}"))
    merged_df = merged_df.dropna()
    merged_df[merged_df.isna().any(axis=1)]
    return merged_df

def get_room_features(dfs_list, resample_interval='30T'):
    list_of_features = dfs_list[0].columns.drop('S.No')
    print(list_of_features)
    features_rooms = filter_dfs(dfs_list, list_of_features, resample_interval)
    # print(features_rooms[0]['Timestamp'][0])

    def series_convert_time_to_num(timestamp):
        return timestamp.dt.hour + timestamp.dt.minute/60 + timestamp.dt.second/3600

    def away_from_time(num, time=2):
        assert time <= 12 and time >= 0
        if num > 12 + time:
            num = 24 - num + time
        else:
            num = np.abs(num - time)
        return num

    for room in features_rooms:
        room['Timestamp'] = series_convert_time_to_num(room['Timestamp']).apply(lambda x: away_from_time(x, 2))
    
    rooms_and_features_list = list()
    for room in features_rooms:
        room = room.drop(room.index[-1]).dropna()
        rooms_and_features_list.append(room)
    return rooms_and_features_list

In [7]:
def get_data_output(rooms_list, resample_interval='30T'):
    array_room = np.array(rooms_list)
    rooms_and_features = get_room_features(dfs_list, resample_interval)
    array_features_rooms = np.array(rooms_and_features)
    array_temps = np.array(get_temps(dfs_list, resample_interval))
    np.save(f"preprocessing_output/merged_rooms_list.npy", array_room)
    np.save(f"preprocessing_output/merged_features_rooms_{resample_interval}.npy", array_features_rooms)
    np.save(f"preprocessing_output/merged_temps_time_{resample_interval}.npy", array_temps)
    return array_features_rooms, array_temps

In [8]:
a,b = get_data_output(rooms_list=rooms_list, resample_interval='30T')
get_data_output(rooms_list=rooms_list, resample_interval='60T')
get_data_output(rooms_list=rooms_list, resample_interval='120T')

Index(['Timestamp', 'Zone Temperature', 'Warm/Cool Adjust', 'Supply Vol Press',
       'Actual Cooling Setpt', 'Actual Heating Setpt',
       'Actual Damper Position', 'Actual Supply Flow', 'Reheat Valve Command',
       'Common Setpoint', 'Damper Command', 'Actual Sup Flow SP',
       'Cooling Max Flow', 'Occupied Clg Min', 'Occupied Htg Flow',
       'Cooling Command', 'Heating Command', 'Occupied Command',
       'Occupied Status'],
      dtype='object')
Index(['Timestamp', 'Zone Temperature', 'Warm/Cool Adjust', 'Supply Vol Press',
       'Actual Cooling Setpt', 'Actual Heating Setpt',
       'Actual Damper Position', 'Actual Supply Flow', 'Reheat Valve Command',
       'Common Setpoint', 'Damper Command', 'Actual Sup Flow SP',
       'Cooling Max Flow', 'Occupied Clg Min', 'Occupied Htg Flow',
       'Cooling Command', 'Heating Command', 'Occupied Command',
       'Occupied Status'],
      dtype='object')
Index(['Timestamp', 'Zone Temperature', 'Warm/Cool Adjust', 'Supply Vol Pres

(array([[[ 3.79166667e+00,  7.59069157e+01,  2.47019005e+00, ...,
           0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
         [ 5.08333333e+00,  7.53342688e+01,  2.47019005e+00, ...,
           0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
         [ 6.95833333e+00,  7.47698364e+01,  2.47019005e+00, ...,
           0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
         ...,
         [ 9.04166667e+00,  7.24322433e+01, -8.44084024e-01, ...,
           0.00000000e+00,  3.00000000e+00,  3.00000000e+00],
         [ 7.04166667e+00,  7.29996777e+01, -8.44084024e-01, ...,
           0.00000000e+00,  3.00000000e+00,  3.00000000e+00],
         [ 5.04166667e+00,  7.34405746e+01, -8.44084024e-01, ...,
           0.00000000e+00,  3.00000000e+00,  3.00000000e+00]],
 
        [[ 3.79166667e+00,  7.14588242e+01,  3.89255905e+00, ...,
           0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
         [ 5.08333333e+00,  7.12430147e+01,  3.89255905e+00, ...,
           4.77343618