In [3]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook
pd.set_option("mode.chained_assignment", None)

In [4]:
# Load the raw exports: sensor features, energy readings and the two weather tables
features_0 = pd.read_csv("Features.csv")
features_1 = pd.read_csv("Features2022.csv")
features_2 = pd.read_csv("Features2022_1.csv")

energy_0 = pd.read_excel("Energy.xlsx", header=1)
energy_1 = pd.read_csv("Energy2022.csv")
energy_2 = pd.read_csv("Energy2022_1.csv")

humidity = pd.read_csv("Humidity.csv")
irradiance = pd.read_csv("Irradiance.csv")

# Energy2022_1.csv calls its timestamp column "Time"; rename it to "Aika",
# the column name the energy processing below reads.
energy_2 = energy_2.rename(columns={"Time": "Aika"})

# Stack the 2022 exports vertically (row-wise). The original row labels are kept,
# so the combined frames may contain repeated index labels.
# NOTE: features_0 is loaded but not part of this concatenation.
features = pd.concat([features_1, features_2], axis=0)
energy = pd.concat([energy_1, energy_2], axis=0)

# It is highly recommended to check the concatenated frames for duplicated rows.
# That check should be keyed on the time (Aika) column, but the source files
# may not share one standard timestamp format.

In [5]:
# --- Excel export (energy_0) -------------------------------------------------
# Strip the literal "+3" marker (presumably the UTC offset -- confirm against
# Energy.xlsx), then cut the first three characters and the last character so
# the remaining text matches '%d.%m.%Y %H.%M.%S'.
# A raw string is used for the pattern: "\+" inside a normal string literal is
# an invalid escape sequence and triggers a SyntaxWarning on recent Pythons.
energy_0["Aika"] = energy_0["Aika"].str.replace(r"\+3", "", regex=True)
energy_0["Aika"] = energy_0["Aika"].str[3:-1]
energy_0["Aika"] = pd.to_datetime(energy_0["Aika"], format='%d.%m.%Y %H.%M.%S')
# Wall-clock Helsinki time -> timezone-aware UTC
energy_0["Time"] = (
    energy_0["Aika"]
    .dt.tz_localize('Europe/Helsinki', ambiguous='infer')
    .dt.tz_convert('UTC')
)

# --- CSV exports (energy) ----------------------------------------------------
# Same conversion for the concatenated 2022 CSV data, which uses ISO-like timestamps.
energy["Aika"] = pd.to_datetime(energy["Aika"], format='%Y-%m-%d %H:%M:%S')
energy["Time"] = (
    energy["Aika"]
    .dt.tz_localize('Europe/Helsinki', ambiguous='infer')
    .dt.tz_convert('UTC')
)

# Combine both sources into one chronologically ordered table.
# Duplicates are detected on the UTC "Time" key rather than on whole rows:
# the two sources may differ in other columns, and any repeated timestamp here
# would multiply rows when the energy is later left-merged onto the features.
# The CSV rows come first in the concat, so they win over the Excel rows.
energy = (
    pd.concat([energy, energy_0])
    .drop_duplicates(subset="Time")
    .sort_values("Time")
    .reset_index(drop=True)
)

In [6]:
# Rooms that have both a temperature sensor (suffix _TE) and a CO2 sensor (suffix _QE)
room_ids = ["S111", "S112", "S114", "S115", "S116", "S117", "S119", "S125",
            "S201", "S202", "S210", "S213", "S213b", "S214", "S214b", "S215",
            "S301", "S307", "S310", "S311", "S311b", "S312", "S313", "S316"]

# Per-room temperature and CO2 observations
temperatures = features[[f"_S_talo_{room}_TE" for room in room_ids]]
qualities = features[[f"_S_talo_{room}_QE" for room in room_ids]]

# These CO2 sensors are multiplied by 1000 to bring them onto the same scale
# as the remaining rooms (presumably they report in a different unit -- confirm).
rescaled_rooms = ["S111", "S114", "S115", "S116", "S117", "S119", "S213b", "S214b"]
for room in rescaled_rooms:
    co2_column = f"_S_talo_{room}_QE"
    qualities.loc[:, co2_column] = qualities.loc[:, co2_column] * 1000

# Row-wise mean over all rooms
temperatures.loc[:, "Inside_temperature"] = temperatures.mean(axis=1)
qualities.loc[:, "CO2_concentration"] = qualities.mean(axis=1)

# Remaining building-level sensors: raw column name -> readable name
sensor_names = {
    '_S_talo_S_LV02_TE03': 'Floor_network_2_temperature',
    '_S_talo_S_LV01_TE03': 'Radiator_network_1_temperature',
    '_S_talo_S_LIV01_TE03': 'Ventilation_network_1_temperature',
    '_S_talo_U_TE90': 'Outside_temperature_1',
    '_S_talo_S_KLV01_TE01': 'District_heat_temperature',
    '_S_talo_S_LKV01_TV01': 'Domestic_water_network_1_primary_valve',
    '_S_talo_S_LKV01_TE03': 'Domestic_water_network_1_temperature',
}
features = features[['Time', *sensor_names]].rename(columns=sensor_names)

# Put the building-level sensors and the two room averages side by side
features = pd.concat(
    [features, temperatures["Inside_temperature"], qualities["CO2_concentration"]],
    axis=1,
)

# Feature timestamps are parsed and labelled as UTC
features["Time"] = (
    pd.to_datetime(features["Time"], format='%Y-%m-%d %H:%M:%S')
    .dt.tz_localize('UTC', ambiguous='infer')
)

# energy["Time"] must already be timezone-aware UTC at this point
# (the energy cell converts it from Europe/Helsinki).

# Attach the energy reading to every feature row with a matching timestamp
result = features.merge(energy[["kWh", "Time"]], how="left", on="Time")
result = result.rename(columns={"kWh": "Energy_consumption"})
# Merge the humidity and irradiance datasets with the general dataset
def data_merge(main_dataset, dataset, parameter):
    """
    Left-join one column of a weather table onto the main dataset by timestamp.

    The weather table stores its timestamp split over the columns "Year", "m",
    "d" (numbers) and "Time" (text such as "13:00"). These are combined into a
    single UTC-localised timestamp that is matched against
    main_dataset["Time"].

    Input:
        main_dataset: DataFrame with a timezone-aware (UTC) "Time" column
        dataset: DataFrame with the columns "Year", "m", "d", "Time" and `parameter`
        parameter: name of the column of `dataset` to attach
    Output:
        a new DataFrame: `main_dataset` plus the `parameter` column
        (NaN where no timestamp matches)

    `dataset` itself is left untouched, so the function can be called again on
    the same table (e.g. when the cell is re-run).
    """
    # The pieces are joined with "-", so the parsing format must use "-" as well.
    # (A '%Y.%m.%d' format only worked with the lenient parser of old pandas versions.)
    timestamp_text = (
        dataset["Year"].astype(str) + "-"
        + dataset["m"].astype(str) + "-"
        + dataset["d"].astype(str) + " "
        + dataset["Time"]
    )
    timestamps = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M')

    # Build a separate two-column lookup table instead of overwriting dataset["Time"]
    lookup = pd.DataFrame({
        parameter: dataset[parameter],
        "Time": timestamps.dt.tz_localize('UTC', ambiguous='infer'),
    })
    return pd.merge(main_dataset, lookup, how="left", on="Time")

# Attach outside humidity and solar irradiance to the main dataset
result = data_merge(result, humidity, "Relative humidity (%)")
result = data_merge(result, irradiance, "Direct solar radiation (W/m2)")

# Rename the remaining columns
result = result.rename(columns={
    "Direct solar radiation (W/m2)": "Solar_irradiance",
    "Relative humidity (%)": "Outside_humidity",
    "Outside_temperature_1": "Outside_temperature_average",
})

# Detect and remove the window in which the sensors report exactly zero
# (outside temperature and floor-network temperature both 0).
# The removed rows stay available in `datana` for inspection.
sensors_at_zero = (
    (result["Outside_temperature_average"] == 0)
    & (result["Floor_network_2_temperature"] == 0)
)
datana = result[sensors_at_zero]
result = result[~sensors_at_zero]

# Drop every row that has a missing value in any column
result = result.dropna()

# Remove observations recorded while the network was under maintenance.
# NOTE(review): the bounds are plain strings compared against the timezone-aware
# "Time" column -- presumably they are interpreted in that column's timezone; confirm.
under_maintenance = (
    (result["Time"] >= "2022-02-11 12:00:00")
    & (result["Time"] < "2022-02-11 17:00:00")
)
result = result[~under_maintenance]

# Renumber the rows only now, after ALL row filters. Resetting before the last
# drops would leave gaps in the index, and later code that treats index labels
# as row positions would then read the wrong rows.
result = result.reset_index(drop=True)

In [10]:
# Create a temperature delay column
def temp_delay(delay, data=None):
    '''
    Add a "Delayed_temperature" column holding the inside temperature found
    `delay` rows further down the table.

    For each row, the value is taken from the row `delay` positions ahead, but
    only if that row's "Time" is exactly `delay` hours later. Otherwise (a gap
    in the series, or the last `delay` rows) the row keeps its own
    "Inside_temperature".

    NOTE(review): the shift direction is unchanged from the earlier
    implementation, which looked `delay` rows ahead although its docstring
    spoke of the "previous" temperature -- confirm which direction is wanted.

    Input:
        delay: time delay in hours / rows, as int
        data: DataFrame with "Time" and "Inside_temperature" columns;
              defaults to the notebook-level `result` frame
    Output:
        None; the column is written into the frame in place
    '''
    frame = result if data is None else data

    temperature = frame["Inside_temperature"]
    # shift() moves values by row position and keeps the index, so the series
    # below stay aligned whatever the index labels look like.
    shifted_temperature = temperature.shift(-delay)
    elapsed = frame["Time"].shift(-delay) - frame["Time"]

    # Compare the full time difference: looking at the hour component alone
    # would also accept gaps such as "1 day 4 hours". Rows without a partner
    # have a missing difference and therefore compare as False.
    is_contiguous = elapsed == pd.Timedelta(hours=delay)

    frame["Delayed_temperature"] = shifted_temperature.where(is_contiguous, temperature)
    
# Build the "Delayed_temperature" column with a delay of 4 (hours)
temp_delay(delay=4)

In [7]:
# Discard every row that still has a missing value in any column.
# Dropping rows can leave gaps in the time series; check the resulting frame for
# gaps when needed, since the RNN intervals only work properly on gap-free data.
result = result.dropna(axis=0, how="any")

# Write the prepared dataset to CSV (without the index column)
result.to_csv("s-building_data.csv", index=False)