In [152]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Flatten, Dense,LayerNormalization
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras import backend as K
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from numpy import argmax
from tensorflow.keras.layers import MultiHeadAttention

def preprocess_data(df):
    # Remove columns where their last row is null
    df = df.drop(columns=df.columns[df.iloc[-1].isnull()])

    # Remove columns with more than 80% NaN values and fill others with mean
    threshold = 0.8 * len(df)
    df = df.dropna(thresh=threshold, axis=1)
    df = df.fillna(df.mean())
    
    # Pad columns to have 496 rows, with last row unchanged
    padding_len = 497 - len(df)
    padding = pd.DataFrame(0, index=np.arange(padding_len), columns=df.columns)
    df = pd.concat([padding, df], axis=0)
    
    return df

def load_data(*files):
    dataframes = []

    for file in files:
        # Load dataframe
        df = pd.read_csv(file, header=0)
        
        # Preprocess the data
        df = preprocess_data(df)
        # Add a prefix to each column name based on the file name
        prefix = file.split('.')[0]  # Assuming the file name is '11_2016.csv', this gets '11_2016'
        df.columns = [f"{prefix}_{col}" for col in df.columns]
        
        # Reset index after preprocessing to ensure unique indices
        df.reset_index(drop=True, inplace=True)

        dataframes.append(df)

    # Concatenate all preprocessed dataframes
    result = pd.concat(dataframes, axis=1)

    return result



# Build the LSTM model for multi-class classification
def build_classifier_model(input_shape, num_classes):
    model = Sequential()
    model.add(Bidirectional(LSTM(units=290, input_shape=(1, 496), return_sequences=True)))     
    #model.add(MultiHeadAttention(num_heads=2, key_dim=290))
    #model.add(LayerNormalization(epsilon=1e-6))  # Layer normalization can help stabilize the outputs
    # Add another LSTM layer with 120 units
    model.add(LSTM(120, return_sequences=True))
    
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))  # softmax for multi-class
    model.compile(optimizer='rmsprop', loss="categorical_crossentropy", metrics=['accuracy', f1_score])
    return model

# F1 Score Custom Metric
def f1_score(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision_val = precision(y_true, y_pred)
    recall_val = recall(y_true, y_pred)
    return 2 * ((precision_val * recall_val) / (precision_val + recall_val + K.epsilon()))




# Main script execution
if __name__ == '__main__':
    # Load data
    # Load data
    files = ['11_2016.csv', '12_2016.csv', '01_2017.csv', '02_2017.csv', '03_2018.csv',
         '12_2017.csv', '01_2018.csv', '02_2018.csv', '03_2018.csv']
    df = load_data(*files)
    
    # Handle missing values, for example, by replacing them
    #df.fillna(method='ffill', inplace=True)  # Forward fill
    # Assuming single_file_result is your dataframe

# Get unique values from row 496
    unique_values_row_496 = df.iloc[496].unique()

# Filter out the expected values
    unexpected_values = [value for value in unique_values_row_496 if value not in [-3, -2, -1, 0, 1, 2, 3]]

    print(df.tail())
    df = df.T
    caler2 = StandardScaler()
    # Use the initial 200 rows for training
    training_set = df.iloc[:220, :]
    X_train = training_set.iloc[:, :-1].values
    X_train = scaler2.fit_transform(X_train)
    y_train = training_set.iloc[:, -1].values
    # Reshape the X_train
    num_samples_train, num_features_train = X_train.shape
    X_train = np.reshape(X_train, (num_samples_train, 1, num_features_train))
    # Prepare the testing set, using the remaining rows (from 200 to 268)
    testing_set = df.iloc[220:268, :-1]
    X_test = scaler2.fit_transform(testing_set)

    
    y_test = df.iloc[220:268, -1].values
    #   Reshape the X_test
    num_samples_test = X_test.shape[0]
    X_test = np.reshape(X_test, (num_samples_test, 1, 496))
    # Convert y_train and y_test to categorical
    num_classes = len(np.unique(y_train))
    y_train = y_train.astype(int)
    num_classes2 = len(np.unique(y_test))
    y_test = y_test.astype(int)
    # Convert labels to categorical
    y_train = to_categorical(y_train, num_classes=7)  # Assuming 5 classes
    y_test = to_categorical(y_test, num_classes=7)
    # Train the model
    model = build_classifier_model(496,7) # Assuming you have a function called build_classifier_model
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=40, batch_size=64)

    # You can collect metrics or save models, weights, etc., during/after each iteration if required.

    loss, accuracy, f1score = model.evaluate(X_test, y_test, verbose=0)
    print('Test Accuracy: %.2f%%' % (accuracy * 100))
    print('Test F1 Score: %.2f' % f1score)
   


     11_2016_bedroom_ID1  11_2016_bedroom_ID12  11_2016_bedroom_ID14  \
492               16.936                18.746                19.983   
493               16.623                18.746                20.046   
494               16.372                18.809                20.108   
495               16.122                18.934                20.108   
496                3.000                 1.000                 1.000   

     11_2016_bedroom_ID16  11_2016_bedroom_ID23  11_2016_bedroom_ID26  \
492                22.181                19.503                19.751   
493                21.493                20.191                18.938   
494                21.993                20.566                19.438   
495                22.431                20.629                19.751   
496                 1.000                 1.000                 1.000   

     11_2016_bedroom_ID33  11_2016_bedroom_ID41  11_2016_living_room_ID1  \
492                14.333                16.436     

In [155]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Flatten, Dense,LayerNormalization
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras import backend as K
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score
from numpy import argmax
from tensorflow.keras.layers import MultiHeadAttention

def preprocess_data(df):
    # Remove columns where their last row is null
    df = df.drop(columns=df.columns[df.iloc[-1].isnull()])

    # Remove columns with more than 80% NaN values and fill others with mean
    threshold = 0.8 * len(df)
    df = df.dropna(thresh=threshold, axis=1)
    df = df.fillna(df.mean())
    
    # Pad columns to have 496 rows, with last row unchanged
    padding_len = 497 - len(df)
    padding = pd.DataFrame(0, index=np.arange(padding_len), columns=df.columns)
    df = pd.concat([padding, df], axis=0)
    
    return df

def load_data(*files):
    dataframes = []

    for file in files:
        # Load dataframe
        df = pd.read_csv(file, header=0)
        
        # Preprocess the data
        df = preprocess_data(df)
        # Add a prefix to each column name based on the file name
        prefix = file.split('.')[0]  # Assuming the file name is '11_2016.csv', this gets '11_2016'
        df.columns = [f"{prefix}_{col}" for col in df.columns]
        
        # Reset index after preprocessing to ensure unique indices
        df.reset_index(drop=True, inplace=True)

        dataframes.append(df)

    # Concatenate all preprocessed dataframes
    result = pd.concat(dataframes, axis=1)

    return result



# Build the LSTM model for multi-class classification
def build_classifier_model(input_shape, num_classes):
    model = Sequential()
    model.add(Bidirectional(LSTM(units=290, input_shape=(1, 496), return_sequences=True)))     
    #model.add(MultiHeadAttention(num_heads=2, key_dim=290))
    #model.add(LayerNormalization(epsilon=1e-6))  # Layer normalization can help stabilize the outputs
    # Add another LSTM layer with 120 units
    model.add(LSTM(120, return_sequences=True))
    
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))  # softmax for multi-class
    model.compile(optimizer='rmsprop', loss="categorical_crossentropy", metrics=['accuracy', f1_score])
    return model

# F1 Score Custom Metric
def f1_score(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision_val = precision(y_true, y_pred)
    recall_val = recall(y_true, y_pred)
    return 2 * ((precision_val * recall_val) / (precision_val + recall_val + K.epsilon()))

if __name__ == '__main__':
    # Load data
    files = ['11_2016.csv', '12_2016.csv', '01_2017.csv', '02_2017.csv', '03_2018.csv',
             '12_2017.csv', '01_2018.csv', '02_2018.csv', '03_2018.csv']
    df = load_data(*files)
    df = df.T
    # Setup KFold Cross Validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    accuracies = []
    f1scores = []

    for train_index, test_index in kfold.split(df):
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Data preprocessing
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Convert y_train and y_test to categorical
        y_train, y_test = y_train.astype(int), y_test.astype(int)
        y_train = to_categorical(y_train, num_classes=7)
        y_test = to_categorical(y_test, num_classes=7)

        # Reshape data for LSTM
        X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

        # Train the model
        model = build_classifier_model(X_train.shape[2], 7)
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=40, batch_size=64, verbose=0)

        # Evaluate the model
        loss, accuracy, f1score = model.evaluate(X_test, y_test, verbose=0)
        accuracies.append(accuracy)
        f1scores.append(f1score)

    print('Average Accuracy: %.2f%%' % (np.mean(accuracies) * 100))
    print('Average F1 Score: %.2f' % np.mean(f1scores))




Average Accuracy: 54.84%
Average F1 Score: 0.55


In [153]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Flatten, Dense,LayerNormalization
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras import backend as K
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from numpy import argmax
from tensorflow.keras.layers import MultiHeadAttention

def preprocess_data(df):
    # Remove columns where their last row is null
    df = df.drop(columns=df.columns[df.iloc[-1].isnull()])

    # Remove columns with more than 80% NaN values and fill others with mean
    threshold = 0.8 * len(df)
    df = df.dropna(thresh=threshold, axis=1)
    df = df.fillna(df.mean())
    
    # Pad columns to have 496 rows, with last row unchanged
    padding_len = 498 - len(df)
    padding = pd.DataFrame(0, index=np.arange(padding_len), columns=df.columns)
    df = pd.concat([padding, df], axis=0)
    
    return df



def get_mean_ext(year, month, house_id, temp_df):
    # Create a mask to filter rows based on year and month
    mask = (pd.to_datetime(temp_df['Date']).dt.year == year) & \
           (pd.to_datetime(temp_df['Date']).dt.month == month)
    
    # Check if the specific HouseID exists in the dataset
    if house_id in temp_df['HouseID'].unique():
        # If it exists, retrieve the t_mean_ext value for the given month, year, and HouseID
        specific_id_values = temp_df[mask & (temp_df['HouseID'] == house_id)]['t_mean_ext'].values
        if specific_id_values.size > 0:
            return specific_id_values[0]
    
    # If the specific HouseID doesn't exist or there's no value for the given month and year, 
    # compute the mean t_mean_ext across all HouseIDs for that month and year
    all_id_values = temp_df[mask]['t_mean_ext'].values
    if all_id_values.size > 0:
        return np.mean(all_id_values)
    else:
        # If no records are found at all for the given month and year, return 0
        return 0




def load_data(temperature_file, *files):
    dataframes = []

    # Load Temperatures.csv
    temp_df = pd.read_csv("Winter_thermal_comfort_dataset/Temperatures.csv")

    for file in files:
        # Extract year, month, and HouseID from the filename
        split_name = file.split('.')[0].split('_')
        year = int(split_name[1])
        month = int(split_name[0])
        house_id = int(split_name[-1].replace('ID', ''))

        # Load dataframe
        df = pd.read_csv(file, header=0)
        
        # Preprocess the data
        df = preprocess_data(df)

        # Add a prefix to each column name based on the file name
        prefix = file.split('.')[0]
        df.columns = [f"{prefix}_{col}" for col in df.columns]

        # Get t_mean_ext value
        mean_ext_value = get_mean_ext(year, month, house_id, temp_df)

        # Construct the new row
        new_row_data = []
        for col in df.columns:
            if f"ID{house_id}" in col and f"_{month}_" in col and f"_{year}_" in col:
                # If the column matches the current ID, month, and year, set its value to t_mean_ext
                new_row_data.append(mean_ext_value)
            else:
                # Otherwise, just duplicate the value from the second to last row of the original df
                new_row_data.append(df.iloc[-2][col])
        new_row = pd.DataFrame([new_row_data], columns=df.columns)

        # Split the dataframe, concatenate, and reset index
        df1 = df.iloc[:-1]
        df2 = df.iloc[-1:]
        df = pd.concat([df1, new_row, df2], axis=0).reset_index(drop=True)

        dataframes.append(df)

    # Concatenate all the dataframes in the list
    result = pd.concat(dataframes, axis=1)
    return result










# Build the LSTM model for multi-class classification
def build_classifier_model(input_shape, num_classes):
    model = Sequential()
    model.add(Bidirectional(LSTM(units=290, input_shape=(1, 497), return_sequences=True)))     
    #model.add(MultiHeadAttention(num_heads=2, key_dim=290))
    #model.add(LayerNormalization(epsilon=1e-6))  # Layer normalization can help stabilize the outputs
    # Add another LSTM layer with 120 units
    model.add(LSTM(120, return_sequences=True))
    
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))  # softmax for multi-class
    model.compile(optimizer='rmsprop', loss="categorical_crossentropy", metrics=['accuracy', f1_score])
    return model

# F1 Score Custom Metric
def f1_score(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    precision_val = precision(y_true, y_pred)
    recall_val = recall(y_true, y_pred)
    return 2 * ((precision_val * recall_val) / (precision_val + recall_val + K.epsilon()))




# Main script execution
if __name__ == '__main__':
    # Load data
    # Load data
    files = ['11_2016.csv', '12_2016.csv', '01_2017.csv', '02_2017.csv', '03_2018.csv',
         '12_2017.csv', '01_2018.csv', '02_2018.csv', '03_2018.csv']
    df = load_data(*files)
    
    # Handle missing values, for example, by replacing them
    #df.fillna(method='ffill', inplace=True)  # Forward fill
    # Assuming single_file_result is your dataframe
    print(df.tail())
# Get unique values from row 496
    unique_values_row_496 = df.iloc[497].unique()

# Filter out the expected values
    unexpected_values = [value for value in unique_values_row_496 if value not in [-3, -2, -1, 0, 1, 2, 3]]
    print(unexpected_values)
    df = df.T

    caler2 = StandardScaler()
    # Use the initial 200 rows for training
    training_set = df.iloc[:200, :]
    X_train = training_set.iloc[:, :-1].values
    X_train = scaler2.fit_transform(X_train)
    y_train = training_set.iloc[:, -1].values
    # Reshape the X_train
    num_samples_train, num_features_train = X_train.shape
    X_train = np.reshape(X_train, (num_samples_train, 1, num_features_train))
    # Prepare the testing set, using the remaining rows (from 200 to 268)
    testing_set = df.iloc[200:268, :-1]
    X_test = scaler2.fit_transform(testing_set)

    y_test = df.iloc[200:268, -1].values
    #   Reshape the X_test
    num_samples_test = X_test.shape[0]

    X_test = np.reshape(X_test, (num_samples_test, 1, 497))
    #print(num_samples_test)
    # Convert y_train and y_test to categorical
    num_classes = len(np.unique(y_train))
    y_train = y_train.astype(int)
    num_classes2 = len(np.unique(y_test))
    y_test = y_test.astype(int)
    # Convert labels to categorical
    #print(np.unique(y_train))

    y_train = to_categorical(y_train, num_classes=7)  # Assuming 7 classes
    y_test = to_categorical(y_test, num_classes=7)
    # Train the model
    model = build_classifier_model(497,7) # Assuming you have a function called build_classifier_model
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=40, batch_size=64)

    # You can collect metrics or save models, weights, etc., during/after each iteration if required.

    loss, accuracy, f1score = model.evaluate(X_test, y_test, verbose=0)
    print('Test Accuracy: %.2f%%' % (accuracy * 100))
    print('Test F1 Score: %.2f' % f1score)

     12_2016_bedroom_ID1  12_2016_bedroom_ID12  12_2016_bedroom_ID14  \
494               14.430                19.872                20.359   
495               14.305                19.935                20.296   
496               14.180                19.935                20.108   
497               14.180                19.935                20.108   
498                1.000                -2.000                 1.000   

     12_2016_bedroom_ID16  12_2016_bedroom_ID23  12_2016_bedroom_ID26  \
494                22.681                20.816                17.874   
495                22.994                20.754                17.687   
496                23.119                20.816                17.499   
497                23.119                20.816                17.499   
498                 1.000                 1.000                 1.000   

     12_2016_bedroom_ID33  12_2016_bedroom_ID41  12_2016_living_room_ID1  \
494                18.027                17.563     

ValueError: cannot reshape array of size 25398 into shape (51,1,497)