# In [None]:
import math
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Flatten, Dense,LayerNormalization
from keras.layers import Input
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras import backend as K
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score
from numpy import argmax
from tensorflow.keras.layers import MultiHeadAttention

def preprocess_data(df, target_len=497):
    """Clean a raw frame and left-pad it with column means to a fixed length.

    Steps, in order:

    1. Drop every column whose value in the last row is null.
    2. Drop every column that has fewer than 80% non-null values.
    3. Fill the remaining NaNs with the mean of their column.
    4. Prepend rows made of the column means until the frame has
       ``target_len`` rows.

    Args:
        df: Input DataFrame with at least one row (the last row is inspected).
        target_len: Number of rows the result is padded up to. Defaults to
            497, the length the original script was written for.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index. Frames that already
        have ``target_len`` rows or more are not truncated; they are returned
        with their cleaned rows only.

    Raises:
        IndexError: If ``df`` has no rows.
    """
    # Remove columns where the last row is null.
    df = df.drop(columns=df.columns[df.iloc[-1].isnull()])

    # Keep only columns with at least 80% non-null values (i.e. drop columns
    # with more than 20% NaN), then fill the remaining gaps with the mean.
    min_non_null = 0.8 * len(df)
    df = df.dropna(thresh=min_non_null, axis=1)
    df = df.fillna(df.mean())

    # Means of the already-filled columns are used as the padding value.
    column_means = df.mean()
    padding_len = target_len - len(df)

    # Only build the padding frame when there is something to pad; the
    # padding goes in front so the original rows stay at the end.
    if padding_len > 0:
        padding = pd.DataFrame([column_means] * padding_len)
        df = pd.concat([padding, df], axis=0)

    return df.reset_index(drop=True)

def load_data(*files):
    """Load, preprocess and column-wise concatenate several CSV files.

    Each file is read with its first line as the header, passed through
    ``preprocess_data``, and its columns are renamed to
    ``"<prefix>_<column>"`` so that columns from different files stay
    distinguishable after concatenation.

    Args:
        *files: Paths of the CSV files to load.

    Returns:
        A DataFrame holding the preprocessed columns of all files side by
        side (``axis=1``).

    Raises:
        ValueError: Raised by ``pd.concat`` when no files are given.
    """
    dataframes = []

    for file in files:
        # Load and preprocess the data.
        df = preprocess_data(pd.read_csv(file, header=0))

        # Prefix = base file name up to its first dot ('11_2016.csv' ->
        # '11_2016'). Taking the basename first keeps directory components
        # and a leading './' out of the prefix.
        prefix = os.path.basename(os.fspath(file)).split('.')[0]
        df.columns = [f"{prefix}_{col}" for col in df.columns]

        # Reset the index so all frames align row-by-row when concatenated.
        dataframes.append(df.reset_index(drop=True))

    # Concatenate all preprocessed dataframes side by side.
    return pd.concat(dataframes, axis=1)



# Build the LSTM model for multi-class classification
def build_classifier_model(input_shape, num_classes):
    """Build and compile the BiLSTM -> LSTM -> Dense softmax classifier.

    Args:
        input_shape: Either the number of features per timestep (an int, in
            which case a single timestep is assumed, i.e. ``(1, features)``)
            or the full per-sample shape ``(timesteps, features)``.
        num_classes: Number of output classes (size of the softmax layer).

    Returns:
        A compiled ``Sequential`` model using RMSprop, categorical
        cross-entropy, and the ``accuracy`` and module-level ``f1_score``
        metrics.
    """
    # The caller passes the feature count; accept a full shape tuple as well.
    if isinstance(input_shape, (int, np.integer)):
        sequence_shape = (1, int(input_shape))
    else:
        sequence_shape = tuple(input_shape)

    model = Sequential()
    # Declare the input explicitly. An `input_shape` kwarg on the LSTM nested
    # inside Bidirectional is not seen by Sequential, so the previous
    # hard-coded (1, 496) never took effect.
    model.add(Input(shape=sequence_shape))
    model.add(Bidirectional(LSTM(units=490, return_sequences=True)))

    # Add another LSTM layer with 120 units
    model.add(LSTM(120, return_sequences=True))

    # Flatten the (timesteps, 120) sequence output for the dense classifier.
    model.add(Flatten())
    model.add(Dense(num_classes, activation='softmax'))  # softmax for multi-class
    model.compile(optimizer='rmsprop', loss="categorical_crossentropy", metrics=['accuracy', f1_score])
    return model

# F1 Score Custom Metric
def f1_score(y_true, y_pred):
    """Keras-backend F1 metric computed over all elements of the batch.

    Both tensors are clipped to [0, 1] and rounded, so predictions count as
    positive when they round to 1. Precision and recall are computed from
    the summed counts over the whole tensor, and ``K.epsilon()`` is added to
    every denominator to avoid division by zero.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Predicted tensor, same shape as ``y_true``.

    Returns:
        Scalar tensor holding ``2 * P * R / (P + R + epsilon)``.
    """
    # Counts shared by precision and recall.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_val = hits / (predicted_positives + K.epsilon())
    recall_val = hits / (actual_positives + K.epsilon())

    harmonic = (precision_val * recall_val) / (precision_val + recall_val + K.epsilon())
    return 2 * harmonic

if __name__ == '__main__':
    # Number of target classes used for one-hot encoding and the output layer.
    NUM_CLASSES = 7

    # Load data
    # NOTE(review): '03_2018.csv' is listed twice, so its columns are loaded
    # twice and the same samples can land in both the train and test folds.
    # The first occurrence looks like it was meant to be '03_2017.csv' --
    # confirm against the available data files before changing it.
    files = ['11_2016.csv', '12_2016.csv', '01_2017.csv', '02_2017.csv', '03_2018.csv',
             '12_2017.csv', '01_2018.csv', '02_2018.csv', '03_2018.csv']
    df = load_data(*files)
    # Transpose so each original CSV column becomes one sample (row).
    df = df.T

    # Features are all but the last column; the last column is the label.
    # The split does not depend on the fold, so it is done once up front.
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    # Setup KFold Cross Validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    f1scores = []

    for train_index, test_index in kfold.split(df):
        # Drop Keras global state left by the previous fold's model so
        # memory does not grow with every model built in this loop.
        K.clear_session()

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Data preprocessing: fit the scaler on the training fold only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Convert y_train and y_test to categorical
        y_train, y_test = y_train.astype(int), y_test.astype(int)
        y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
        y_test = to_categorical(y_test, num_classes=NUM_CLASSES)

        # Reshape data for LSTM: (samples, 1 timestep, features)
        X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
        X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

        # Train the model; the held-out fold doubles as validation data.
        model = build_classifier_model(X_train.shape[2], NUM_CLASSES)
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=40, batch_size=64, verbose=0)

        # Evaluate the model: returns the loss followed by the compiled metrics.
        loss, accuracy, f1score = model.evaluate(X_test, y_test, verbose=0)
        accuracies.append(accuracy)
        f1scores.append(f1score)

    print('Average Accuracy: %.2f%%' % (np.mean(accuracies) * 100))
    print('Average F1 Score: %.2f' % np.mean(f1scores))