In [1]:
%pip install -U kaleido
%pip list | grep kaleido
%pip install pydot
%pip install graphviz
%pip install keras_tuner

import numpy as np
import pandas as pd
import os
import csv
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from keras_tuner import RandomSearch, HyperParameters
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold
import keras_tuner as kt

train_dir = "/Users/lina/Desktop/ba-implementation/train"
filtered_dir = "/Users/lina/Desktop/ba-implementation/train/5d27075f03f801723c2e360f/F1_filtered/"

def evaluate_model(predictions, actual_classes, encoder):
    # Get top 5 sorted predictions
    sorted_predictions = np.argsort(predictions, axis=1)[:, -5:]
    
    accuracies = []
    for i in range(1, 6):
        top_i_predicted_indices = sorted_predictions[:, -i:]
        is_correct_top_i = np.any(top_i_predicted_indices == actual_classes[:, None], axis=1)
        accuracies.append(np.mean(is_correct_top_i))
    
    # Decode the indices back to their original class names
    decoded_actual_classes = encoder.inverse_transform(actual_classes)
    decoded_top_predictions = [encoder.inverse_transform(row) for row in sorted_predictions]
    
    for i in range(len(decoded_actual_classes)):
        for j, decoded_prediction in enumerate(decoded_top_predictions[i], 1):
            print(f"Actual: {decoded_actual_classes[i]}, Top {j} Predicted: {decoded_prediction}")
    
    for i, accuracy in enumerate(accuracies, 1):
        print(f"Top-{i} Accuracy: {accuracy * 100:.2f}%")
    
    return tuple(accuracies)

def save_accuracy_plot(window_size, batch_size, epochs, lstm_units, accuracies):

    accuracies = [round(accuracy * 100, 2) for accuracy in accuracies]
    labels = ["Top 1", "Top 2", "Top 3", "Top 4", "Top 5"]
    title_str = f"Accuracies of the Model for epochs = {epochs},\n window_size = {window_size}, batch_size={batch_size}, and lstm_units = {lstm_units}."
    
    plt.bar(labels, accuracies, color=['blue', 'green', 'red', 'purple', 'orange'])
    plt.title(title_str)
    plt.xlabel('Top-N')
    plt.ylabel('Accuracy (%)')
    plt.ylim(0, 100)
    for i, v in enumerate(accuracies):
        plt.text(i, v + 2, f"{v}%", ha='center', va='bottom', fontsize=9)

    filename = f"prediction_ml_model.pdf"
    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    
    plt.show()

Note: you may need to restart the kernel to use updated packages.
kaleido                   0.2.1
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Tune hyperparameters

In [None]:
def clearvars():    
    for el in sorted(globals()):
        if '__' not in el:
                print(f'deleted: {el}')
                del el
clearvars()

In [17]:
# Define sizes
epochs = 100
batch_size = 32
window_size = 3

def data_preprocessing(window_size):

    data_list = pd.DataFrame()
    lengths = []  # To store the lengths of each file

    for file_number in range(147):  # loop through all files
        # Load the data
        data = pd.read_csv(filtered_dir + f'floor_metric_interpolated_waypoints_acce_{file_number}_merged_data_acce.csv')
        if data.shape[0] <= window_size:
            continue
        data_list = pd.concat([data_list, data], ignore_index=True)
        lengths.append(len(data))

    # Create a target variable representing the BSSID with the highest RSSI value at each timestep
    data_list['target'] = data_list.iloc[:, 6:-1].idxmax(axis=1)

    # Initialize the LabelEncoder
    encoder = LabelEncoder()

    # Encode the target variable to integers for use with categorical crossentropy
    encoded_target = encoder.fit_transform(data_list['target'][window_size:])
    print(encoded_target)

    # Initialize the MinMaxScaler
    scaler = MinMaxScaler(feature_range=(-1, 1))

    # Scale the data
    scaled_data = scaler.fit_transform(data_list.iloc[:, 6:-1])

    X_list = []
    y_list = []
    start_index = 0

    for length in lengths:
        for i in range(start_index, start_index + length - window_size):
            if i + window_size > len(scaled_data) or i >= len(encoded_target):
                print(f"Error at i: {i}, start_index: {start_index}, length: {length}")
                break
            X_list.append(scaled_data[i:i + window_size, :])
            y_list.append(encoded_target[i])
        start_index += length

    X = np.array(X_list)
    y = to_categorical(y_list)

    # Shuffle data before splitting
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)

    X = X[indices]
    y = y[indices]

    # Split the data into a training and testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test, encoder

def build_model(hp):

    TIMESTAMPS = X_train.shape[1]
    NUM_FEATURES = X_train.shape[2]

    model = Sequential()

    # Add the first LSTM layer
    if hp.Boolean('second_lstm_layer'):
        model.add(LSTM(
            units=hp.Int('lstm_units_1', min_value=64, max_value=1024, step=64),
            activation='tanh',
            return_sequences=True,
            input_shape=(TIMESTAMPS, NUM_FEATURES))
        )
    else:
        model.add(LSTM(
            units=hp.Int('lstm_units_1', min_value=64, max_value=1024, step=64),
            activation='tanh',
            return_sequences=False,
            input_shape=(TIMESTAMPS, NUM_FEATURES))
        )

    # Conditionally add a second LSTM layer
    if hp.Boolean('second_lstm_layer'):
        model.add(LSTM(
            units=hp.Int('lstm_units_2', min_value=64, max_value=1024, step=64),
            activation='tanh',
            return_sequences=False
        ))

    # Conditionally add Dropout layer
    if hp.Boolean('dropout'):
        model.add(Dropout(hp.Float('dropout_rate', min_value=0.0, max_value=0.5, step=0.05)))

    # Conditionally add Batch Normalization
    if hp.Boolean('batch_norm'):
        model.add(tf.keras.layers.BatchNormalization())

    model.add(Dense(y_train.shape[1], activation='softmax'))
    
    lr = 0.001
    # Learning rate tuning
    if hp.Boolean('learning_rate'):
        lr = hp.Float('learning_rate', min_value=1e-6, max_value=1e-4, step=1e-6)
    
    # Choose the optimizer and set the learning rate
    optimizer_choice = hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop'])

    if optimizer_choice == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    elif optimizer_choice == 'sgd':
        optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    else:
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)

    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model.summary()
    return model

X_train, X_test, y_train, y_test, encoder = data_preprocessing(window_size)

# Define hyperparameters
hp = HyperParameters()

tuner = RandomSearch(
    build_model,
    objective='accuracy',
    max_trials=25,
    executions_per_trial=1,
    hyperparameters=hp,
    directory='/Users/lina/Desktop/ba-implementation/',
    project_name='tuning_parameters'
)

tuner.search_space_summary()

# Modify the tuner.search call to use the defined hyperparameters for batch size
tuner.search(X_train, y_train,
             epochs=epochs,
             validation_data=(X_test, y_test),
             batch_size=hp.Int('batch_size', 16, 128, step=16))

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

tuner.results_summary()


Trial 25 Complete [00h 32m 47s]
accuracy: 0.21399345993995667

Best accuracy So Far: 0.8637479543685913
Total elapsed time: 11h 44m 15s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


Results summary
Results in /Users/lina/Desktop/ba-implementation/tuning_parameters
Showing 10 best trials
Objective(name="accuracy", direction="max")

Trial 18 summary
Hyperparameters:
second_lstm_layer: False
lstm_units_1: 512
dropout: False
batch_norm: True
learning_rate: False
optimizer: sgd
batch_size: 96
dropout_rate: 0.30000000000000004
lstm_units_2: 128
Score: 0.8637479543685913

Trial 23 summary
Hyperparameters:
second_lstm_layer: True
lstm_units_1: 960
dropout: True
batch_norm: True
learning_rate: False
optimizer: sgd
batch_size: 80
dropout_rate: 0.05
lstm_units_2: 768
Score: 0.841653048992157

Trial 11 summary
Hyperparameters:
second_lstm_layer: False
lstm_units_1: 192
dropout: False
batch_norm: True
learning_rate: False
optimizer: sgd
batch_size: 112
dropout_rate: 0.2
lstm_units_2: 320
Score: 0.7913256883621216

Trial 06 summary
Hyperparameters:
second_lstm_layer: False
lstm_units_1: 64
dropout: False
batch_norm: True
learning_rate: False
optimizer: adam
batch_size: 64
dropo

## Train the model

In [4]:
epochs = 100
lstm_units = 512
batch_size = 96
window_size = 3

# Initialize the MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))

# Initialize the LabelEncoder
encoder = LabelEncoder()

data_list = pd.DataFrame()
lengths = []

N = 0
for file in os.listdir(filtered_dir):
    if file.startswith("floor_metric_interpolated_waypoints_acce") and file.endswith(".npy"):
        N += 1

for file_number in range(N):
    data = pd.read_csv(filtered_dir + f'floor_metric_interpolated_waypoints_acce_{file_number}_merged_data_acce.csv')
    if data.shape[0] <= window_size:
        continue
    data_list = pd.concat([data_list, data], ignore_index=True)
    lengths.append(len(data))

# Create a target variable representing the BSSID with the highest RSSI value at each timestep
data_list['target'] = data_list.iloc[:, 6:-1].idxmax(axis=1)

# Encode the target variable to integers for use with categorical crossentropy
encoded_target = encoder.fit_transform(data_list['target'][window_size:])

# Scale the data
scaled_data = scaler.fit_transform(data_list.iloc[:, 6:-1])

X_list = []
y_list = []
start_index = 0

for length in lengths:
    for i in range(start_index, start_index + length - window_size):
        if i + window_size > len(scaled_data) or i >= len(encoded_target):
            print(f"Error at i: {i}, start_index: {start_index}, length: {length}")
            break
        X_list.append(scaled_data[i:i + window_size, :])
        y_list.append(encoded_target[i])
    start_index += length

X = np.array(X_list)
y = to_categorical(y_list)

# Shuffle data before splitting
indices = np.arange(X.shape[0])
np.random.shuffle(indices)

X = X[indices]
y = y[indices]

# Split the data into a training and testing set using k-fold
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_number = 1

# Lists to store evaluation metrics for each fold
top1_list = []
top2_list = []
top3_list = []
top4_list = []
top5_list = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    TIMESTAMPS = X_train.shape[1]
    NUM_FEATURES = X_train.shape[2]

    # Define the LSTM model
    model = Sequential()
    model.add(LSTM(
                units=lstm_units,
                activation='tanh',
                kernel_regularizer=tf.keras.regularizers.l2(0.008),
                recurrent_regularizer=tf.keras.regularizers.l2(0.002),
                input_shape=(TIMESTAMPS, NUM_FEATURES))
            )

    model.add(Dropout(0.3))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(Dense(y_train.shape[1], activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.SGD(), loss='categorical_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

    # Generate predictions on the test set
    predictions = model.predict(X_test)

    # The predictions are probabilities for each class, so we'll take the class with the highest probability as the prediction
    predicted_classes = np.argmax(predictions, axis=1)

    # The actual classes are one-hot encoded in y_test, so we'll convert them back to class labels
    actual_classes = np.argmax(y_test, axis=1)

    # Evaluate the model's performance on the test set
    accuracies = evaluate_model(predictions, actual_classes, encoder)

    # Save or append the evaluation metrics
    top1_list.append(accuracies[0])
    top2_list.append(accuracies[1])
    top3_list.append(accuracies[2])
    top4_list.append(accuracies[3])
    top5_list.append(accuracies[4])

    fold_number += 1

# After all folds are done, you can calculate and print the average of the evaluation metrics across all folds
average_top1 = np.mean(top1_list)
average_top2 = np.mean(top2_list)
average_top3 = np.mean(top3_list)
average_top4 = np.mean(top4_list)
average_top5 = np.mean(top5_list)

print(f"Average Top 1: {average_top1}")
print(f"Average Top 2: {average_top2}")
print(f"Average Top 3: {average_top3}")
print(f"Average Top 4: {average_top4}")
print(f"Average Top 5: {average_top5}")
accuracies = [average_top1, average_top2, average_top3, average_top4, average_top5]

save_accuracy_plot(window_size, batch_size, epochs, lstm_units, accuracies)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
 8/51 [===>..........................] - ETA: 2s - loss: 16.7336 - accuracy: 0.6432