# RNN Exploration & Analysis
**Author: Jibran**<br>
**Date: 2023-11-01**

## Imports & Setup

In [None]:
import logging
from pathlib import Path

import numpy as np
import torch
import yaml

import config as cpy
from src.data_preprocess.data_loader import DataLoader
from src.data_preprocess.data_saver import DataSaver
from src.data_preprocess.preprocessing import DataPreprocessor
from src.data_preprocess.rnn_data_prep import RNNDataPrep
from src.models import rnn_model
from src.models.train_eval import train_eval_model
from src.utils.utilities import create_config_dict
from src.visualization.rnn_visualize import plot_predicted_probabilities


### Set up logging

In [None]:
"""
The logging level can be set to one of the following:
DEBUG - Detailed information, typically of interest only when diagnosing problems.
INFO - Confirmation that things are working as expected.
WARNING - An indication that something unexpected happened, or indicative of some problem in the near future
(e.g. ‘disk space low’). The software is still working as expected.
ERROR - Due to a more serious problem, the software has not been able to perform some function.
CRITICAL - A serious error, indicating that the program itself may be unable to continue running.
"""
# Configure the root logger for this notebook.
# force=True (Python 3.8+) removes and closes any handlers already attached
# to the root logger before applying this configuration. Without it,
# basicConfig() silently does nothing once the root logger has a handler,
# so re-running this cell with a different level or format would have no
# effect.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(name)s - %(levelname)s - %(message)s\n",
    force=True,
)


## Process Data and/or Get Train/Test Splits

### Get/set relevant configuration settings

In [None]:
# -------------------------------------------------------------------- #
# Paths exposed by config.py (bound to local names for later cells)
# -------------------------------------------------------------------- #
PROJECT_ROOT = cpy.PROJECT_ROOT
ff_mw_raw_data = cpy.FF_MW_DATA_FILE
processed_data_dir = cpy.PROCESSED_DATA_DIR
rnn_input_data_dir = cpy.RNN_INPUT_DATA_DIR
models_dir = cpy.MODELS_DIR
configs_dir = cpy.CONFIGS_DIR
logs_dir = cpy.LOGS_DIR

# -------------------------------------------------------------------- #
# YAML configuration: <PROJECT_ROOT>/config.yaml
# -------------------------------------------------------------------- #
yaml_path = Path(PROJECT_ROOT, "config.yaml")
with open(yaml_path, "r") as yaml_file:
    config = yaml.safe_load(yaml_file)

# -------------------------------------------------------------------- #
# Saving switches
# -------------------------------------------------------------------- #
is_save_processed = True  # Save processed data to file
is_save_train_test = True  # Save train/test data to file
is_save_model_and_config = False  # Save model and config to file

# -------------------------------------------------------------------- #
# Loading switches: an entry in config["data_file_paths"] that is truthy
# means "load from that location instead of recomputing".
# -------------------------------------------------------------------- #
file_paths = config["data_file_paths"]

processed_data_file = file_paths["processed"]
use_processed_data = bool(processed_data_file)
if use_processed_data:
    print("Using processed data file: ", processed_data_file)

train_test_data_dir = file_paths["rnn_input_dir"]
is_load_train_test_data = bool(train_test_data_dir)
if is_load_train_test_data:
    print("Using train/test data file: ", train_test_data_dir)

model_file = file_paths["model"]
model_config_file = file_paths["model_config"]
if model_file and model_config_file:
    print("Using model file: ", model_file)
    print("Using model config file: ", model_config_file)
# Loading a saved model is currently hard-disabled regardless of the config
# entries above.
is_load_model_and_config = False  # TODO: Remove this line

# Object used by later cells to write data/models to disk
data_saver = DataSaver()

# -------------------------------------------------------------------- #
# Parameters
# -------------------------------------------------------------------- #
seq_len = 5  # passed to RNNDataPrep.get_rnn_data() in a later cell


### Get preprocessed data

In [None]:
# Choose where the data comes from: an already-processed file when the
# config names one, otherwise the raw data file.
if use_processed_data:
    data_source_type, data_source_path = "processed", processed_data_file
else:
    data_source_type, data_source_path = "raw", ff_mw_raw_data
print(f"Using {data_source_type} data\n\n")

preprocessor = DataPreprocessor().set_data_source(
    data_source_type,
    data_source_path,
)
df = preprocessor.get_preprocessed_data()

# Persist freshly processed data (never re-save data that was just loaded
# from a processed file) and record its location in config.yaml.
should_save_processed = is_save_processed and not use_processed_data
if should_save_processed:
    print("\nSaving processed data to file...\n")
    processed_data_file = data_saver.save_processed_data(df)
    config["data_file_paths"]["processed"] = str(processed_data_file)
    with open(yaml_path, "w") as yaml_file:
        yaml.dump(config, yaml_file)


### Get train/test data splits for the RNN

In [None]:
# Choose the split source: previously saved train/test files when the
# config names a directory, otherwise the in-memory preprocessed DataFrame.
if is_load_train_test_data:
    df_processed, train_test_path = None, train_test_data_dir
else:
    df_processed, train_test_path = df, None
data_source_type = "file" if df_processed is None else "df"
print(f"Using {data_source_type} data\n\n")

rnn_data_preparer = RNNDataPrep().set_data_source(df_processed, train_test_path)
train_test_dict, test_indices = rnn_data_preparer.get_rnn_data(seq_len)

# Pull the four arrays out of the returned dictionary
X_train, Y_train, X_test, Y_test = (
    train_test_dict[split_key]
    for split_key in ("X_train", "Y_train", "X_test", "Y_test")
)

# Persist freshly built splits (never re-save splits that were just loaded)
# and record their location in config.yaml.
should_save_splits = is_save_train_test and not is_load_train_test_data
if should_save_splits:
    print("\nSaving train/test data to file...\n")
    train_test_data_dir = data_saver.save_train_test_data(
        X_train, Y_train, X_test, Y_test, test_indices
    )
    config["data_file_paths"]["rnn_input_dir"] = str(train_test_data_dir)
    with open(yaml_path, "w") as yaml_file:
        yaml.dump(config, yaml_file)


## Running some checks...

In [None]:
# Show the shape of every train/test array
for split_name, split_data in (
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_test", X_test),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape: ", split_data.shape)


### Check data imbalance

In [None]:
# Log the distinct label values and how often each occurs in the train and
# test targets, to spot class imbalance.
# The single feature in the Y data is a binary class label:
#   0: no walk
#   1: walk
logging.info("Checking for data imbalance...")
for label_set_name, label_set in (("Y_train", Y_train), ("Y_test", Y_test)):
    logging.info(
        f"{label_set_name}: {np.unique(label_set, return_counts=True)}"
    )


### Check for invalid values (NaNs and Infs)

In [None]:
# num_nan_values_X_train = np.isnan(X_train).sum()
# num_nan_values_X_test = np.isnan(X_test).sum()
# num_nan_values_Y_train = np.isnan(Y_train).sum()
# num_nan_values_Y_test = np.isnan(Y_test).sum()
# print(f"Number of NaN values in train X set: {num_nan_values_X_train}")
# print(f"Number of NaN values in test X set: {num_nan_values_X_test}")
# print(f"Number of NaN values in train Y set: {num_nan_values_Y_train}")
# print(f"Number of NaN values in test Y set: {num_nan_values_Y_test}\n")

# num_inf_values_X_train = np.isinf(X_train).sum()
# num_inf_values_X_test = np.isinf(X_test).sum()
# num_inf_values_Y_train = np.isinf(Y_train).sum()
# num_inf_values_Y_test = np.isinf(Y_test).sum()
# print(f"Number of inf values in train X set: {num_inf_values_X_train}")
# print(f"Number of inf values in test X set: {num_inf_values_X_test}")
# print(f"Number of inf values in train Y set: {num_inf_values_Y_train}")
# print(f"Number of inf values in test Y set: {num_inf_values_Y_test}")


#### Replace NaNs and Infs with 0

In [None]:
# X_train[np.isnan(X_train)] = 0
# X_test[np.isnan(X_test)] = 0
# Y_train[np.isnan(Y_train)] = 0
# Y_test[np.isnan(Y_test)] = 0

# X_train[np.isinf(X_train)] = 0
# X_test[np.isinf(X_test)] = 0
# Y_train[np.isinf(Y_train)] = 0
# Y_test[np.isinf(Y_test)] = 0

# # handle_inf_na(X_train, X_test)


## RNN Model

### Train model

In [None]:
# ------------------------ Train the RNN model ------------------------ #
# Use the MPS backend when PyTorch reports it as available, else the CPU.
# (To force CPU, replace this with: device = torch.device("cpu"))
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Hyperparameters
base_batch_size = 512
batch_size = base_batch_size * 8 if device.type == "mps" else base_batch_size
num_epochs = 10
learning_rate = 0.01
hidden_size = 32
output_size = 2
input_size = X_train.shape[2]  # size of the last axis of X_train
batch_first = True

# Extra keyword arguments forwarded to train_eval_model (originally noted
# as "for GPU training").
kwargs = dict(num_workers=4, pin_memory=True)

print(f"Training RNN Model ({device})...\n==============================\n")
model, test_labels_and_probs = train_eval_model(
    X_train, Y_train, X_test, Y_test,
    input_size, hidden_size, output_size,
    num_epochs, batch_size, learning_rate,
    device,
    batch_first=batch_first,
    **kwargs,
)


### Model Evaluation, Visualization, and Analysis

In [None]:
# Re-read the test indices from the data preparer and report their shape
test_indices = rnn_data_preparer.test_indices
# print(test_indices)
test_indices_shape = test_indices.shape
print(f"Test indices shape: {test_indices_shape}")


In [None]:
# Report the shapes of the first three entries of test_labels_and_probs
# (labelled below as true labels, predicted labels and predicted
# probabilities), followed by the shape of the preprocessed DataFrame.
true_labels_shape = test_labels_and_probs[0].shape
pred_labels_shape = test_labels_and_probs[1].shape
pred_probs_shape = test_labels_and_probs[2].shape
print(
    f"test_true_labels shape: {true_labels_shape}, \n"
    f"test_pred_labels shape: {pred_labels_shape}, \n"
    f"test_pred_probs shape: {pred_probs_shape}\n"
)
print(f"df shape: {df.shape}")


In [None]:
# Plot the predicted probabilities for the test rows; the helper returns
# two objects, kept here as plot_df and mean_df.
plot_results = plot_predicted_probabilities(
    df,
    test_indices,
    test_labels_and_probs,
)
plot_df, mean_df = plot_results


### Save model and config

In [None]:
# Build the model name: <architecture>_<raw data id>_v<version>
model_architecture = "rnn"
# Raw data id, in this case 'ff-mw'
raw_data_id = preprocessor.raw_data_id
version_number = 1
model_name = f"{model_architecture}_{raw_data_id}_v{version_number}"

# Timestamp attribute of the trained model, used as a prefix for the
# model name stored in the configuration.
rnn_timestamp = model.timestamp

# Collect the settings used for this training run.
# NOTE(review): the logging level/format recorded here are literal values
# and are not read from the active logging configuration - confirm they are
# the values that should be stored.
model_config = create_config_dict(
    model_name=f"{rnn_timestamp}_{model_name}",
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    raw_data_path=ff_mw_raw_data,
    processed_data_path=processed_data_file,
    logging_level="DEBUG",
    logging_format="%(asctime)s - %(levelname)s - %(module)s - %(message)s",
)

# Only write to disk when saving was requested. The other save steps in this
# notebook honour their is_save_* flags; this step previously ignored
# is_save_model_and_config and always saved.
if is_save_model_and_config:
    # NOTE(review): these directories are relative to the current working
    # directory; models_dir / configs_dir from config.py are not used here -
    # confirm this is intended.
    model_dir = Path(f"models/{model_name}")
    model_dir.mkdir(parents=True, exist_ok=True)
    config_dir = Path(f"config/{model_name}")
    config_dir.mkdir(parents=True, exist_ok=True)
    saved_model_file, saved_config_file = data_saver.save_model_and_config(
        model,
        model_name,
        model_config,
        model_dir,
        config_dir,
    )
else:
    print("\nSkipping model/config save (is_save_model_and_config is False)\n")


## Old/Extra/Misc. Code Below

In [None]:
# from sklearn.metrics import f1_score
# from torch.utils.data import DataLoader, Dataset
# import torch.nn as nn
# # Initialize a new model
# input_size = X_test.shape[2]  # Make sure this is correct
# hidden_size = 64
# output_size = 2
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# new_model = rnn_model.RNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size, batch_first=True).to(device)

# # Load the model
# model_path = "models/rnn_ff-mw_v1/20231020_1701_model_2c92c2793be07eaf3765665d6287ded4_971fce5d8c82c2d1bf8db68939c8162d.pt"
# state_dict = torch.load(model_path)
# new_model.load_state_dict(state_dict)
# # loaded_model = torch.load(model_path)
# new_model.eval()  # Set the model to evaluation mode

# def evaluate_f1(model, X_test, Y_test, batch_size, device):
#     test_dataset = rnn_model.WalkDataset(X_test, Y_test)
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
#     model.eval()

#     # Initialize running loss & sum of squared gradients and parameters
#     running_loss = 0.0
#     correct = 0
#     total = 0

#     y_true = []
#     y_pred = []

#     with torch.no_grad():
#         for i, (inputs, labels) in enumerate(test_loader):
#             inputs, labels = inputs.to(device), labels.to(device)
#             outputs = model(inputs)
#             # Using CrossEntropyLoss as the loss function
#             criterion = nn.CrossEntropyLoss()
#             loss = criterion(outputs, labels)  # Compute loss
#             running_loss += loss.item()  # Accumulate loss
#             _, predicted = torch.max(outputs.data, 1)
#             total += labels.size(0)  # Accumulate total number of samples
#             correct += (predicted == labels).sum().item()
#             y_true.extend(labels.cpu().numpy().tolist())
#             y_pred.extend(predicted.cpu().numpy().tolist())

#     # Calculate average loss and accuracy over all batches
#     test_loss = running_loss / len(test_loader)
#     test_acc = correct / total

#     print(
#         f"Test Error: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")

#     f1 = f1_score(y_true, y_pred)  # You can change the "average" parameter to suit your needs

#     return f1

# # Make sure you load your saved model into the variable `loaded_model`
# # Also, ensure X_test, Y_test, batch_size and device are set

# f1 = evaluate_f1(new_model, X_test, Y_test, batch_size, device)
# print(f"F1 Score: {f1}")
