# RNN Exploration & Analysis
**Author: Jibran**<br>
**Date: 2023-11-01**

## Imports & Setup

In [3]:
import logging
from pathlib import Path

import numpy as np
import torch
import yaml

import config as cpy
from src.data_preprocess.data_loader import DataLoader
from src.data_preprocess.data_saver import DataSaver
from src.data_preprocess.preprocessing import DataPreprocessor
from src.data_preprocess.rnn_data_prep import RNNDataPrep
from src.models import rnn_model
from src.models.train_eval import train_eval_model
from src.utils.utilities import create_config_dict
from src.visualization.rnn_visualize import plot_predicted_probabilities


2023-11-15 23:36:45.875924: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Set up logging

In [4]:
"""
The logging level can be set to one of the following:
DEBUG - Detailed information, typically of interest only when diagnosing problems.
INFO - Confirmation that things are working as expected.
WARNING - An indication that something unexpected happened, or indicative of some problem in the near future
(e.g. ‘disk space low’). The software is still working as expected.
ERROR - Due to a more serious problem, the software has not been able to perform some function.
CRITICAL - A serious error, indicating that the program itself may be unable to continue running.
"""
# Configure the root logger for the whole notebook.
#
# force=True removes any handlers already attached to the root logger before
# applying this configuration. Without it, basicConfig() does nothing once
# the root logger has a handler, so changing the level or format here and
# re-running the cell would silently have no effect.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(name)s - %(levelname)s - %(message)s\n",
    force=True,
)


## Process Data and/or Get Train/Test Splits

### Load configuration and set run options

In [5]:
# ******************************************************************** #
#                  PROJECT PATHS DEFINED IN CONFIG.PY                  #
# ******************************************************************** #
# Pulled into notebook-level names so later cells can refer to them
# directly; not every path is necessarily used further down.
PROJECT_ROOT = cpy.PROJECT_ROOT
ff_mw_raw_data = cpy.FF_MW_DATA_FILE
processed_data_dir = cpy.PROCESSED_DATA_DIR
rnn_input_data_dir = cpy.RNN_INPUT_DATA_DIR
models_dir = cpy.MODELS_DIR
configs_dir = cpy.CONFIGS_DIR
logs_dir = cpy.LOGS_DIR

# ******************************************************************** #
#                      READ THE YAML CONFIGURATION                     #
# ******************************************************************** #
yaml_path = Path(PROJECT_ROOT, "config.yaml")
with open(yaml_path, "r") as file:
    config = yaml.safe_load(file)

# ******************************************************************** #
#                        SAVING / LOADING SWITCHES                     #
# ******************************************************************** #
# What this run is allowed to write to disk.
is_save_processed = True  # Save processed data to file
is_save_train_test = True  # Save train/test data to file
is_save_model_and_config = False  # Save model and config to file

# Artifact locations recorded in the YAML config by earlier runs. An empty
# (falsy) entry means "nothing saved yet", in which case the corresponding
# load switch below is turned off.
saved_paths = config["data_file_paths"]

# ---- Processed data ------------------------------------------------- #
processed_data_file = saved_paths["processed"]
use_processed_data = bool(processed_data_file)
if use_processed_data:
    print("Using processed data file: ", processed_data_file)

# ---- Train/test splits ---------------------------------------------- #
train_test_data_dir = saved_paths["rnn_input_dir"]
is_load_train_test_data = bool(train_test_data_dir)
if is_load_train_test_data:
    print("Using train/test data file: ", train_test_data_dir)

# ---- Trained model + its config ------------------------------------- #
model_file = saved_paths["model"]
model_config_file = saved_paths["model_config"]
# Both files must be recorded for a saved model to count as loadable.
is_load_model_and_config = bool(model_file and model_config_file)
if is_load_model_and_config:
    print("Using model file: ", model_file)
    print("Using model config file: ", model_config_file)
# Temporary override: never load a saved model, whatever the config says.
is_load_model_and_config = False  # TODO: Remove this line

# ---- Shared helper used by the cells that persist artifacts ---------- #
data_saver = DataSaver()  # Initialize DataSaver object

# ******************************************************************** #
#                           DEFINE PARAMETERS                          #
# ******************************************************************** #
# Passed to RNNDataPrep.get_rnn_data() when fetching the train/test splits.
seq_len = 5


Using processed data file:  /Users/JawanHaider/Documents/Projects/Collaborative/Dogar/flywasp-fd/data/processed/ff-mw/20231115_processed_data_0ead4953ec32fad25a38982f3e492f1b.pkl
Using train/test data file:  /Users/JawanHaider/Documents/Projects/Collaborative/Dogar/flywasp-fd/data/rnn_input/20231115


### Get preprocessed data

In [6]:
# Pick the DataPreprocessor input: the processed-data file recorded in the
# config when there is one, otherwise the raw FF-MW data file.
if use_processed_data:
    data_source_type, data_source_path = "processed", processed_data_file
else:
    data_source_type, data_source_path = "raw", ff_mw_raw_data
print(f"Using {data_source_type} data\n\n")

preprocessor = DataPreprocessor().set_data_source(
    data_source_type, data_source_path
)
df = preprocessor.get_preprocessed_data()

# ================== Persist newly processed data ==================== #
# Only write when saving is enabled AND the data was not itself loaded from
# an existing processed file (re-saving that would be redundant).
if not use_processed_data and is_save_processed:
    print("\nSaving processed data to file...\n")
    processed_data_file = data_saver.save_processed_data(df)
    # Record the new location in the YAML config so that later runs pick up
    # the processed file instead of starting from the raw data.
    config["data_file_paths"]["processed"] = str(processed_data_file)
    with open(yaml_path, "w") as file:
        yaml.dump(config, file)


src.data_preprocess.preprocessing - INFO - Setting the data source for DataPreprocessor...


src.data_preprocess.data_loader - DEBUG - Loading processed data from /Users/JawanHaider/Documents/Projects/Collaborative/Dogar/flywasp-fd/data/processed/ff-mw/20231115_processed_data_0ead4953ec32fad25a38982f3e492f1b.pkl...




Using processed data




src.data_preprocess.data_loader - DEBUG - Successfully loaded processed data from /Users/JawanHaider/Documents/Projects/Collaborative/Dogar/flywasp-fd/data/processed/ff-mw/20231115_processed_data_0ead4953ec32fad25a38982f3e492f1b.pkl.


src.data_preprocess.preprocessing - INFO - Processed DataFrame loaded from file as data_source.
Use get_processed_data() to retrieve the loaded processed DataFrame.


src.data_preprocess.preprocessing - INFO - Returning processed DataFrame.




### Get train/test data splits for RNN 

In [7]:
# RNNDataPrep is given either the processed DataFrame or the directory of
# previously saved train/test splits; the unused one is passed as None.
if is_load_train_test_data:
    df_processed, train_test_path = None, train_test_data_dir
else:
    df_processed, train_test_path = df, None
data_source_type = "file" if df_processed is None else "df"
print(f"Using {data_source_type} data\n\n")

rnn_data_preparer = RNNDataPrep().set_data_source(
    df_processed,
    train_test_path,
)
train_test_dict, test_indices = rnn_data_preparer.get_rnn_data(seq_len)
# Unpack the four arrays by key into notebook-level names.
X_train, Y_train, X_test, Y_test = (
    train_test_dict[split_key]
    for split_key in ("X_train", "Y_train", "X_test", "Y_test")
)

# ================= Persist newly created splits ===================== #
# Only write when saving is enabled AND the splits were not themselves
# loaded from disk.
if not is_load_train_test_data and is_save_train_test:
    print("\nSaving train/test data to file...\n")
    train_test_data_dir = data_saver.save_train_test_data(
        X_train, Y_train, X_test, Y_test, test_indices
    )
    # Record the new location in the YAML config so that later runs load the
    # saved splits instead of rebuilding them.
    config["data_file_paths"]["rnn_input_dir"] = str(train_test_data_dir)
    with open(yaml_path, "w") as file:
        yaml.dump(config, file)


src.data_preprocess.rnn_data_prep - INFO - Setting the data source for RNNDataPrep...


src.data_preprocess.data_loader - INFO - Loading train/test datasets from /Users/JawanHaider/Documents/Projects/Collaborative/Dogar/flywasp-fd/data/rnn_input/20231115...




Using file data




src.data_preprocess.data_loader - INFO - Successfully loaded train/test + indices from /Users/JawanHaider/Documents/Projects/Collaborative/Dogar/flywasp-fd/data/rnn_input/20231115.


src.data_preprocess.rnn_data_prep - INFO - Train/test loaded from file as data_source.
Use get_rnn_data() without params to retrieve train/test splits.


src.data_preprocess.rnn_data_prep - INFO - Getting train-test data splits...




## Running some checks...

In [8]:
# Report the shape of each train/test array, one line per split.
for split_name, split_array in (
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_test", X_test),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape: ", split_array.shape)


X_train shape:  (6527546, 5, 19)
Y_train shape:  (6527546,)
X_test shape:  (3263774, 5, 19)
Y_test shape:  (3263774,)


### Check data imbalance

In [9]:
# Log how many samples fall into each class of the label arrays.
# The labels are a single binary target:
#   0: no walk
#   1: walk
logging.info("Checking for data imbalance...")
for split_name, labels in (("Y_train", Y_train), ("Y_test", Y_test)):
    # np.unique(..., return_counts=True) -> (distinct values, their counts)
    logging.info(f"{split_name}: {np.unique(labels, return_counts=True)}")


root - INFO - Checking for data imbalance...

root - INFO - Y_train: (array([0., 1.]), array([6523639,    3907]))

root - INFO - Y_test: (array([0., 1.]), array([3262149,    1625]))



### Check for invalid values (NaNs and Infs)

In [10]:
# num_nan_values_X_train = np.isnan(X_train).sum()
# num_nan_values_X_test = np.isnan(X_test).sum()
# num_nan_values_Y_train = np.isnan(Y_train).sum()
# num_nan_values_Y_test = np.isnan(Y_test).sum()
# print(f"Number of NaN values in train X set: {num_nan_values_X_train}")
# print(f"Number of NaN values in test X set: {num_nan_values_X_test}")
# print(f"Number of NaN values in train Y set: {num_nan_values_Y_train}")
# print(f"Number of NaN values in test Y set: {num_nan_values_Y_test}\n")

# num_inf_values_X_train = np.isinf(X_train).sum()
# num_inf_values_X_test = np.isinf(X_test).sum()
# num_inf_values_Y_train = np.isinf(Y_train).sum()
# num_inf_values_Y_test = np.isinf(Y_test).sum()
# print(f"Number of inf values in train X set: {num_inf_values_X_train}")
# print(f"Number of inf values in test X set: {num_inf_values_X_test}")
# print(f"Number of inf values in train Y set: {num_inf_values_Y_train}")
# print(f"Number of inf values in test Y set: {num_inf_values_Y_test}")


#### Replace NaNs and Infs with 0

In [11]:
# X_train[np.isnan(X_train)] = 0
# X_test[np.isnan(X_test)] = 0
# Y_train[np.isnan(Y_train)] = 0
# Y_test[np.isnan(Y_test)] = 0

# X_train[np.isinf(X_train)] = 0
# X_test[np.isinf(X_test)] = 0
# Y_train[np.isinf(Y_train)] = 0
# Y_test[np.isinf(Y_test)] = 0

# # handle_inf_na(X_train, X_test)


## RNN Model

### Train model

In [12]:
# Show X_train's shape with the first axis dropped. The last of the remaining
# dimensions (X_train.shape[2]) is what the training cell uses as input_size.
print(X_train.shape[1:])


(5, 19)


In [13]:
# ======================== Train the RNN model ======================= #
# Use the MPS backend when this PyTorch build and machine support it, and
# fall back to the CPU otherwise. Hard-coding "mps" makes the cell unusable
# on any machine where that backend is unavailable.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# A 10x larger batch is used on MPS than on CPU.
batch_size = (512 * 10) if device.type == "mps" else 512
num_epochs = 10
hidden_size = 32
output_size = 2  # two classes: 0 (no walk) / 1 (walk)
learning_rate = 0.01
batch_first = True
# Feature count per time step: the last axis of X_train.
input_size = X_train.shape[2]
# Extra keyword arguments forwarded to train_eval_model.
# NOTE(review): presumably these end up on the torch DataLoaders (the run log
# prints "Using kwargs for data loaders") - confirm in src.models.train_eval.
kwargs = {"num_workers": 2, "pin_memory": True}  # For GPU training
print(f"Training RNN Model ({device})...\n==============================\n")
# Returns the trained model plus the test-set labels/probabilities bundle
# consumed by the evaluation cells further down.
model, test_labels_and_probs = train_eval_model(
    X_train,
    Y_train,
    X_test,
    Y_test,
    input_size,
    hidden_size,
    output_size,
    num_epochs,
    batch_size,
    learning_rate,
    device,
    batch_first=batch_first,
    **kwargs,
)


Training RNN Model (mps)...



mps
Using kwargs for data loaders




src.models.train_eval - INFO - 

Model Summary:



Layer (type:depth-idx)                   Output Shape              Param #
RNN                                      [5120, 2]                 --
├─RNN: 1-1                               [5120, 5, 32]             1,696
├─Linear: 1-2                            [5120, 2]                 66
├─Sigmoid: 1-3                           [5120, 2]                 --
Total params: 1,762
Trainable params: 1,762
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 43.76
Input size (MB): 1.95
Forward/backward pass size (MB): 6.64
Params size (MB): 0.01
Estimated Total Size (MB): 8.59


Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

src.models.train_eval - INFO - Number of batches: 1275

src.models.train_eval - INFO - Batch size: 5120




Epoch 1/10
-------------------------------
Training the model...


Training...:   0%|          | 0/1275 [00:00<?, ?it/s]

Loss: 0.559216  [ 5120/6527546]
Loss: 0.520034  [655360/6527546]
Loss: 0.513737  [1305600/6527546]
Loss: 0.505976  [1955840/6527546]
Loss: 0.372829  [2606080/6527546]
Loss: 0.344420  [3256320/6527546]
Loss: 0.334151  [3906560/6527546]
Loss: 0.328972  [4556800/6527546]
Loss: 0.325858  [5207040/6527546]
Loss: 0.323779  [5857280/6527546]
Loss: 0.322290  [6507520/6527546]


src.models.train_eval - INFO - 
Sum squared grads/params in Epoch 1:
	Sum of squared gradients :     120.7810
	Sum of squared parameters:   29796.2839

src.models.train_eval - INFO - 
Train Performance: 
 Accuracy: 74.9%,Avg loss: 0.411066, F1 Score: 0.0048 


src.models.train_eval - INFO - Training Epoch 1 took 2.0 minutes.



Evaluating the model...


Testing...:   0%|          | 0/638 [00:00<?, ?it/s]

src.models.train_eval - INFO - Test Performance: 
 Accuracy: 74.9%,Avg loss: 0.390391, F1 Score: 0.4303 


src.models.train_eval - INFO - Evaluating Epoch 1 took 0.6 minutes.

src.models.train_eval - INFO - Training Data Distribution:

src.models.train_eval - INFO - {0.0: 3262149, 1.0: 1625}

src.models.train_eval - INFO - Predicted Data Distribution:

src.models.train_eval - INFO - {0.0: 2444041, 1.0: 819733}

src.models.train_eval - INFO - Epoch 1 Metrics --
Train Loss: 0.4111,
Test Loss: 0.3904,
Train Acc: 74.9%,
Test Acc: 74.9%,
Train F1 Score: 0.0048,
Test F1: 0.4303,
Test PR AUC: 0.0054



src.models.train_eval - INFO - Number of batches: 1275

src.models.train_eval - INFO - Batch size: 5120




Epoch 2/10
-------------------------------
Training the model...


Training...:   0%|          | 0/1275 [00:00<?, ?it/s]

Loss: 0.503337  [ 5120/6527546]
Loss: 0.505569  [655360/6527546]
Loss: 0.505722  [1305600/6527546]
Loss: 0.322088  [1955840/6527546]
Loss: 0.321017  [2606080/6527546]
Loss: 0.320177  [3256320/6527546]
Loss: 0.319501  [3906560/6527546]
Loss: 0.318945  [4556800/6527546]
Loss: 0.318480  [5207040/6527546]
Loss: 0.318084  [5857280/6527546]
Loss: 0.317744  [6507520/6527546]


src.models.train_eval - INFO - 
Sum squared grads/params in Epoch 2:
	Sum of squared gradients :      29.2833
	Sum of squared parameters:   32296.0430

src.models.train_eval - INFO - 
Train Performance: 
 Accuracy: 74.9%,Avg loss: 0.377916, F1 Score: 0.0048 


src.models.train_eval - INFO - Training Epoch 2 took 2.2 minutes.



Evaluating the model...


Testing...:   0%|          | 0/638 [00:00<?, ?it/s]

src.models.train_eval - INFO - Test Performance: 
 Accuracy: 74.9%,Avg loss: 0.386095, F1 Score: 0.4303 


src.models.train_eval - INFO - Evaluating Epoch 2 took 0.7 minutes.

src.models.train_eval - INFO - Training Data Distribution:

src.models.train_eval - INFO - {0.0: 3262149, 1.0: 1625}

src.models.train_eval - INFO - Predicted Data Distribution:

src.models.train_eval - INFO - {0.0: 2444017, 1.0: 819757}

src.models.train_eval - INFO - Epoch 2 Metrics --
Train Loss: 0.3779,
Test Loss: 0.3861,
Train Acc: 74.9%,
Test Acc: 74.9%,
Train F1 Score: 0.0048,
Test F1: 0.4303,
Test PR AUC: 0.0064



src.models.train_eval - INFO - Number of batches: 1275

src.models.train_eval - INFO - Batch size: 5120




Epoch 3/10
-------------------------------
Training the model...


Training...:   0%|          | 0/1275 [00:00<?, ?it/s]

Loss: 0.498656  [ 5120/6527546]
Loss: 0.506754  [655360/6527546]
Loss: 0.504570  [1305600/6527546]
Loss: 0.317959  [1955840/6527546]
Loss: 0.317630  [2606080/6527546]
Loss: 0.317344  [3256320/6527546]
Loss: 0.317094  [3906560/6527546]
Loss: 0.316872  [4556800/6527546]
Loss: 0.316675  [5207040/6527546]


### Model Evaluation, Visualization, and Analysis

In [None]:
# Read the test indices from the RNNDataPrep instance. This rebinds the
# `test_indices` name that get_rnn_data() already returned earlier.
# NOTE(review): presumably both hold the same values - confirm in RNNDataPrep.
test_indices = rnn_data_preparer.test_indices
# print(test_indices)
print(f"Test indices shape: {test_indices.shape}")


In [None]:
# Sanity-check the three arrays returned by training (true labels, predicted
# labels, predicted probabilities) against the size of the processed frame.
# Adjacent f-string literals are concatenated, so this prints one message.
print(
    f"test_true_labels shape: {test_labels_and_probs[0].shape}, \n"
    f"test_pred_labels shape: {test_labels_and_probs[1].shape}, \n"
    f"test_pred_probs shape: {test_labels_and_probs[2].shape}\n"
)
print(f"df shape: {df.shape}")


In [None]:
# Plot the model's predicted probabilities. Inputs are the processed
# DataFrame, the test indices, and the (true labels, predicted labels,
# predicted probabilities) bundle returned by train_eval_model.
# The helper returns two objects, kept here as plot_df and mean_df.
# NOTE(review): their exact contents are defined in
# src.visualization.rnn_visualize - check there before relying on them.
plot_df, mean_df = plot_predicted_probabilities(
    df, test_indices, test_labels_and_probs
)


### Save model and config

In [None]:
# Build the model name: <architecture>_<raw data id>_v<version>
model_architecture = "rnn"
# get the raw data id, in this case 'ff-mw'
raw_data_id = preprocessor.raw_data_id
version_number = 1
model_name = f"{model_architecture}_{raw_data_id}_v{version_number}"

# Timestamp carried by the trained model; prefixed to the name stored in the
# configuration dictionary below.
rnn_timestamp = model.timestamp

# Collect the hyperparameters and data provenance of this training run.
# NOTE(review): the logging format recorded here differs from the one passed
# to logging.basicConfig() at the top of the notebook - confirm which one is
# meant to be stored.
model_config = create_config_dict(
    model_name=f"{rnn_timestamp}_{model_name}",
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    raw_data_path=ff_mw_raw_data,
    processed_data_path=processed_data_file,
    logging_level="DEBUG",
    logging_format="%(asctime)s - %(levelname)s - %(module)s - %(message)s",
)

# Save the trained model and configuration settings, but only when the
# `is_save_model_and_config` switch from the configuration cell is on. This
# mirrors how the processed-data and train/test save steps honour their own
# switches; previously this cell wrote to disk unconditionally.
if is_save_model_and_config:
    # NOTE(review): these directories are relative to the current working
    # directory, while `models_dir` / `configs_dir` from config.py are never
    # used - confirm which location is intended.
    model_dir = Path(f"models/{model_name}")
    model_dir.mkdir(parents=True, exist_ok=True)
    config_dir = Path(f"config/{model_name}")
    config_dir.mkdir(parents=True, exist_ok=True)
    saved_model_file, saved_config_file = data_saver.save_model_and_config(
        model,
        model_name,
        model_config,
        model_dir,
        config_dir,
    )
else:
    print("Skipping model/config save (is_save_model_and_config is False).")


## Old/Extra/Misc. Code Below

In [None]:
# from sklearn.metrics import f1_score
# from torch.utils.data import DataLoader, Dataset
# import torch.nn as nn
# # Initialize a new model
# input_size = X_test.shape[2]  # Make sure this is correct
# hidden_size = 64
# output_size = 2
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# new_model = rnn_model.RNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size, batch_first=True).to(device)

# # Load the model
# model_path = "models/rnn_ff-mw_v1/20231020_1701_model_2c92c2793be07eaf3765665d6287ded4_971fce5d8c82c2d1bf8db68939c8162d.pt"
# state_dict = torch.load(model_path)
# new_model.load_state_dict(state_dict)
# # loaded_model = torch.load(model_path)
# new_model.eval()  # Set the model to evaluation mode

# def evaluate_f1(model, X_test, Y_test, batch_size, device):
#     test_dataset = rnn_model.WalkDataset(X_test, Y_test)
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
#     model.eval()

#     # Initialize running loss & sum of squared gradients and parameters
#     running_loss = 0.0
#     correct = 0
#     total = 0

#     y_true = []
#     y_pred = []

#     with torch.no_grad():
#         for i, (inputs, labels) in enumerate(test_loader):
#             inputs, labels = inputs.to(device), labels.to(device)
#             outputs = model(inputs)
#             # Using CrossEntropyLoss as the loss function
#             criterion = nn.CrossEntropyLoss()
#             loss = criterion(outputs, labels)  # Compute loss
#             running_loss += loss.item()  # Accumulate loss
#             _, predicted = torch.max(outputs.data, 1)
#             total += labels.size(0)  # Accumulate total number of samples
#             correct += (predicted == labels).sum().item()
#             y_true.extend(labels.cpu().numpy().tolist())
#             y_pred.extend(predicted.cpu().numpy().tolist())

#     # Calculate average loss and accuracy over all batches
#     test_loss = running_loss / len(test_loader)
#     test_acc = correct / total

#     print(
#         f"Test Error: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")

#     f1 = f1_score(y_true, y_pred)  # You can change the "average" parameter to suit your needs

#     return f1

# # Make sure you load your saved model into the variable `loaded_model`
# # Also, ensure X_test, Y_test, batch_size and device are set

# f1 = evaluate_f1(new_model, X_test, Y_test, batch_size, device)
# print(f"F1 Score: {f1}")
