In [10]:
import logging
import numpy as np
import torch
import yaml
from datetime import datetime
from pathlib import Path
from src.data_preprocess.preprocessing import DataPreprocessor
from src.data_preprocess.feature_engineering import FeatureEngineer
from src.utils.utilities import prepare_train_test_sequences
from src.utils.utilities import create_config_dict
from src.utils.utilities import get_hash
from src.utils.utilities import load_train_test_data
from src.utils.utilities import handle_infinity_and_na_numpy as handle_inf_na
from src.models.rnn_model import train_rnn_model
from src.models import rnn_model
import hashlib
import pickle

# Configure the root logger at DEBUG level so that the logging.info(...)
# progress messages emitted by the cells below are displayed.
logging.basicConfig(level=logging.DEBUG)


In [2]:
def preprocess_data(df):
    """
    Run the fixed preprocessing pipeline on a raw DataFrame.

    The frame is wrapped in a ``DataPreprocessor`` and a fixed sequence of
    its methods is applied; the wrapper's ``df`` attribute is returned at
    the end. What each step does in detail is defined by
    ``DataPreprocessor`` and is not visible from this function.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame.

    Returns
    -------
    pandas.DataFrame
        The ``df`` attribute of the preprocessor after all steps have run.
    """
    # Final column layout requested from the preprocessor (last step below).
    column_order = [
        "Frame",
        "Fdis",
        "FdisF",
        "FdisL",
        "Wdis",
        "WdisF",
        "WdisL",
        "Fangle",
        "Wangle",
        "F2Wdis",
        "F2Wdis_rate",
        "F2Wangle",
        "W2Fangle",
        "ANTdis",
        "F2W_blob_dis",
        "bp_F_delta",
        "bp_W_delta",
        "ap_F_delta",
        "ap_W_delta",
        "ant_W_delta",
        "file",
        "start_walk",
    ]

    prep = DataPreprocessor(df=df)

    # The 'plot' column is not part of the final layout.
    prep.drop_columns(["plot"])

    # Combine the 'ANTdis_1' / 'ANTdis_2' pair into a single 'ANTdis' column.
    prep.calculate_means([["ANTdis_1", "ANTdis_2"]], ["ANTdis"])

    # Create the 'start_walk' label column.
    # NOTE(review): 'walk_backwards' is passed twice — confirm against
    # DataPreprocessor.add_labels whether a second, different label was meant.
    prep.add_labels(["walk_backwards", "walk_backwards"], "start_walk")

    # Clean infinite / missing values (strategy is defined by the preprocessor).
    prep.handle_infinity_and_na()

    # Presumably positions 'F2Wdis_rate' relative to 'F2Wdis' — TODO confirm.
    prep.specific_rearrange("F2Wdis_rate", "F2Wdis")

    prep.rearrange_columns(column_order)
    return prep.df


def engineer_features(df):
    """
    Standardize the numeric feature columns of a preprocessed DataFrame.

    The frame is wrapped in a ``FeatureEngineer`` and
    ``standardize_features`` is called once with the 19 feature columns
    listed below; the wrapper's ``df`` attribute is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame.

    Returns
    -------
    pandas.DataFrame
        The ``df`` attribute of the feature engineer after standardization.
    """
    # Columns handed to the standardizer. 'Frame', 'file' and 'start_walk'
    # are deliberately absent from this list.
    feature_columns = [
        "Fdis",
        "FdisF",
        "FdisL",
        "Wdis",
        "WdisF",
        "WdisL",
        "Fangle",
        "Wangle",
        "F2Wdis",
        "F2Wdis_rate",
        "F2Wangle",
        "W2Fangle",
        "ANTdis",
        "F2W_blob_dis",
        "bp_F_delta",
        "bp_W_delta",
        "ap_F_delta",
        "ap_W_delta",
        "ant_W_delta",
    ]

    engineer = FeatureEngineer(df=df)
    engineer.standardize_features(feature_columns)
    return engineer.df


def train_model(X_train, Y_train, X_test, Y_test, input_size, hidden_size, output_size, num_epochs, batch_size, learning_rate, device, batch_first=True):
    """
    Train an RNN by delegating to ``train_rnn_model``.

    This is a thin pass-through: every argument is forwarded unchanged and
    in the same order, with ``batch_first`` passed by keyword.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training inputs.
    Y_train : numpy.ndarray
        Training targets.
    X_test : numpy.ndarray
        Test inputs.
    Y_test : numpy.ndarray
        Test targets.
    input_size : int
        Number of input features per time step.
    hidden_size : int
        Size of the hidden layer.
    output_size : int
        Size of the output layer.
    num_epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.
    learning_rate : float
        Optimizer learning rate.
    device : torch.device
        Device on which training runs.
    batch_first : bool, optional
        Forwarded to ``train_rnn_model``; defaults to True.

    Returns
    -------
    torch.nn.Module
        Whatever ``train_rnn_model`` returns (the trained model).
    """
    return train_rnn_model(
        X_train,
        Y_train,
        X_test,
        Y_test,
        input_size,
        hidden_size,
        output_size,
        num_epochs,
        batch_size,
        learning_rate,
        device,
        batch_first=batch_first,
    )


def save_model_and_config(model, model_name, timestamp, pickle_path, processed_data_path, config, model_dir, config_dir):
    """
    Saves the trained model weights and the configuration, unless an
    identical pair has already been saved.

    Files are named ``{timestamp}_model_{model_hash}_{config_hash}.pt`` and
    ``{timestamp}_config_{config_hash}.yaml``. The duplicate check ignores
    the timestamp prefix and matches on the hash part of those names, so
    re-saving the same model/config at a later time is skipped.

    Parameters
    ----------
    model : torch.nn.Module
        The trained model; only its ``state_dict()`` is saved.
    model_name : str
        The name of the model. Currently unused; kept for interface
        compatibility.
    timestamp : str
        The timestamp used as the prefix of the output file names.
    pickle_path : str
        The path to the input data pickle file. Currently unused.
    processed_data_path : str
        The path to the processed data. Currently unused.
    config : dict
        The configuration settings for the model.
    model_dir : pathlib.Path or str
        The directory to save the trained model in (created if missing).
    config_dir : pathlib.Path or str
        The directory to save the configuration in (created if missing).

    Returns
    -------
    None
    """
    model_dir = Path(model_dir)
    config_dir = Path(config_dir)

    # Fingerprints used in the file names (MD5 is used for naming only, not
    # for security).
    # NOTE(review): str(state_dict) hashes the *printed* tensors. PyTorch
    # abbreviates large tensors and rounds values when printing, and the
    # repr includes the device, so this is not a reliable content hash.
    # The scheme is kept so names stay compatible with existing artifacts.
    model_hash = hashlib.md5(str(model.state_dict()).encode('utf-8')).hexdigest()
    config_hash = hashlib.md5(str(config).encode('utf-8')).hexdigest()

    # The patterns must mirror the names written below. The previous check
    # looked for "rnn_model_<hash>.pt" / "config_<hash>.yaml", which are
    # never produced, so it could not detect an existing save.
    model_exists = any(model_dir.glob(f"*_model_{model_hash}_{config_hash}.pt"))
    config_exists = any(config_dir.glob(f"*_config_{config_hash}.yaml"))
    if model_exists and config_exists:
        logging.info("Model and configuration already exist. Skipping saving.")
        return

    model_dir.mkdir(parents=True, exist_ok=True)
    config_dir.mkdir(parents=True, exist_ok=True)

    # Save the trained model weights
    model_path = model_dir / f"{timestamp}_model_{model_hash}_{config_hash}.pt"
    torch.save(model.state_dict(), model_path)

    # Save the configuration settings
    config_path = config_dir / f"{timestamp}_config_{config_hash}.yaml"
    with open(config_path, "w", encoding="utf-8") as f:
        yaml.dump(config, f)

def save_train_test_data(X_train, Y_train, X_test, Y_test):
    """
    Pickle the four train/test arrays into a date-stamped directory.

    The files ``X_train.pkl``, ``Y_train.pkl``, ``X_test.pkl`` and
    ``Y_test.pkl`` are written (in that order) to
    ``data/processed/rnn_input/<YYYYMMDD>``, relative to the current working
    directory. The directory is created if needed; files from an earlier run
    on the same day are overwritten.

    Parameters
    ----------
    X_train : numpy.ndarray
        The training input sequences.
    Y_train : numpy.ndarray
        The training target values.
    X_test : numpy.ndarray
        The testing input sequences.
    Y_test : numpy.ndarray
        The testing target values.

    Returns
    -------
    pathlib.Path
        The directory the four files were written to.

    Raises
    ------
    Exception
        Any error raised while creating the directory or writing a file is
        logged and then re-raised unchanged.
    """
    # File name -> object to pickle; dict order fixes the write order.
    payloads = {
        "X_train.pkl": X_train,
        "Y_train.pkl": Y_train,
        "X_test.pkl": X_test,
        "Y_test.pkl": Y_test,
    }

    try:
        day_stamp = datetime.now().strftime("%Y%m%d")
        dir_name = Path(f"data/processed/rnn_input/{day_stamp}")
        dir_name.mkdir(parents=True, exist_ok=True)

        for file_name, payload in payloads.items():
            with open(dir_name / file_name, "wb") as handle:
                pickle.dump(payload, handle)

        logging.info(f"Saved train and test datasets to {dir_name}.")
    except Exception as e:
        logging.error(f"Error saving train and test datasets: {e}")
        raise
    return dir_name

In [3]:
# Either reuse previously saved train/test arrays (is_load = True) or rebuild
# them from the interim pickle: preprocess -> feature-engineer -> sequence
# split -> save.
#
# NOTE(review): the load branch reads from '../data/processed/rnn_input/'
# while the rebuild branch (and save_train_test_data) use 'data/...' paths
# without the '../' prefix — the two branches appear to assume different
# working directories; confirm.
# NOTE(review): pickle_path, input_data, timestamp and processed_data_path
# are assigned only in the rebuild branch, so they are undefined for later
# cells when is_load is True.
is_load = True
if is_load:
    X_train, Y_train, X_test, Y_test = load_train_test_data('../data/processed/rnn_input/')
else:
    # Build a preprocessor bound to the interim pickle and load the raw frame
    pickle_path = "data/interim/ff-mw.pkl"
    preprocessor = DataPreprocessor(pickle_path=pickle_path)
    logging.info("Loading data...")
    df = preprocessor.load_data()

    # Run the preprocessing pipeline; the result replaces `df`
    logging.info("Performing preprocessing steps...")
    df = preprocess_data(df)

    # Standardize the feature columns; the result replaces `df`
    logging.info("Performing feature engineering steps...")
    df = engineer_features(df)

    # Save the processed data
    # NOTE(review): this is called on the loader instance `preprocessor`,
    # whereas the processed frame lives in `df` (preprocess_data and
    # engineer_features wrap the frame in their own objects) — verify that
    # save_processed_data actually writes the processed data.
    logging.info("Saving processed data...")
    input_data = "ff-mw"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    processed_data_path = preprocessor.save_processed_data(input_data, timestamp)  # returns the path recorded as processed_data_path

    # Prepare sequences and train-test splits
    logging.info("Preparing sequences and train-test splits...")
    X_train, Y_train, X_test, Y_test = prepare_train_test_sequences(df)

    # Save the train-test splits (the returned directory is not kept)
    logging.info("Saving train-test splits...")
    save_train_test_data(X_train, Y_train, X_test, Y_test)

In [None]:
# Log the class balance of the label arrays.
# The labels are a single binary target:
#   0: no walk
#   1: walk
# np.unique(..., return_counts=True) yields (distinct values, their counts).
logging.info("Checking for data imbalance...")
for _split_name, _labels in (("Y_train", Y_train), ("Y_test", Y_test)):
    logging.info(f"{_split_name}: {np.unique(_labels, return_counts=True)}")


In [4]:
# Diagnostics: print the array shapes, then count NaN and +/-inf entries in
# each of the four arrays so the amount of non-finite data is visible before
# it is cleaned.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# NaN counts: np.isnan gives a boolean mask; .sum() counts the True entries.
num_nan_values_X_train = np.isnan(X_train).sum()
num_nan_values_X_test = np.isnan(X_test).sum()
num_nan_values_Y_train = np.isnan(Y_train).sum()
num_nan_values_Y_test = np.isnan(Y_test).sum()
print(f"Number of NaN values in train X set: {num_nan_values_X_train}")
print(f"Number of NaN values in test X set: {num_nan_values_X_test}")
print(f"Number of NaN values in train Y set: {num_nan_values_Y_train}")
print(f"Number of NaN values in test Y set: {num_nan_values_Y_test}")

# Infinity counts: np.isinf flags both +inf and -inf.
num_inf_values_X_train = np.isinf(X_train).sum()
num_inf_values_X_test = np.isinf(X_test).sum()
num_inf_values_Y_train = np.isinf(Y_train).sum()
num_inf_values_Y_test = np.isinf(Y_test).sum()
print(f"Number of inf values in train X set: {num_inf_values_X_train}")
print(f"Number of inf values in test X set: {num_inf_values_X_test}")
print(f"Number of inf values in train Y set: {num_inf_values_Y_train}")
print(f"Number of inf values in test Y set: {num_inf_values_Y_test}")

(6527728, 2, 19) (3264000, 2, 19) (6527728,) (3264000,)
Number of NaN values in train X set: 119625878
Number of NaN values in test X set: 49944411
Number of NaN values in train Y set: 0
Number of NaN values in test Y set: 0
Number of inf values in train X set: 135506
Number of inf values in test X set: 116331
Number of inf values in train Y set: 0
Number of inf values in test Y set: 0


In [5]:
# Overwrite every non-finite entry (NaN, +inf, -inf) with 0, in place, in all
# four arrays. ~np.isfinite(a) is True exactly where a is NaN or infinite.
# NOTE(review): in the recorded run roughly half of the X_train entries were
# NaN, so zero-filling replaces a large share of the inputs — confirm this is
# intended.
for _array in (X_train, X_test, Y_train, Y_test):
    _array[~np.isfinite(_array)] = 0
# handle_inf_na(X_train, X_test)

In [6]:

# Train the RNN model with fixed hyperparameters. The names defined here
# (input_size, hidden_size, ..., model) are reused by the cell that builds
# the config and saves the model.
#
# NOTE(review): in the recorded run the loss flattens at ~0.3133, which
# equals ln(1 + e^-1) — the lowest cross-entropy reachable with two classes
# when the values fed to the loss are confined to [0, 1]. Presumably the model
# applies softmax/sigmoid before a CrossEntropyLoss; verify in rnn_model.
# NOTE(review): the recorded 99.9% accuracy alongside an F1 score of 0.0
# suggests the positive class is never predicted (class imbalance) — accuracy
# alone is not a meaningful metric here.
print(f"Training RNN Model...\n===============================\n")
# Feature dimension is the last axis of X_train; nothing is subtracted, so
# every column of the last axis is used as an input feature.
input_size = X_train.shape[2] # - 1  ### -1 because we drop the target column
# print(f"Input size: {input_size}\n\n")
hidden_size = 64
output_size = 2  # two output units for the binary no-walk / walk target
num_epochs = 10
batch_size = 512
learning_rate = 0.001
batch_first = True
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = train_model(X_train, Y_train, X_test, Y_test, input_size,
                    hidden_size, output_size, num_epochs, batch_size, learning_rate, device, batch_first=batch_first)


Training RNN Model...

Input size: 19


Epoch 1/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.767469  [  512/6527728]
Loss: 0.329938  [653312/6527728]
Loss: 0.341922  [1306112/6527728]
Loss: 0.320609  [1958912/6527728]
Loss: 0.318394  [2611712/6527728]
Loss: 0.326418  [3264512/6527728]
Loss: 0.332280  [3917312/6527728]
Loss: 0.319775  [4570112/6527728]
Loss: 0.317503  [5222912/6527728]
Loss: 0.317598  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 1:
	Sum of squared gradients :    2152.8122
	Sum of squared parameters:  425310.0573



Train Error: 
 Avg loss: 0.343351
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.322466 

Epoch 2/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.317434  [  512/6527728]
Loss: 0.314492  [653312/6527728]
Loss: 0.318075  [1306112/6527728]
Loss: 0.314417  [1958912/6527728]
Loss: 0.314329  [2611712/6527728]
Loss: 0.317415  [3264512/6527728]
Loss: 0.320617  [3917312/6527728]
Loss: 0.318167  [4570112/6527728]
Loss: 0.314764  [5222912/6527728]
Loss: 0.315246  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 2:
	Sum of squared gradients :      21.0592
	Sum of squared parameters:  438541.4672



Train Error: 
 Avg loss: 0.319433
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.318319 

Epoch 3/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.314840  [  512/6527728]
Loss: 0.313823  [653312/6527728]
Loss: 0.315504  [1306112/6527728]
Loss: 0.313783  [1958912/6527728]
Loss: 0.313778  [2611712/6527728]
Loss: 0.315514  [3264512/6527728]
Loss: 0.317493  [3917312/6527728]
Loss: 0.317741  [4570112/6527728]
Loss: 0.314067  [5222912/6527728]
Loss: 0.314475  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 3:
	Sum of squared gradients :       7.9613
	Sum of squared parameters:  446251.3208



Train Error: 
 Avg loss: 0.316976
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.316845 

Epoch 4/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.314140  [  512/6527728]
Loss: 0.313603  [653312/6527728]
Loss: 0.314628  [1306112/6527728]
Loss: 0.313594  [1958912/6527728]
Loss: 0.313596  [2611712/6527728]
Loss: 0.314762  [3264512/6527728]
Loss: 0.316155  [3917312/6527728]
Loss: 0.317569  [4570112/6527728]
Loss: 0.313802  [5222912/6527728]
Loss: 0.314126  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 4:
	Sum of squared gradients :       3.7313
	Sum of squared parameters:  451690.9150



Train Error: 
 Avg loss: 0.315992
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.316121 

Epoch 5/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.313854  [  512/6527728]
Loss: 0.313510  [653312/6527728]
Loss: 0.314224  [1306112/6527728]
Loss: 0.313504  [1958912/6527728]
Loss: 0.313505  [2611712/6527728]
Loss: 0.314366  [3264512/6527728]
Loss: 0.315424  [3917312/6527728]
Loss: 0.317474  [4570112/6527728]
Loss: 0.313660  [5222912/6527728]
Loss: 0.313926  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 5:
	Sum of squared gradients :       2.3789
	Sum of squared parameters:  455903.4555



Train Error: 
 Avg loss: 0.315472
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.315689 

Epoch 6/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.313700  [  512/6527728]
Loss: 0.313455  [653312/6527728]
Loss: 0.313993  [1306112/6527728]
Loss: 0.313451  [1958912/6527728]
Loss: 0.313452  [2611712/6527728]
Loss: 0.314126  [3264512/6527728]
Loss: 0.314971  [3917312/6527728]
Loss: 0.317414  [4570112/6527728]
Loss: 0.313574  [5222912/6527728]
Loss: 0.313797  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 6:
	Sum of squared gradients :       1.8266
	Sum of squared parameters:  459331.9087



Train Error: 
 Avg loss: 0.315141
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.315367 

Epoch 7/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.313606  [  512/6527728]
Loss: 0.313420  [653312/6527728]
Loss: 0.313846  [1306112/6527728]
Loss: 0.313417  [1958912/6527728]
Loss: 0.313417  [2611712/6527728]
Loss: 0.313967  [3264512/6527728]
Loss: 0.314667  [3917312/6527728]
Loss: 0.317374  [4570112/6527728]
Loss: 0.313518  [5222912/6527728]
Loss: 0.313709  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 7:
	Sum of squared gradients :       1.2419
	Sum of squared parameters:  462186.9147



Train Error: 
 Avg loss: 0.314909
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.315168 

Epoch 8/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.313544  [  512/6527728]
Loss: 0.313396  [653312/6527728]
Loss: 0.313747  [1306112/6527728]
Loss: 0.313393  [1958912/6527728]
Loss: 0.313393  [2611712/6527728]
Loss: 0.313855  [3264512/6527728]
Loss: 0.314450  [3917312/6527728]
Loss: 0.317344  [4570112/6527728]
Loss: 0.313478  [5222912/6527728]
Loss: 0.313645  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 8:
	Sum of squared gradients :       0.8808
	Sum of squared parameters:  464643.0558



Train Error: 
 Avg loss: 0.314760
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.315028 

Epoch 9/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.313500  [  512/6527728]
Loss: 0.313378  [653312/6527728]
Loss: 0.313674  [1306112/6527728]
Loss: 0.313376  [1958912/6527728]
Loss: 0.313375  [2611712/6527728]
Loss: 0.313772  [3264512/6527728]
Loss: 0.314288  [3917312/6527728]
Loss: 0.317322  [4570112/6527728]
Loss: 0.313449  [5222912/6527728]
Loss: 0.313597  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 9:
	Sum of squared gradients :       0.6693
	Sum of squared parameters:  466808.4121



Train Error: 
 Avg loss: 0.314650
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.314921 

Epoch 10/10
-------------------------------
Print interval: 1275
Number of batches: 12750
Batch size: 512
Loss: 0.313467  [  512/6527728]
Loss: 0.313365  [653312/6527728]
Loss: 0.313620  [1306112/6527728]
Loss: 0.313363  [1958912/6527728]
Loss: 0.313362  [2611712/6527728]
Loss: 0.313709  [3264512/6527728]
Loss: 0.314162  [3917312/6527728]
Loss: 0.317305  [4570112/6527728]
Loss: 0.313426  [5222912/6527728]
Loss: 0.313558  [5875712/6527728]


INFO:root:Sum squared grads/params in Epoch 10:
	Sum of squared gradients :       0.5253
	Sum of squared parameters:  468744.1679



Train Error: 
 Avg loss: 0.314566
Test Error: 
 Accuracy: 99.9%, Avg loss: 0.314837 



NameError: name 'timestamp' is not defined

In [9]:
# Build the run configuration from the hyperparameters of the training cell
# and save the trained model together with that configuration.
#
# The path and timestamp variables are (re)defined here because the
# data-loading cell does not assign them when it takes its is_load branch.
# NOTE(review): processed_data_path here lacks the '../' prefix used when the
# arrays were loaded — confirm which working directory is intended.
pickle_path = "data/interim/ff-mw.pkl"
processed_data_path = "data/processed/rnn_input/"
# Create the model name: <architecture>_<input data>_v<version>
# The timestamp is taken now (minute resolution), i.e. at save time rather
# than at the start of training.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
model_architecture = "rnn"
input_data = "ff-mw"
version_number = 1
model_name = f"{model_architecture}_{input_data}_v{version_number}"

# Create the configuration dictionary
config = create_config_dict(
    model_name=f"{timestamp}_{model_name}",
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_epochs=num_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    raw_data_path=None,
    interim_data_path=pickle_path,
    processed_data_path=processed_data_path,
    logging_level='DEBUG',
    logging_format='%(asctime)s - %(levelname)s - %(module)s - %(message)s'
)  # Dictionary of configuration settings that is written to YAML below

# Output directories are per model name and are created if missing
model_dir = Path(f"models/{model_name}")
model_dir.mkdir(parents=True, exist_ok=True)

config_dir = Path(f"config/{model_name}")
config_dir.mkdir(parents=True, exist_ok=True)

# Save the trained model weights and the configuration settings
save_model_and_config(model, model_name, timestamp, pickle_path, processed_data_path, config, model_dir, config_dir)


In [14]:
# from sklearn.metrics import f1_score
# from torch.utils.data import DataLoader, Dataset
# import torch.nn as nn
# # Initialize a new model
# input_size = X_test.shape[2]  # Make sure this is correct
# hidden_size = 64
# output_size = 2
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# new_model = rnn_model.RNN(input_size=input_size, hidden_size=hidden_size, output_size=output_size, batch_first=True).to(device)

# # Load the model
# model_path = "models/rnn_ff-mw_v1/20231020_1701_model_2c92c2793be07eaf3765665d6287ded4_971fce5d8c82c2d1bf8db68939c8162d.pt"
# state_dict = torch.load(model_path)
# new_model.load_state_dict(state_dict)
# # loaded_model = torch.load(model_path)
# new_model.eval()  # Set the model to evaluation mode

# def evaluate_f1(model, X_test, Y_test, batch_size, device):
#     test_dataset = rnn_model.WalkDataset(X_test, Y_test)
#     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
#     model.eval()

#     # Initialize the running loss and the accuracy counters
#     running_loss = 0.0
#     correct = 0
#     total = 0
    
#     y_true = []
#     y_pred = []

#     with torch.no_grad():
#         for i, (inputs, labels) in enumerate(test_loader):
#             inputs, labels = inputs.to(device), labels.to(device)
#             outputs = model(inputs)
#             # Using CrossEntropyLoss as the loss function
#             criterion = nn.CrossEntropyLoss()
#             loss = criterion(outputs, labels)  # Compute loss
#             running_loss += loss.item()  # Accumulate loss
#             _, predicted = torch.max(outputs.data, 1)
#             total += labels.size(0)  # Accumulate total number of samples
#             correct += (predicted == labels).sum().item()
#             y_true.extend(labels.cpu().numpy().tolist())
#             y_pred.extend(predicted.cpu().numpy().tolist())
            
#     # Calculate average loss and accuracy over all batches
#     test_loss = running_loss / len(test_loader)
#     test_acc = correct / total

#     print(
#         f"Test Error: \n Accuracy: {(100*test_acc):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    
#     f1 = f1_score(y_true, y_pred)  # You can change the "average" parameter to suit your needs

#     return f1

# # Make sure the saved weights have been loaded into `new_model` (see above) before evaluating
# # Also, ensure X_test, Y_test, batch_size and device are set

# f1 = evaluate_f1(new_model, X_test, Y_test, batch_size, device)
# print(f"F1 Score: {f1}")

Test Error: 
 Accuracy: 99.9%, Avg loss: 0.314837 

F1 Score: 0.0
