# TBL Model Development: Random Forest Regression

In [1]:
# imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from joblib import Parallel, delayed


In [2]:
# Processing Data Function (switching to using radius)

def read_and_process_data(directory_path):
    data_frames = []
    columns_to_extract = ['radius_X', 'radius_Y', 'radius_Z', 'radius_Ox', 'radius_Oy', 'radius_Oz', 'Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz']

    # Assuming each cycle has exactly 356 data points
    total_data_points = 356

    for file_name in os.listdir(directory_path):
        if file_name.endswith(".csv"):
            file_path = os.path.join(directory_path, file_name)
            participant = int(os.path.basename(file_path).split('_')[0])
            cycle_id = os.path.basename(file_path).split('_')[1].split('.')[0]  # Extract cycle_id
            intensity = cycle_id[:4]  # Extract the first four characters of cycle_id as intensity

            # Read data from CSV and select only the desired columns
            df = pd.read_csv(file_path, usecols=columns_to_extract)

            # Add participant ID, cycle_id, and participant_cycle_id as features
            df['Participant'] = participant
            df['Cycle_ID'] = cycle_id
            df['Participant_Cycle_ID'] = f"{participant}_{cycle_id}"
            #df['Intensity'] = intensity

            # Since Data will get shuffled (idk why but shuffling makes the model so much better.. ??)
            # So thus, need to store original index values 
            df['Original_Index'] = df.index

            # # Add normalized_cycle_position
            # df['Normalized_Cycle_Position'] = df.index / (total_data_points - 1)

            if (intensity == "HIIT"):
                df['Intensity'] = .90
            else:
                df['Intensity'] = .50
            
            # df['Intensity'] = intensity  # this is either "HIIT" or "MICT"

            data_frames.append(df)

    # Concatenate all data frames
    processed_data = pd.concat(data_frames, ignore_index=True)

    # Merge with participant weights
    weights_df = pd.read_csv("Participant Weights.csv")
    weights_df['Weight'] = weights_df['Weight'].astype(float)
    weights_df['Wingspan'] = weights_df['Wingspan'].astype(float)
    processed_data = pd.merge(processed_data, weights_df, left_on='Participant', right_on='Participant')

    return processed_data


In [3]:
# Set up paths
data_directory = "Processed Data for ML"
output_folder = "Train and Test Data (NEW 6)" # no shuffling for trial 3

# Read and process data
data = read_and_process_data(data_directory)

# Get unique participants
participants = data['Participant'].unique()

# Changing the order of the participants (now we're sorting it from 1-21)
participants = np.sort(participants)

# Check that we have exactly 20 participants
assert len(participants) == 20, "The number of participants is not equal to 20."

# Randomly choose 2 participants to leave out for testing or manually select
test_participants = np.random.choice(participants, 2, replace=False)
# test_participants = [1, 13]
# Select the participants for training
train_participants = np.setdiff1d(participants, test_participants)

print(test_participants)
print("for the train")
print(train_participants)

# Split the data into train and test based on the selected participants
train_data = data[data['Participant'].isin(train_participants)]
test_data = data[data['Participant'].isin(test_participants)]

train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Specify the output columns
output_columns = ['Fx', 'Fy', 'Fz', 'Tx', 'Ty', 'Tz']

# Create X (input) and y (output) for train and test
X_train = train_data.drop(output_columns, axis=1)
y_train = train_data[output_columns]
X_test = test_data.drop(output_columns, axis=1)
y_test = test_data[output_columns]

data_folder = os.path.join(output_folder, "Train Test Split")
os.makedirs(data_folder, exist_ok=True)

# Save train and test data to CSV
train_data.to_csv(os.path.join(data_folder, "train_data.csv"), index=False)
test_data.to_csv(os.path.join(data_folder, "test_data.csv"), index=False)

# Save participant numbers to text files
np.savetxt(os.path.join(data_folder, "train_participants.txt"), train_participants, fmt='%d')
np.savetxt(os.path.join(data_folder, "test_participant.txt"), test_participants, fmt='%d')

[15  8]
for the train
[ 1  2  3  4  5  6  9 10 11 12 13 14 16 17 18 19 20 21]


In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import gc  # Import the garbage collector module

# Define numerical features
numeric_features = ['radius_X', 'radius_Y', 'radius_Z', 'radius_Ox', 'radius_Oy', 'radius_Oz', 'Weight', 'Wingspan', 'Intensity']
#categorical_features = ['Intensity']

# Create transformers for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor that applies transformers to specific columns 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        #('cat', categorical_transformer, categorical_features)
    ]
)

regressor = RandomForestRegressor(n_estimators=700, n_jobs=-1)

# Create a pipeline with the preprocessor and the regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', regressor)
])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

best_model = pipeline

In [5]:
def calculate_mdpe_by_participant_cycle_ids(y_pred, y_test, X_test):
    """
    Calculate Median Percentage Error (MDPE) for each unique participant_cycle_id for a specific output column.
    
    Args:
    - y_pred (numpy array): Predicted values.
    - y_test (numpy array): True values.
    - X_test (DataFrame): DataFrame containing the test data including 'Participant_Cycle_ID'.
    - output_column (str): Name of the output column.
    
    Returns:
    - mdpe_scores (dict): Dictionary containing MDPE scores for each unique participant_cycle_id.
    """
    mdpe_scores = []
    
    # Get unique participant_cycle_ids
    unique_participant_cycle_ids = X_test['Participant_Cycle_ID'].unique()
    
    # Calculate MDPE for each unique participant_cycle_id
    for unique_id in unique_participant_cycle_ids:
        mask = X_test['Participant_Cycle_ID'] == unique_id
        y_pred_id = y_pred[mask]
        y_test_id = y_test[mask]
        
        # Exclude NaN values
        mask_valid = ~np.isnan(y_test_id)
        y_pred_id = y_pred_id[mask_valid]
        y_test_id = y_test_id[mask_valid]
        
        mdpe = np.median((y_pred_id - y_test_id) / y_test_id * 100)
        mdpe_scores.append(mdpe)
        
    return mdpe_scores

In [6]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

output_directory = "RF Outputs NEW 6"
os.makedirs(output_directory, exist_ok=True)

# Predict y_pred for all output columns in this fold
y_pred = best_model.predict(X_test)

# Convert the NumPy array to a pandas DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=output_columns)

# Save selected columns from X_test, y_test, and y_pred for this fold as CSV
fold_output_directory = os.path.join(output_directory, "output")
os.makedirs(fold_output_directory, exist_ok=True)

# Save train and test data to CSV
X_test.to_csv(os.path.join(fold_output_directory, "X_test.csv"), index=False)
y_test.to_csv(os.path.join(fold_output_directory, "y_test.csv"), index=False)
y_pred_df.to_csv(os.path.join(fold_output_directory, "y_pred.csv"), index=False)

# Calculate MDPE scores for this fold and each output column
mdpe_scores_list = []
for i, output_col in enumerate(output_columns):
    y_pred_fold_output = y_pred[:, i]
    
    mdpe_scores = calculate_mdpe_by_participant_cycle_ids(y_pred_fold_output, y_test[output_col], X_test)
    mdpe_scores_list.append(pd.DataFrame({'Output': [output_col]*len(mdpe_scores), 'MDPE': mdpe_scores}))
    
    average = np.mean(mdpe_scores)
    std = np.std(mdpe_scores)
    print(f"Output Column: {output_col}")
    print("Average of MdPEs:", average)
    print("Standard Deviation of MdPEs:", std)
    
    # Save average and standard deviation of MdPEs to the output directory
    with open(os.path.join(output_directory, "mdpe_scores.txt"), 'a') as f:
        f.write(f"Output Column: {output_col}\n")
        f.write(f"Average of MdPEs: {average}\n")
        f.write(f"Standard Deviation of MdPEs: {std}\n\n")

# Combine MDPE scores for all output columns in this fold
mdpe_df = pd.concat(mdpe_scores_list)

# Plot MDPE scores for this fold and all output columns
plt.figure(figsize=(12, 8))
sns.boxplot(x='Output', y='MDPE', data=mdpe_df, palette='husl')  # Using 'husl' palette for more colorful plots
plt.title(f'Box-and-Whisker Plot of MDPE Scores')
plt.ylabel('MDPE')
plt.xlabel('Output Column')
plt.xticks(rotation=45)
plt.savefig(os.path.join(fold_output_directory, f"MDPE_plot.png"))
plt.close()

Output Column: Fx
Average of MdPEs: -23.129211019672848
Standard Deviation of MdPEs: 25.508001382611635
Output Column: Fy
Average of MdPEs: -15.949662936969176
Standard Deviation of MdPEs: 26.99036200872597
Output Column: Fz
Average of MdPEs: -54.81425165380324
Standard Deviation of MdPEs: 45.96581003868795
Output Column: Tx
Average of MdPEs: -4.617479602487032
Standard Deviation of MdPEs: 30.79975880710562
Output Column: Ty
Average of MdPEs: -14.624375991782054
Standard Deviation of MdPEs: 30.392853433109703
Output Column: Tz
Average of MdPEs: -14.443587746510715
Standard Deviation of MdPEs: 29.803173300465087


In [7]:
print("\nFeature importances:")

# Extract preprocessor and regressor from the pipeline
preprocessor = best_model.named_steps['preprocessor']
regressor = best_model.named_steps['regressor']

# Get feature names after preprocessing
feature_names = preprocessor.transformers_[0][2]  # Numeric feature names

# Get feature importances
feature_importances = regressor.feature_importances_

# Sort indices by importance
sorted_indices = np.argsort(feature_importances)[::-1]

# Print feature importances
for idx in sorted_indices:
    print(f"Feature {feature_names[idx]}: {feature_importances[idx]}")


Feature importances:


Feature radius_Y: 0.4117619593867028
Feature radius_Z: 0.1701532375007475
Feature radius_X: 0.11841119300457034
Feature Wingspan: 0.06829310969225834
Feature radius_Oz: 0.06268375719089815
Feature radius_Oy: 0.05887355198231738
Feature Weight: 0.05463558794285864
Feature Intensity: 0.03437848275729298
Feature radius_Ox: 0.020809120542354013
