## **Step 1: Code to Load and Featurize Both Datasets**
First, we need a function to load the raw accelerometer data and another to extract features using tsfel. We'll apply this to the UCI training data and your custom test data.

In [6]:
import pandas as pd
import numpy as np
import os
import tsfel
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# --- Configuration ---
# Set the base paths for your custom data and the UCI dataset.
MY_DATA_PATH = 'data/myData'
UCI_DATA_PATH = 'data/UCI_data/train'

# --- Data Loading Functions ---

def load_uci_raw_data(base_path):
    """
    Loads raw inertial signals and labels from the UCI HAR dataset structure.
    """
    signals_path = os.path.join(base_path, 'Inertial Signals/')
    signal_files = ['total_acc_x_train.txt', 'total_acc_y_train.txt', 'total_acc_z_train.txt']

    try:
        y = pd.read_csv(os.path.join(base_path, 'y_train.txt'), header=None).iloc[:, 0]
        signals = [pd.read_csv(os.path.join(signals_path, f), header=None, delim_whitespace=True) for f in signal_files]
    except FileNotFoundError as e:
        print(f"Error: Could not find UCI data file - {e}.")
        return [], None

    data_list = []
    for i in range(len(signals[0])):
        df = pd.DataFrame({
            'ax': signals[0].iloc[i],
            'ay': signals[1].iloc[i],
            'az': signals[2].iloc[i]
        })
        data_list.append(df)
            
    return data_list, y

def load_my_raw_data_custom(base_path):
    """
    Loads and parses custom accelerometer data from a flat directory
    based on a '{participant}_{activity}.csv' naming convention.
    """
    data_list = []
    labels_list = []
    
    activity_map = {
        'sit': 'SITTING',
        'sleep': 'LAYING',
        'stand': 'STANDING',
        'walk': 'WALKING',
        'walkd': 'WALKING_DOWNSTAIRS',
        'walku': 'WALKING_UPSTAIRS'
    }
    
    if not os.path.exists(base_path):
        print(f"Error: The directory '{base_path}' does not exist.")
        return [], None
        
    for file_name in os.listdir(base_path):
        if file_name.endswith('.csv'):
            try:
                activity_key = file_name.split('_')[1].split('.')[0]
                label = activity_map.get(activity_key)
                
                if label:
                    file_path = os.path.join(base_path, file_name)
                    df = pd.read_csv(file_path, usecols=['ax', 'ay', 'az'])
                    data_list.append(df)
                    labels_list.append(label)
            except IndexError:
                # This handles files that don't match the expected naming pattern.
                pass

    if not data_list:
        print("Error: No valid CSV data could be loaded. Verify file names and content.")

    return data_list, pd.Series(labels_list)

# --- Feature Engineering ---

def extract_tsfel_features(data_list):
    """
    Extracts a comprehensive set of time-series features using TSFEL.
    Runs silently without progress bars.
    """
    if not data_list:
        return pd.DataFrame()
    
    cfg = tsfel.get_features_by_domain()
    features_list = [tsfel.time_series_features_extractor(cfg, df, fs=50, verbose=0) for df in data_list]
    return pd.concat(features_list, ignore_index=True)

# --- Main Execution Block ---

def main():
    """
    Main function to execute the data loading, feature extraction,
    model training, and evaluation pipeline.
    """
    # 1. Process UCI Training Data
    print("Processing UCI HAR dataset...")
    uci_raw_list, y_train_uci_raw = load_uci_raw_data(UCI_DATA_PATH)
    if not uci_raw_list:
        print("Aborting: UCI data loading failed.")
        return

    X_train_uci_tsfel = extract_tsfel_features(uci_raw_list)
    uci_activity_labels = {
        1: 'WALKING', 2: 'WALKING_UPSTAIRS', 3: 'WALKING_DOWNSTAIRS', 
        4: 'SITTING', 5: 'STANDING', 6: 'LAYING'
    }
    y_train_uci_tsfel = y_train_uci_raw.map(uci_activity_labels)
    print(f"UCI data processed. Feature matrix shape: {X_train_uci_tsfel.shape}")

    # 2. Process Custom Test Data
    print("\nProcessing custom dataset...")
    my_raw_list, y_test_my_data = load_my_raw_data_custom(MY_DATA_PATH)
    if not my_raw_list:
        print("Aborting: Custom data loading failed.")
        return

    X_test_my_data_tsfel = extract_tsfel_features(my_raw_list)
    print(f"Custom data processed. Feature matrix shape: {X_test_my_data_tsfel.shape}")

    # 3. Align Features, Train Model, and Predict
    print("\nTraining model and performing prediction...")
    
    # Sanitize column names for model compatibility.
    X_train_uci_tsfel.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X_train_uci_tsfel.columns]
    X_test_my_data_tsfel.columns = ["".join(c if c.isalnum() else "_" for c in str(x)) for x in X_test_my_data_tsfel.columns]
    
    # Use only the features present in both datasets.
    common_features = X_train_uci_tsfel.columns.intersection(X_test_my_data_tsfel.columns)
    X_train = X_train_uci_tsfel[common_features]
    X_test = X_test_my_data_tsfel[common_features]

    model = DecisionTreeClassifier(max_depth=8, random_state=42)
    model.fit(X_train, y_train_uci_tsfel)

    y_pred = model.predict(X_test)

    # 4. Evaluate and Report Performance
    print("\n--- Model Performance on Custom Data ---")
    accuracy = accuracy_score(y_test_my_data, y_pred)
    report = classification_report(y_test_my_data, y_pred, zero_division=0)
    
    all_labels = sorted(list(set(y_test_my_data) | set(y_pred)))
    cm = confusion_matrix(y_test_my_data, y_pred, labels=all_labels)

    print(f"Accuracy: {accuracy:.4f}\n")
    print("Classification Report:")
    print(report)

    # Plot the confusion matrix for visual analysis.
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=all_labels, yticklabels=all_labels)
    plt.title('Confusion Matrix: UCI-Trained (TSFEL) Model on Your Data')
    plt.ylabel('Actual (Your Data)')
    plt.xlabel('Predicted by Model')
    plt.show()

if __name__ == '__main__':
    main()

Processing UCI HAR dataset...


  signals = [pd.read_csv(os.path.join(signals_path, f), header=None, delim_whitespace=True) for f in signal_files]
  signals = [pd.read_csv(os.path.join(signals_path, f), header=None, delim_whitespace=True) for f in signal_files]
  signals = [pd.read_csv(os.path.join(signals_path, f), header=None, delim_whitespace=True) for f in signal_files]


UCI data processed. Feature matrix shape: (7352, 468)

Processing custom dataset...


ValueError: Usecols do not match columns, columns expected but not found: ['ax', 'ay', 'az']

## **Step 2: Code to Train the Model and Predict**
Now that both datasets have the same TSFEL features, we can train on the UCI data and test on your data

In [None]:
# --- Train and Evaluate the Model ---

# Ensure both dataframes have the same columns
common_features = X_train_uci_tsfel.columns.intersection(X_test_my_data_tsfel.columns)
X_train = X_train_uci_tsfel[common_features]
X_test = X_test_my_data_tsfel[common_features]

# Train the Decision Tree classifier
model = DecisionTreeClassifier(max_depth=8, random_state=42)
model.fit(X_train, y_train_uci_tsfel)

# Predict on your featurized data
y_pred = model.predict(X_test)

# --- Report the Results ---
accuracy = accuracy_score(y_test_my_data, y_pred)
report = classification_report(y_test_my_data, y_pred)
cm = confusion_matrix(y_test_my_data, y_pred, labels=model.classes_)

print("\n--- TSFEL Model Performance on Your Data ---")
print(f"Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(report)

# Plot the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=model.classes_, yticklabels=model.classes_)
plt.title('Confusion Matrix: UCI-Trained (TSFEL) Model on Your Data')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

### **Why Not Authors features** ?

In [None]:
# --- Load Author-Provided UCI Feature Data ---
X_train_author = pd.read_csv('data/UCI_data/train/X_train.txt', header=None, delim_whitespace=True)
y_train_author_raw = pd.read_csv('data/UCI_data/train/y_train.txt', header=None, squeeze=True)
y_train_author = y_train_author_raw.map(activity_labels)

# Train the model
author_model = DecisionTreeClassifier(max_depth=8, random_state=42)
author_model.fit(X_train_author, y_train_author)

print("Model trained on author-provided UCI features.")