# Feature Engineering

In [None]:
## Setup
import sys
from pathlib import Path

# make sure parent folder (project root) is on the path
# Path.cwd() is the kernel's working directory; its parent is taken as the
# project root so that the `src` package imported below can be resolved.
project_root = Path.cwd().parent  # adjust if your notebook lives somewhere else
# Inserted at index 0 so this location is searched before the other sys.path
# entries. Note: each re-run of this cell inserts the same entry again.
sys.path.insert(0, str(project_root))

# Project-local helpers used by the cells below (resolvable only after the
# sys.path insert above).
from src.preprocess_data import (
    load_cycles,
    build_summary_features,
    build_fft_features,
    assemble_features,
    save_features
)

# Load the cycles from the processed CSV. The relative path is resolved against
# the kernel's working directory, the same assumption project_root relies on.
cycles = load_cycles("../data/processed/hydraulic_cycles.csv")


## 1. Summary-stat Features

In [2]:
# Build the summary-statistic features from the loaded cycles. Judging by the
# column names in the saved output, this yields a mean, std and slope column per
# sensor channel -- confirm against build_summary_features.
summary_df = build_summary_features(cycles)
print(summary_df.shape)
# Last expression of the cell: rendered as the cell's output.
summary_df.head()

(2205, 51)


Unnamed: 0,PS1_mean,PS1_std,PS1_slope,PS2_mean,PS2_std,PS2_slope,PS3_mean,PS3_std,PS3_slope,PS4_mean,...,VS1_slope,CE_mean,CE_std,CE_slope,CP_mean,CP_std,CP_slope,SE_mean,SE_std,SE_slope
0,160.673492,13.938147,-0.004043,109.466914,47.110581,0.017231,1.991475,0.945626,0.000281,0.0,...,-0.000618,39.60135,6.317224,-0.359653,1.86275,0.277047,-0.015649,59.157183,23.565119,0.807209
1,160.60332,14.117791,-0.004177,109.35489,47.04169,0.017187,1.976234,0.941889,0.000278,0.0,...,-0.000858,25.786433,1.672019,-0.094054,1.25555,0.073981,-0.004121,59.335617,23.658267,0.822355
2,160.34772,14.191436,-0.004211,109.158845,46.988144,0.017162,1.972224,0.943422,0.000279,0.0,...,-0.001313,22.218233,0.633003,-0.032079,1.113217,0.023069,-0.000819,59.54315,23.723181,0.820881
3,160.188088,14.226617,-0.004222,109.064807,46.968307,0.017157,1.946575,0.935456,0.000268,0.0,...,-0.000875,20.459817,0.451941,-0.013245,1.06215,0.024415,0.000114,59.7949,23.821972,0.817013
4,160.000472,14.275244,-0.004249,108.931434,46.87104,0.017109,1.922707,0.930258,0.000268,0.0,...,-0.001028,19.787017,0.287728,-0.002295,1.070467,0.021297,0.000646,59.455267,23.771653,0.835986


## 2. FFT Features

In [3]:
# Build the FFT features with n_bins=5; the saved output shows five
# `<sensor>_fft_1..5` columns per channel. The assemble cell passes fft_bins=5,
# presumably the same setting -- keep the two in sync (TODO confirm).
fft_df = build_fft_features(cycles, n_bins=5)
print(fft_df.shape)
# Last expression of the cell: rendered as the cell's output.
fft_df.head()

(2205, 85)


Unnamed: 0,PS1_fft_1,PS1_fft_2,PS1_fft_3,PS1_fft_4,PS1_fft_5,PS2_fft_1,PS2_fft_2,PS2_fft_3,PS2_fft_4,PS2_fft_5,...,CP_fft_1,CP_fft_2,CP_fft_3,CP_fft_4,CP_fft_5,SE_fft_1,SE_fft_2,SE_fft_3,SE_fft_4,SE_fft_5
0,13390.23064,41124.954872,25006.857544,22983.941228,7101.881116,133014.553,88393.857199,79060.693102,50603.331167,36050.71699,...,10.014761,3.78146,2.564793,2.043909,1.556604,549.325209,461.093339,414.805432,340.830538,245.353986
1,14186.784491,41688.033643,25491.721727,23255.265434,6931.964505,132775.540316,88207.167898,78974.853884,50624.110915,36093.679734,...,2.528716,0.813515,0.875792,0.782996,0.458064,562.590366,465.026661,416.573933,337.21218,240.83773
2,14315.995606,41976.548343,25502.637064,23362.982574,6894.654941,132666.907088,88009.207379,78979.336646,50466.582155,36073.469207,...,0.65167,0.37354,0.393021,0.118828,0.175458,560.457045,467.494335,414.941344,341.463301,240.2198
3,14366.954594,42046.83027,25568.20552,23437.302458,6948.154506,132661.420154,87909.709549,78956.780388,50412.566878,36032.463684,...,0.726212,0.537487,0.213082,0.107253,0.07654,557.762659,464.853532,422.323211,345.164357,245.77364
4,14533.768205,42147.319629,25634.935384,23512.643376,6961.611369,132358.372652,87740.449043,78844.540691,50350.222135,36033.855296,...,0.353056,0.697451,0.26091,0.055567,0.024701,575.006044,473.228523,423.250143,332.955534,230.49317


## Assemble & Save

In [4]:
# combine summary + FFT + targets
# Both feature families are enabled; fft_bins=5 is the same bin count that the
# FFT cell passes as n_bins to build_fft_features.
features = assemble_features(cycles, include_summary=True, include_fft=True, fft_bins=5)
print(features.shape)
# Persist the assembled table. NOTE(review): the "extract test data" cell below
# reads data/processed/features.csv back and overwrites it with the training
# subset, so this cell must be re-run before that one is re-run.
save_features(features, "../data/processed/features.csv")

(2205, 140)


## Extract Test Data for the Dash App

In [5]:
"""
Script: extract_dash_test_data.py

Randomly samples 10% of rows from features.csv, writes out test data and expected values,
then removes those rows from the original to produce a training set.
Usage:
    python src/extract_dash_test_data.py
"""
import pandas as pd
from pathlib import Path

# Determine project root reliably
try:
    # when run as script
    # (two levels up from this file, i.e. the parent of the directory holding it)
    BASE_DIR = Path(__file__).parent.parent.resolve()
except NameError:
    # when run interactively (e.g., notebook)
    # `__file__` is undefined here, so fall back to the parent of the current
    # working directory.
    BASE_DIR = Path().resolve().parent

# Data directory and file paths
data_dir      = BASE_DIR / 'data' / 'processed'
FEATURES_CSV  = data_dir / 'features.csv'
TEST_DATA_CSV = data_dir / 'dash_test_data.csv'
TEST_EXPECTED = data_dir / 'dash_test_expected_values.csv'
# NOTE(review): TRAIN_CSV is the same path as FEATURES_CSV, so step 6 overwrites
# the input file with the training subset. Re-running this cell without first
# regenerating features.csv samples another 10% from the already-reduced table.
TRAIN_CSV     = data_dir / 'features.csv'

# Ensure data directory exists
# (created before the existence check below, so a missing features.csv still
# leaves the directory in place when the error is raised)
data_dir.mkdir(parents=True, exist_ok=True)

# 1. Load full features table
if not FEATURES_CSV.exists():
    raise FileNotFoundError(f"Cannot find features.csv at {FEATURES_CSV}")

df = pd.read_csv(FEATURES_CSV)

# 2. Define target columns
# These are split off as the expected values; a missing one makes the drop /
# selection in step 4 raise KeyError.
target_cols = ["cooler_pct", "valve_pct", "pump_leak", "acc_pressure"]

# 3. Sample 10% of rows (reproducible)
# random_state=42 fixes the draw for a given input table.
test_df = df.sample(frac=0.1, random_state=42)

# 4. Split into feature-only and expected-value DataFrames
test_data     = test_df.drop(columns=target_cols)
test_expected = test_df[target_cols]

# 5. Remove sampled rows from original to get training set
# Rows are removed by index label; the sampled frame keeps the labels of `df`.
train_df = df.drop(test_df.index)

# 6. Save CSVs
# index=False: the row labels are not written, so the three files carry only
# the data columns.
test_data.to_csv(TEST_DATA_CSV, index=False)
test_expected.to_csv(TEST_EXPECTED, index=False)
train_df.to_csv(TRAIN_CSV, index=False)

print(f"Saved {len(test_data)} test rows to {TEST_DATA_CSV}")
print(f"Saved expected targets to {TEST_EXPECTED}")
print(f"Saved {len(train_df)} training rows to {TRAIN_CSV}")


Saved 220 test rows to C:\Users\jerem\OneDrive\Desktop\HydraulicHealth-Monitoring\data\processed\dash_test_data.csv
Saved expected targets to C:\Users\jerem\OneDrive\Desktop\HydraulicHealth-Monitoring\data\processed\dash_test_expected_values.csv
Saved 1985 training rows to C:\Users\jerem\OneDrive\Desktop\HydraulicHealth-Monitoring\data\processed\features.csv


In [7]:
import pandas as pd

# Export a small, label-free sample of raw cycles for the Dash app.
# The sample size lives in one named constant so that the comments, the
# `sample` call and the printed count cannot drift apart (the previous comment
# said "3" while the code sampled 1).
N_TEST_CYCLES = 1

# 1) Load the full merged cycles (with profile)
# Note: this re-binds `df`, which the previous cell used for the features table.
df = pd.read_csv("../data/processed/hydraulic_cycles.csv")

# 2) Sample N_TEST_CYCLES random cycle(s) (fixed seed for reproducibility)
test_cycles = df.sample(n=N_TEST_CYCLES, random_state=42)

# 3) Drop the profile/label columns
# All five names must be present in the CSV; a missing one raises KeyError.
profile_cols = ["cooler_pct", "valve_pct", "pump_leak", "acc_pressure", "stable_flag"]
test_cycles_no_profile = test_cycles.drop(columns=profile_cols)

# 4) Save to CSV for your Dash app
# index=False keeps the original row label out of the exported file.
out_path = "../data/processed/hydraulic_cycle_test.csv"
test_cycles_no_profile.to_csv(out_path, index=False)

print(f"Saved {len(test_cycles_no_profile)} test cycles (no profile) to {out_path}")

Saved 1 test cycles (no profile) to ../data/processed/hydraulic_cycle_test.csv
