# Step 2 — Labeling & Preprocessing (FD001–FD004)
Prepare cleaned datasets for modeling:
- Add RUL labels to **training** rows (RUL = max_cycle - cycle, where max_cycle is the last observed cycle of that engine, so each engine's final row has RUL = 0)
- Drop uninformative sensors (based on Step-1 EDA)
- Scale numeric features (fit on train, apply to test)
- Save processed CSVs in `data/processed/`

> Tweak the configuration cell below to choose dataset, sensors to keep/drop, and scaling mode.

In [2]:

# --- Make src/ importable and set project paths ---
import sys, os
from pathlib import Path


def find_project_root(start: Path) -> Path:
    """Return the project root for a kernel started at ``start``.

    The root is the closest directory at or above ``start`` that contains the
    ``src/utils`` package directory imported by this notebook. When no such
    directory is found, fall back to ``start.parent``, i.e. the original
    "notebook lives in notebooks/" assumption.
    """
    for candidate in (start, *start.parents):
        if (candidate / "src" / "utils").is_dir():
            return candidate
    return start.parent


# Works whether the kernel was started in notebooks/ or in the project root.
root = find_project_root(Path.cwd())
if str(root) not in sys.path:
    sys.path.append(str(root))

DATA_RAW = root / "data" / "raw"
DATA_PROCESSED = root / "data" / "processed"
# Create the output directory up front so the save step cannot fail on a missing folder.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print("Project root:", root)
print("Raw dir     :", DATA_RAW)
print("Processed dir:", DATA_PROCESSED)


Project root: /Users/jpcourneya/Documents/Projects/predictive-maintenance-cmapss
Raw dir     : /Users/jpcourneya/Documents/Projects/predictive-maintenance-cmapss/data/raw
Processed dir: /Users/jpcourneya/Documents/Projects/predictive-maintenance-cmapss/data/processed


In [3]:

# --- Imports ---
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from src.utils.cmapss import (
    load_fd_raw, describe_fd, make_train_rul_labels, select_sensors
)


## Configuration

In [4]:

# ---- Dataset and sensor policy ----
# C-MAPSS subset to process. Start with FD001, then switch to another subset
# (e.g. "FD003" or "FD004") and re-run the notebook.
FD = "FD001"

# Sensor policy: fill in at most ONE of keep_sensors / drop_sensors and leave
# the other as None (setting both raises in the selection cell; leaving both
# as None keeps every sensor).
# The default below is a reasonable set of 'keepers' from literature; refine
# it from your EDA notes.
keep_sensors = [
    "s2", "s3", "s4", "s7",
    "s11", "s12", "s13", "s14",
]
drop_sensors = None  # e.g., ["s1","s5","s6","s10","s16","s18","s19"]

# Scaling mode: 'global' = one scaler fitted on all train rows.
# (Per-engine or per-condition scaling can be explored later if needed.)
SCALING_MODE = "global"
RANDOM_STATE = 42


## Load raw data

In [5]:

# Load the selected subset from the raw data directory, then expose the
# bundle's train / test / rul_test members under their own names.
fd = load_fd_raw(base_dir=str(DATA_RAW), fd=FD, verbose=True)
train_raw = fd.train
test_raw = fd.test
rul_test = fd.rul_test

# Quick look at the first training rows.
print("Train head:")
display(train_raw.head())


Dataset FD001: 1 condition(s), 1 fault mode(s) — HPC degradation
  Train shape: (20631, 26)  | engines: 100
  Test  shape: (13096, 26)  | engines: 100
  RUL_test length: 100
Train head:


Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


## Add RUL labels to training data

In [6]:

# Label the training frame (adds the RUL column checked below).
train_lab = make_train_rul_labels(train_raw)

# Sanity check: the row holding each engine's highest cycle must have RUL == 0.
last_idx = train_lab.groupby("engine_id")["cycle"].idxmax()
final_rul = train_lab.loc[last_idx, "RUL"]
assert final_rul.eq(0).all(), "RUL sanity check failed"

print("RUL label check passed. Train shape:", train_lab.shape)


RUL label check passed. Train shape: (20631, 27)


## Select sensors to keep/drop

In [7]:

# Sensor columns are named "s" + digits (s1..s21). Match that pattern exactly
# so any other column that merely starts with "s" is never taken for a sensor.
all_sensor_cols = [
    c for c in train_lab.columns if c.startswith("s") and c[1:].isdigit()
]
base_cols = ["engine_id","cycle"] + [c for c in train_lab.columns if c.startswith("op_setting_")]

if keep_sensors is not None and drop_sensors is not None:
    raise ValueError("Set either keep_sensors or drop_sensors, not both.")

# Configured names that do not exist in the data are still ignored (as before),
# but they are reported so a typo cannot silently shrink or grow the feature set.
requested_sensors = keep_sensors if keep_sensors is not None else (drop_sensors or [])
unknown_sensors = [s for s in requested_sensors if s not in all_sensor_cols]
if unknown_sensors:
    print("WARNING: ignoring sensors not present in the data:", unknown_sensors)

if keep_sensors is not None:
    # Preserve the order given in the configuration.
    sensors_final = [s for s in keep_sensors if s in all_sensor_cols]
elif drop_sensors is not None:
    # Build the lookup set once instead of once per column.
    dropped = set(drop_sensors)
    sensors_final = [s for s in all_sensor_cols if s not in dropped]
else:
    sensors_final = all_sensor_cols  # keep all sensors

print("Using sensors:", sensors_final)

# Train keeps the RUL target; the test frame has no RUL column to carry over.
train_sel = train_lab[base_cols + sensors_final + ["RUL"]].copy()
test_sel  = test_raw[base_cols + sensors_final].copy()

print("Selected train shape:", train_sel.shape, "| test shape:", test_sel.shape)


Using sensors: ['s2', 's3', 's4', 's7', 's11', 's12', 's13', 's14']
Selected train shape: (20631, 14) | test shape: (13096, 13)


## Scale features
Fit scaler **on training set only**, then transform both train/test.

By default we scale **operational settings and sensor columns** but not the identifiers (`engine_id`, `cycle`) or the `RUL` target.

In [8]:

# Only the 'global' mode is implemented in this cell. Fail loudly instead of
# silently applying global scaling when the configuration asks for another mode.
if SCALING_MODE != "global":
    raise NotImplementedError(
        f"SCALING_MODE={SCALING_MODE!r} is not implemented; only 'global' is supported."
    )

# Columns to scale (op settings + sensors); engine_id, cycle and RUL are left as-is.
op_cols = [c for c in train_sel.columns if c.startswith("op_setting_")]
feat_cols = op_cols + sensors_final

scaler = StandardScaler()
# Work on copies so the unscaled selections stay available for inspection.
train_scaled = train_sel.copy()
test_scaled  = test_sel.copy()

# Fit on train features only, so no test-set statistics leak into the scaler.
scaler.fit(train_sel[feat_cols])

# Apply the train-fitted statistics to both splits.
# NOTE: a column that is constant in train (zero variance) cannot reach unit
# variance; StandardScaler only centers it, so it comes out as all zeros.
train_scaled[feat_cols] = scaler.transform(train_sel[feat_cols])
test_scaled[feat_cols]  = scaler.transform(test_sel[feat_cols])

print("Scaled columns:", feat_cols[:8], "... ({} total)".format(len(feat_cols)))
print("Train scaled shape:", train_scaled.shape, "| Test scaled shape:", test_scaled.shape)


Scaled columns: ['op_setting_1', 'op_setting_2', 'op_setting_3', 's2', 's3', 's4', 's7', 's11'] ... (11 total)
Train scaled shape: (20631, 14) | Test scaled shape: (13096, 13)


## Save processed artifacts

In [9]:

# Output files are named after the split and the dataset id, e.g. train_FD001.csv.
train_out, test_out = (
    DATA_PROCESSED / f"{split}_{FD}.csv" for split in ("train", "test")
)

# Write each scaled frame to its destination without the pandas index column.
for frame, dest in ((train_scaled, train_out), (test_scaled, test_out)):
    frame.to_csv(dest, index=False)

print("Saved:", f" - {train_out}", f" - {test_out}", sep="\n")


Saved:
 - /Users/jpcourneya/Documents/Projects/predictive-maintenance-cmapss/data/processed/train_FD001.csv
 - /Users/jpcourneya/Documents/Projects/predictive-maintenance-cmapss/data/processed/test_FD001.csv


## (Optional) Quick post-scaling sanity checks

In [10]:

# Summarise the scaled training features: the first ten column means and
# standard deviations (pandas' default sample std).
scaled_feats = train_scaled[feat_cols]
train_means = scaled_feats.mean()
train_stds = scaled_feats.std()

print("Feature means (train, approx 0):")
display(train_means.head(10))
print("Feature stds (train, approx 1):")
display(train_stds.head(10))


Feature means (train, approx 0):


op_setting_1    1.377622e-18
op_setting_2   -2.720802e-17
op_setting_3    0.000000e+00
s2              6.410348e-14
s3             -4.959437e-14
s4              9.285169e-16
s7             -3.161641e-14
s11            -2.937089e-15
s12            -1.199702e-13
s13             3.493855e-13
dtype: float64

Feature stds (train, approx 1):


op_setting_1    1.000024
op_setting_2    1.000024
op_setting_3    0.000000
s2              1.000024
s3              1.000024
s4              1.000024
s7              1.000024
s11             1.000024
s12             1.000024
s13             1.000024
dtype: float64