In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
## Import neccesary modules
import pandas as pd
import numpy as np
from sklearn.preprocessing  import StandardScaler

In [4]:
## Load the Dataset
df = pd.read_csv('/content/drive/MyDrive/PrognosAI/milestone_1/Day_1/cmapss_cleaned_train_FD001.csv')
df.head()

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


# Aggregate Features – Mean, Std, Min, Max per Engine

In [5]:
# Sensor columns
sensor_cols = [col for col in df.columns if 'sensor_' in col]

In [6]:
# Engine-wise aggregate feature (static for each engine)
engine_aggs = df.groupby('engine_id')[sensor_cols].agg(['mean', 'std', 'min', 'max'])
engine_aggs.columns = ['_'.join(col) for col in engine_aggs.columns]
engine_aggs.reset_index(inplace=True)
print("Aggregate feature matrix shape: {engine_aggs.shape}")
engine_aggs.head()

Aggregate feature matrix shape: {engine_aggs.shape}


Unnamed: 0,engine_id,sensor_1_mean,sensor_1_std,sensor_1_min,sensor_1_max,sensor_2_mean,sensor_2_std,sensor_2_min,sensor_2_max,sensor_3_mean,...,sensor_19_min,sensor_19_max,sensor_20_mean,sensor_20_std,sensor_20_min,sensor_20_max,sensor_21_mean,sensor_21_std,sensor_21_min,sensor_21_max
0,1,518.67,0.0,518.67,518.67,642.621042,0.486795,641.71,644.21,1589.485521,...,100.0,100.0,38.840052,0.166998,38.34,39.18,23.30631,0.105101,22.9588,23.4999
1,2,518.67,0.0,518.67,518.67,642.435226,0.560214,641.27,643.94,1588.181986,...,100.0,100.0,38.90115,0.199332,38.23,39.24,23.338997,0.114796,22.9721,23.6005
2,3,518.67,0.0,518.67,518.67,642.543743,0.47419,641.48,643.93,1588.715084,...,100.0,100.0,38.882793,0.166112,38.37,39.23,23.321931,0.104734,22.9562,23.5181
3,4,518.67,0.0,518.67,518.67,642.662381,0.441446,641.81,644.53,1590.56328,...,100.0,100.0,38.830265,0.159302,38.29,39.21,23.294502,0.093804,23.0135,23.5074
4,5,518.67,0.0,518.67,518.67,642.45197,0.551542,641.3,644.02,1588.153271,...,100.0,100.0,38.891078,0.187133,38.34,39.29,23.336284,0.112705,23.0195,23.5503


# Rolling Statistics and Trends

In [7]:
# Add rolling means (window=5 cycles) and rolling std for each sensor per engine
for col in sensor_cols:
  df[f"{col}_rollmean5"] = df.groupby('engine_id')[col].rolling(window=5, min_periods=1).mean().reset_index(level=0,drop=True)
  df[f"{col}_rollstd5"] = df.groupby('engine_id')[col].rolling(window=5, min_periods=1).std().reset_index(level=0,drop=True)

# Prepare the list of columns to display
cols_to_show = sensor_cols + [f"{col}_rollmean5" for col in  sensor_cols] + [f"{col}_rollstd5" for col in sensor_cols]

# Display the first 10 rows of the selected columns
df[cols_to_show].head(10)

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,...,sensor_12_rollstd5,sensor_13_rollstd5,sensor_14_rollstd5,sensor_15_rollstd5,sensor_16_rollstd5,sensor_17_rollstd5,sensor_18_rollstd5,sensor_19_rollstd5,sensor_20_rollstd5,sensor_21_rollstd5
0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,...,,,,,,,,,,
1,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,...,0.438406,0.035355,5.041671,0.008697,0.0,0.0,0.0,0.0,0.042426,0.003253
2,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,...,0.404475,0.026458,3.71745,0.00764,0.0,1.154701,0.0,0.0,0.055076,0.044573
3,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,...,0.49595,0.029439,3.050906,0.028117,0.0,1.0,0.0,0.0,0.076322,0.037977
4,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,...,0.432574,0.025884,2.651326,0.025953,0.0,1.095445,0.0,0.0,0.073621,0.033498
5,518.67,642.1,1584.47,1398.37,14.62,21.61,554.67,2388.02,9049.68,1.3,...,0.425417,0.023452,0.958697,0.025727,0.0,1.140175,0.0,0.0,0.051186,0.031436
6,518.67,642.48,1592.32,1397.77,14.62,21.61,554.34,2388.02,9059.13,1.3,...,0.425652,0.021679,0.643141,0.023476,0.0,1.140175,0.0,0.0,0.086718,0.021634
7,518.67,642.56,1582.96,1400.97,14.62,21.61,553.85,2388.0,9040.8,1.3,...,0.429919,0.021679,1.149274,0.022477,0.0,0.83666,0.0,0.0,0.086487,0.034405
8,518.67,642.12,1590.98,1394.8,14.62,21.61,553.69,2388.05,9046.46,1.3,...,0.341101,0.008944,3.205438,0.02074,0.0,0.83666,0.0,0.0,0.077136,0.038939
9,518.67,641.71,1591.24,1400.46,14.62,21.61,553.59,2388.05,9051.7,1.3,...,0.35826,0.014142,2.883881,0.020493,0.0,0.83666,0.0,0.0,0.062849,0.058103


In [8]:
## Normalize all sensor columns and rolling feature columns (per sensor globally for simplicity)
features_to_scale = [col for col in df.columns if ('sensor_' in col) or ('roll' in col)]

## StandardScaler (mean=0, std=1)
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[features_to_scale] = scaler.fit_transform(df_scaled[features_to_scale])
df_scaled.head()

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_17_rollmean5,sensor_17_rollstd5,sensor_18_rollmean5,sensor_18_rollstd5,sensor_19_rollmean5,sensor_19_rollstd5,sensor_20_rollmean5,sensor_20_rollstd5,sensor_21_rollmean5,sensor_21_rollstd5
0,1,1,-0.0007,-0.0004,100.0,0.0,-1.721725,-0.134255,-0.925936,-1.776357e-15,...,-0.926028,,0.0,,0.0,,1.564172,,1.379118,
1,1,2,0.0019,-0.0003,100.0,0.0,-1.06178,0.211528,-0.643726,-1.776357e-15,...,-0.926028,-2.638069,0.0,0.0,0.0,0.0,1.367661,-1.534785,1.404213,-2.630752
2,1,3,-0.0043,0.0003,100.0,0.0,-0.661813,-0.413166,-0.525953,-1.776357e-15,...,-1.453702,0.786588,0.0,0.0,0.0,0.0,1.192984,-1.166192,1.123794,-0.599129
3,1,4,0.0007,0.0,100.0,0.0,-0.661813,-1.261314,-0.784831,-1.776357e-15,...,-1.321784,0.327771,0.0,0.0,0.0,0.0,0.991014,-0.547098,1.0646,-0.923458
4,1,5,-0.0019,-0.0002,100.0,0.0,-0.621816,-1.251528,-0.301518,-1.776357e-15,...,-1.08433,0.610846,0.0,0.0,0.0,0.0,0.896034,-0.625804,1.095643,-1.143645


In [9]:
# Confirma scaled feature distribution
df_scaled[features_to_scale].describe().T[['mean', 'std']]

Unnamed: 0,mean,std
sensor_1,0.000000e+00,0.000000
sensor_2,6.410348e-14,1.000024
sensor_3,-4.959437e-14,1.000024
sensor_4,9.285169e-16,1.000024
sensor_5,-1.776357e-15,0.000000
...,...,...
sensor_19_rollstd5,0.000000e+00,0.000000
sensor_20_rollmean5,-2.134211e-14,1.000024
sensor_20_rollstd5,1.958829e-16,1.000024
sensor_21_rollmean5,-2.148263e-14,1.000024


# Feature Matrix Construction & Validation

In [10]:
# Select all numeric columns except engine_id and cycle as feature matrix
exclude_cols = ['engine_id', 'cycle']
feature_cols = [col for col in df_scaled.columns if col not in exclude_cols]

# check for missing values
print("Missing values per feature column:")
print(df_scaled[feature_cols].isnull().sum())

Missing values per feature column:
op_setting_1             0
op_setting_2             0
op_setting_3             0
sensor_1                 0
sensor_2                 0
                      ... 
sensor_19_rollstd5     100
sensor_20_rollmean5      0
sensor_20_rollstd5     100
sensor_21_rollmean5      0
sensor_21_rollstd5     100
Length: 66, dtype: int64


In [11]:
# Final feature matrix
X = df_scaled[feature_cols]
print(f"Final feature matrix shape: {X.shape}")
X.head()

Final feature matrix shape: (20631, 66)


Unnamed: 0,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,...,sensor_17_rollmean5,sensor_17_rollstd5,sensor_18_rollmean5,sensor_18_rollstd5,sensor_19_rollmean5,sensor_19_rollstd5,sensor_20_rollmean5,sensor_20_rollstd5,sensor_21_rollmean5,sensor_21_rollstd5
0,-0.0007,-0.0004,100.0,0.0,-1.721725,-0.134255,-0.925936,-1.776357e-15,0.141683,1.121141,...,-0.926028,,0.0,,0.0,,1.564172,,1.379118,
1,0.0019,-0.0003,100.0,0.0,-1.06178,0.211528,-0.643726,-1.776357e-15,0.141683,0.43193,...,-0.926028,-2.638069,0.0,0.0,0.0,0.0,1.367661,-1.534785,1.404213,-2.630752
2,-0.0043,0.0003,100.0,0.0,-0.661813,-0.413166,-0.525953,-1.776357e-15,0.141683,1.008155,...,-1.453702,0.786588,0.0,0.0,0.0,0.0,1.192984,-1.166192,1.123794,-0.599129
3,0.0007,0.0,100.0,0.0,-0.661813,-1.261314,-0.784831,-1.776357e-15,0.141683,1.222827,...,-1.321784,0.327771,0.0,0.0,0.0,0.0,0.991014,-0.547098,1.0646,-0.923458
4,-0.0019,-0.0002,100.0,0.0,-0.621816,-1.251528,-0.301518,-1.776357e-15,0.141683,0.714393,...,-1.08433,0.610846,0.0,0.0,0.0,0.0,0.896034,-0.625804,1.095643,-1.143645


#### Feature Engineering Summary

- Features Created:
  - Aggregate statistics per engine: mean, std, min, max for each sensor (static features)
  - Rolling-window features (window=5): rolling mean and std trend for each sensor per cycle/engine (dynamic features)
  - All raw, aggregate, and rolling features scaled using StandardScaler (zero mean, unit variance)

- Validation:
  - Verified presence of missing values in the final feature set.
  - Final feature matrix contains [INSERT VALUE: X.shape] engineered features per cycle.