# **IMPORTS**

---

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import os

In [31]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import sklearn

In [5]:
import sklearn, pandas, numpy, matplotlib, seaborn, xgboost

print(f"Scikit-learn: {sklearn.__version__}")
print(f"Pandas: {pandas.__version__}")
print(f"Numpy: {numpy.__version__}")
print(f"Matplotlib: {matplotlib.__version__}")
print(f"Seaborn: {seaborn.__version__}")
print(f"XGBoost: {xgboost.__version__}")

Scikit-learn: 1.5.1
Pandas: 2.2.2
Numpy: 1.26.4
Matplotlib: 3.9.2
Seaborn: 0.13.2
XGBoost: 3.0.2


```
⠀⠀⠀⠀⠀⠀⣀⣤⡤
⠀⠀⠀⠀⢀⣾⣿⠋
⠀⠀⠀⣠⣾⣿⡟
⠀⠀⢸⠛⠉⢹⠃⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢀⡠⠄⠠⣀
⠀⠀⡘⠀⠀⠀⡀⠀⠀⠀⠀⠀⠀⠀⠀⣠⠖⠉⠀⠀⠀⣾⣿⣦⡀
⠀⠀⡇⠀⠀⠀⢡⠄⠀⠀⣀⣀⣀⣠⠊⠀⠀⠀⠀⡠⠞⠛⠛⠛⠛⡀
⠀⠀⢃⠀⠀⠀⠀⠗⠚⠉⠉⠀⠈⠁⠀⠀⠀⢀⡔⠁⠀
⠀⠀⠸⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣴⣶⣄⠲⡎
⠀⠀⠀⠃⠀⠀⢠⣤⡀⠀⠀⠀⠀⣿⣿⣿⠀⠘⡄
⠀⠀⠀⡆⠀⠀⣿⣿⡇⠀⠀⠀⠀⠈⠛⠉⣴⣆⢹⡄
⠀⠀⠀⣇⢰⡧⣉⡉⠀⠀⢀⡀⠀⣀⣀⣠⣿⡷⢠⡇
⠀⠀⠀⢻⠘⠃⠈⠻⢦⠞⠋⠙⠺⠋⠉⠉⠉⢡⠟
⠀⠀⠀⠀⠳⢄⡀⠀⠀⠀⠀⠀⠀⠀⠀⠀⢠⠋⠀⠀
```

# **SETTINGS**
---

In [6]:
# Matplotlib inline to visualize Matplotlib graphs
%matplotlib inline

# Configuration to set so that all the Seaborn figures come out with this size
%config Inlinebackend.figure_format= 'retina'

In [7]:
# Set the Seaborn context to "poster" for larger text and figures
sns.set_context("poster")

# Set the default figure size for Seaborn plots
sns.set(rc={"figure.figsize": (12., 6.)})

# Set the Seaborn style to "whitegrid" for a white background with gridlines
sns.set_style("whitegrid")

In [8]:
# Set the max displayable columns to max
pd.set_option('display.max_columns', None)

In [9]:
# Activates XLA (for JIT compiler)
os.environ["TF_XLA_FLAGS"]= "--tf_xla_enable_xla_devices"

# Uses the right memory when using GPU
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]= "true"

# **DATA**

---

In [13]:
# data path for loading
fd001_train_path= "../data/02_processed/FD001_train.csv"
fd001_test_path=  "../data/02_processed/FD001_test.csv"
fd001_rul_path=   "../data/02_processed/FD001_RUL.csv"

In [14]:
# loading the train dataset
FD001_train= pd.read_csv(fd001_train_path)
FD001_train.head(3)

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_measure_1,sensor_measure_2,sensor_measure_3,sensor_measure_4,sensor_measure_5,sensor_measure_6,sensor_measure_7,sensor_measure_8,sensor_measure_9,sensor_measure_10,sensor_measure_11,sensor_measure_12,sensor_measure_13,sensor_measure_14,sensor_measure_15,sensor_measure_16,sensor_measure_17,sensor_measure_18,sensor_measure_19,sensor_measure_20,sensor_measure_21,max_cycle,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,192,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,192,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,192,189


In [15]:
# loading the test dataset
FD001_test= pd.read_csv(fd001_test_path)
FD001_test.head(3)

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_measure_1,sensor_measure_2,sensor_measure_3,sensor_measure_4,sensor_measure_5,sensor_measure_6,sensor_measure_7,sensor_measure_8,sensor_measure_9,sensor_measure_10,sensor_measure_11,sensor_measure_12,sensor_measure_13,sensor_measure_14,sensor_measure_15,sensor_measure_16,sensor_measure_17,sensor_measure_18,sensor_measure_19,sensor_measure_20,sensor_measure_21,max_cycle,RUL
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,21.61,553.9,2388.04,9050.17,1.3,47.2,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735,192,191
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,21.61,554.85,2388.01,9054.42,1.3,47.5,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916,192,190
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,21.61,554.11,2388.05,9056.96,1.3,47.5,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166,192,189


In [16]:
# loading the rul for validation
FD001_rul = pd.read_csv(fd001_rul_path)
FD001_rul.head(3)

Unnamed: 0,true_rul
0,112
1,98
2,69


# **PREPARING THE DATA**

---

This should be done at previous notebook, but since we want to take advantage of models such as XGBR for handling outliers, we will do it here for comprehension in the process.

## Column drops

We saw before that there are some values that we don't actually need for our model. Let's drop them.

In [17]:
drop_cols= ["op_setting_3", "sensor_measure_1", "sensor_measure_5", 
            "sensor_measure_10", "sensor_measure_16", "sensor_measure_18", 
            "sensor_measure_19"]

In [21]:
X= FD001_train.drop(["RUL", "max_cycle"] + drop_cols, axis=1)
y= FD001_train["RUL"]

In [39]:
X.head(1)

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,sensor_measure_2,sensor_measure_3,sensor_measure_4,sensor_measure_6,sensor_measure_7,sensor_measure_8,sensor_measure_9,sensor_measure_11,sensor_measure_12,sensor_measure_13,sensor_measure_14,sensor_measure_15,sensor_measure_17,sensor_measure_20,sensor_measure_21
0,1,1,-0.0007,-0.0004,641.82,1589.7,1400.6,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .2, random_state=69)

## Normalization

In [40]:
# Knowing the sensors we want to normalize, we'll create a mask to apply to the data.

# We create a mask to apply RobustScaler to the data, the rest will be normalized with MinMaxScaler.
sensors_to_standardize= ["sensor_measure_2", "sensor_measure_3", "sensor_measure_4", 
                         "sensor_measure_7", "sensor_measure_8", "sensor_measure_9", 
                         "sensor_measure_11", "sensor_measure_12", "sensor_measure_13"]

In [41]:
# Robust Scaler

op_settings_columns= ["op_setting_1", "op_setting_2"]

preprocessor= ColumnTransformer(
  [
    ('sensors', RobustScaler(), sensors_to_standardize),
    ('op_settings', MinMaxScaler(feature_range=(-1, 1)), op_settings_columns),
  ],
  remainder= "passthrough"
)

# **BASELINE MODELS**

---

In [50]:
# Function to evaluate models

def eval_model(name, y_train_true, y_train_pred, y_test_true, y_test_pred):
  # Training metrics
  mae_train=  mean_absolute_error(y_train_true, y_train_pred)
  rmse_train= np.sqrt(mean_squared_error(y_train_true, y_train_pred))
  r2_train=   r2_score(y_train_true, y_train_pred)

  # Test metrics
  mae_test=   mean_absolute_error(y_test_true, y_test_pred)
  rmse_test=  np.sqrt(mean_squared_error(y_test_true, y_test_pred))
  r2_test=    r2_score(y_test_true, y_test_pred)

  print(f"{name} metrics:")
  print(f"  Train - MAE: {mae_train:.3f}, RMSE: {rmse_train:.3f}, R²: {r2_train:.3f}")
  print(f"  Test  - MAE: {mae_test:.3f}, RMSE: {rmse_test:.3f}, R²: {r2_test:.3f}")

In [53]:
# Random Forest Regressor model
rfr_model= RandomForestRegressor(random_state=69)

# XGB Regressor model
xgbr_model= XGBRegressor(random_state=69)

In [None]:
# Random Forest Pipeline for normalizing data
rfr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [None]:
# XGB Regressor Pipeline for normalizing data
xgbr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

## Training

In [54]:
rfr_model.fit(X_train, y_train)
xgbr_model.fit(X_train, y_train)

In [48]:
rfr_pipeline.fit(X_train, y_train)
xgbr_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [58]:
# Predictions on train data without normalization
rfr_train_pred=  rfr_model.predict(X_train)
xgbr_train_pred= xgbr_model.predict(X_train)

# Predictions on train data with normalization
rfr_train_norm_pred=  rfr_pipeline.predict(X_train)
xgbr_train_norm_pred= xgbr_pipeline.predict(X_train)

In [59]:
# Predictions on test data without normalization
rfr_test_pred = rfr_model.predict(X_test)
xgbr_test_pred = xgbr_model.predict(X_test)

# Predictions on test data with normalization
rfr_test_norm_pred=  rfr_pipeline.predict(X_test)
xgbr_test_norm_pred= xgbr_pipeline.predict(X_test)

In [61]:
print("Without normalizing Data:")
eval_model("RandomForestRegressor", y_train, rfr_train_pred, y_test, rfr_test_pred)
eval_model("XGBRegressor", y_train, xgbr_train_pred, y_test, xgbr_test_pred)

print("")

print("Normalizing Data:")
eval_model("RandomForestRegressor", y_train, rfr_train_norm_pred, y_test, rfr_test_norm_pred)
eval_model("XGBRegressor", y_train, xgbr_train_norm_pred, y_test, xgbr_test_norm_pred)

Without normalizing Data:
RandomForestRegressor metrics:
  Train - MAE: 4.364, RMSE: 6.319, R²: 0.992
  Test  - MAE: 11.464, RMSE: 16.444, R²: 0.944
XGBRegressor metrics:
  Train - MAE: 4.413, RMSE: 5.965, R²: 0.992
  Test  - MAE: 7.640, RMSE: 10.623, R²: 0.977

Normalizing Data:
RandomForestRegressor metrics:
  Train - MAE: 4.322, RMSE: 6.254, R²: 0.992
  Test  - MAE: 11.411, RMSE: 16.391, R²: 0.945
XGBRegressor metrics:
  Train - MAE: 4.413, RMSE: 5.965, R²: 0.992
  Test  - MAE: 7.636, RMSE: 10.631, R²: 0.977


As a conclusion, we can see that normalizing the data does not improve the model's performance because we are using tree based models such as Random Forest Regressor and XGBoost Regressor. this normalization would be more useful for linear models, as we see in the next section.