In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The dataset files read by the data-loading cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Initialization

In [None]:
import pandas as pd

# Load the FD002 training set, test set and reference RUL file.
# All three files are whitespace-separated and have no header row.
# `sep=r'\s+'` is the documented replacement for `delim_whitespace=True`,
# which pandas deprecated in version 2.2.
DATA_DIR = '/content/drive/MyDrive/turbofan dataset'
train_data = pd.read_csv(f'{DATA_DIR}/train_FD002.txt', sep=r'\s+', header=None)
test_data = pd.read_csv(f'{DATA_DIR}/test_FD002.txt', sep=r'\s+', header=None)
rul_data = pd.read_csv(f'{DATA_DIR}/RUL_FD002.txt', sep=r'\s+', header=None)

In [None]:
# Column labels, in file order, shared by the training and test frames.
column_names = [
    "engine_id",
    "time_in_cycles",
    "altitude",
    "mach_no",
    "throttle_angle",
    "fan_inlet_temp",
    "LPC_outlet_temp",
    "HPC_outlet_temp",
    "LPT_outlet_temp",
    "fan_inlet_pressure",
    "bypass_duct_pressure",
    "HPC_outlet_pressure",
    "fan_speed",
    "core_speed",
    "engine_pressure_ratio",
    "HPC_outlet_static_pressure",
    "fuel_ps30_ratio",
    "corrected_fan_speed",
    "corrected_core_speed",
    "bypass_ratio",
    "burner_fuel_air_ratio",
    "bleed_enthalpy",
    "demanded_fan_speed",
    "demanded_corrected_fan_speed",
    "HPT_coolant_bleed",
    "LPT_coolant_bleed",
]
# Both frames receive the same labels (train first, then test).
train_data.columns = test_data.columns = column_names

In [None]:
# Remaining useful life (RUL) for every training row: the engine's largest
# recorded cycle minus the row's own cycle, so the row at an engine's last
# recorded cycle gets RUL 0.
# The aggregation is named by string ('max') because passing the builtin `max`
# to groupby.transform is deprecated in recent pandas and emits a FutureWarning.
last_cycle_per_engine = train_data.groupby('engine_id')['time_in_cycles'].transform('max')
train_data['RUL'] = last_cycle_per_engine - train_data['time_in_cycles']

In [None]:
# pd.options.display.max_rows = 1000
# train_data['RUL'].value_counts()

# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Scaling and PCA

In [None]:
# Feature selection: every column except the identifiers and the targets.
# 'RUL_binned' is listed too: it does not exist yet on a top-to-bottom run,
# but the binning cell adds it to train_data, so re-running this cell later
# would otherwise pull the target into `features` and then fail on test_data,
# which has no such column. Index.difference simply ignores names that are
# not present.
non_feature_columns = ['engine_id', 'time_in_cycles', 'RUL', 'RUL_binned']
features = train_data.columns.difference(non_feature_columns)

# Standardize features: the scaler is fitted on the training set only and the
# same transformation is then applied to the test set.
scaler = StandardScaler()
train_data[features] = scaler.fit_transform(train_data[features])
test_data[features] = scaler.transform(test_data[features])

In [None]:
# Feature Engineering: PCA for dimensionality reduction.
# The projection is fitted on the (standardized) training features only and
# then applied unchanged to the test features.
# random_state is pinned because, depending on the scikit-learn version and
# the data shape, the default svd_solver='auto' can select the randomized SVD
# solver, which would let the components vary slightly between runs.
pca = PCA(n_components=10, random_state=42)
train_data_pca = pca.fit_transform(train_data[features])
test_data_pca = pca.transform(test_data[features])

# Binning

In [None]:
# Turn the continuous RUL target into ordinal class labels.
# strategy='uniform' requests equal-width bins; encode='ordinal' returns each
# sample's bin index as a single column.
num_bins = 20  # more bins -> finer-grained RUL classes
kbins = KBinsDiscretizer(
    n_bins=num_bins,
    encode='ordinal',
    strategy='uniform',
)
rul_column = train_data[['RUL']]
train_data['RUL_binned'] = kbins.fit_transform(rul_column)

# Class labels used as the classification target
y_train_binned = train_data['RUL_binned']

# Fit Gaussian Naive Bayes on the PCA features (fit() returns the estimator)
gnb = GaussianNB().fit(train_data_pca, y_train_binned)

# Naive Bayes prediction (last cycle per test engine)

In [None]:
# For every test engine, pick the row with the largest cycle number
# (idxmax yields that row's index label, one per engine_id).
last_cycle_indices = (
    test_data
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles = test_data.loc[last_cycle_indices, features]
X_test_last_cycles_pca = pca.transform(X_test_last_cycles)

# Naive Bayes predicts one bin label for each of those rows
y_pred_binned = gnb.predict(X_test_last_cycles_pca)

# Map each predicted bin label to the centre of its bin to obtain a
# continuous RUL estimate
bin_edges = kbins.bin_edges_[0]
lower_edges, upper_edges = bin_edges[:-1], bin_edges[1:]
bin_midpoints = (lower_edges + upper_edges) / 2
predicted_bin_index = y_pred_binned.astype(int)
y_pred_continuous = bin_midpoints[predicted_bin_index]

# Reference RUL values from the provided RUL file, as a flat 1-D array
true_rul = rul_data.to_numpy().flatten()

In [None]:
# Calculate evaluation metrics for the Naive Bayes predictions
# (bins = 20 and pca components = 10, matching the settings used above).
# This code was commented out while its printed output was kept, so a fresh
# top-to-bottom run never evaluated the Naive Bayes model; it is restored so
# the reported numbers are regenerated from the live predictions.
mse = mean_squared_error(true_rul, y_pred_continuous)
rmse = np.sqrt(mse)
r2 = r2_score(true_rul, y_pred_continuous)
mae = mean_absolute_error(true_rul, y_pred_continuous)

print(f'Naive Bayes - Root Mean Squared Error (RMSE): {rmse}')
print(f'Naive Bayes - R² Score: {r2}')
print(f'Naive Bayes - Mean Absolute Error (MAE): {mae}')

Naive Bayes - Root Mean Squared Error (RMSE): 34.55778870435918
Naive Bayes - R² Score: 0.5870763389902611
Naive Bayes - Mean Absolute Error (MAE): 25.410328185328183


In [None]:
# # Calculate evaluation metrics
# # # for bins = 20 and pca components = 15
# mse = mean_squared_error(true_rul, y_pred_continuous)
# rmse = np.sqrt(mse)
# r2 = r2_score(true_rul, y_pred_continuous)
# mae = mean_absolute_error(true_rul, y_pred_continuous)

# print(f'Naive Bayes - Root Mean Squared Error (RMSE): {rmse}')
# print(f'Naive Bayes - R² Score: {r2}')
# print(f'Naive Bayes - Mean Absolute Error (MAE): {mae}')

Naive Bayes - Root Mean Squared Error (RMSE): 35.03799795473017
Naive Bayes - R² Score: 0.5755207696163245
Naive Bayes - Mean Absolute Error (MAE): 25.518243243243244


# XGB

## XGBoost alone on the PCA features (no Naive Bayes)

In [None]:
import xgboost as xgb

# Features of each test engine's final recorded cycle, projected with the
# already-fitted PCA
last_cycle_indices = (
    test_data
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles = test_data.loc[last_cycle_indices, features]
X_test_last_cycles_pca = pca.transform(X_test_last_cycles)

# Gradient-boosted regressor trained on the PCA-transformed training features,
# with the continuous RUL as target
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
model_xgb.fit(train_data_pca, train_data['RUL'])

# RUL prediction for the last cycle of every test engine
y_pred_last_cycles_xgb = model_xgb.predict(X_test_last_cycles_pca)

# Reference RUL values from the provided RUL file
true_rul = rul_data.to_numpy().flatten()

# Evaluation metrics
mse_xgb = mean_squared_error(true_rul, y_pred_last_cycles_xgb)
r2_xgb = r2_score(true_rul, y_pred_last_cycles_xgb)
mae_xgb = mean_absolute_error(true_rul, y_pred_last_cycles_xgb)
rmse_xgb = np.sqrt(mse_xgb)

# NOTE(review): the printed label mentions Naive Bayes, but `gnb` is not used
# anywhere in this cell; the numbers come from XGBoost on PCA features only.
print(f'XGBoost with PCA and Naive Bayes - Root Mean Squared Error (RMSE): {rmse_xgb}')
print(f'XGBoost with PCA and Naive Bayes - R² Score: {r2_xgb}')
print(f'XGBoost with PCA and Naive Bayes - Mean Absolute Error (MAE): {mae_xgb}')

XGBoost with PCA and Naive Bayes - Root Mean Squared Error (RMSE): 30.43566411890036
XGBoost with PCA and Naive Bayes - R² Score: 0.6797099518822081
XGBoost with PCA and Naive Bayes - Mean Absolute Error (MAE): 22.49257443597878


## xgboost combined with naive bayes

In [None]:
import xgboost as xgb

# Rows holding the last recorded cycle of each test engine, as scaled
# features and as their PCA projection
last_cycle_indices = (
    test_data
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles = test_data.loc[last_cycle_indices, features]
X_test_last_cycles_pca = pca.transform(X_test_last_cycles)

# --- Naive Bayes branch: predicted bin label -> bin centre ---
y_pred_binned = gnb.predict(X_test_last_cycles_pca)
bin_edges = kbins.bin_edges_[0]
lower_edges, upper_edges = bin_edges[:-1], bin_edges[1:]
bin_midpoints = (lower_edges + upper_edges) / 2
predicted_bin_index = y_pred_binned.astype(int)
y_pred_continuous_nb = bin_midpoints[predicted_bin_index]

# --- XGBoost branch: trained on the scaled features directly (no PCA) ---
model_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
model_xgb.fit(train_data[features], train_data['RUL'])
y_pred_last_cycles_xgb = model_xgb.predict(X_test_last_cycles)

# Ensemble: plain average of the two RUL estimates
y_pred_combined = 0.5 * (y_pred_continuous_nb + y_pred_last_cycles_xgb)

# Reference RUL values from the provided RUL file
true_rul = rul_data.to_numpy().flatten()

# Evaluation metrics for the combined prediction
mse_combined = mean_squared_error(true_rul, y_pred_combined)
r2_combined = r2_score(true_rul, y_pred_combined)
mae_combined = mean_absolute_error(true_rul, y_pred_combined)
rmse_combined = np.sqrt(mse_combined)

print(f'Combined Model - Root Mean Squared Error (RMSE): {rmse_combined}')
print(f'Combined Model - R² Score: {r2_combined}')
print(f'Combined Model - Mean Absolute Error (MAE): {mae_combined}')

Combined Model - Root Mean Squared Error (RMSE): 29.891337829184224
Combined Model - R² Score: 0.6910639529951415
Combined Model - Mean Absolute Error (MAE): 21.454691449164425
