In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import warnings
warnings.filterwarnings("ignore")

# DATASET INITIALIZATION

In [4]:
import pandas as pd

# Load training and test data
train_data = pd.read_csv('/kaggle/input/new-dataset/train_FD002.txt', delim_whitespace=True, header=None)
test_data = pd.read_csv('/kaggle/input/new-dataset/test_FD002.txt', delim_whitespace=True, header=None)
rul_data = pd.read_csv('/kaggle/input/new-dataset/RUL_FD002.txt', delim_whitespace=True, header=None)

In [5]:
print(train_data.shape)
print(test_data.shape)

(53759, 26)
(33991, 26)


In [6]:
train_data.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
dtype: int64

In [7]:
column_names = ["engine_id" , "time_in_cycles" , "altitude" , "mach_no" , "throttle_angle" , "fan_inlet_temp" , "LPC_outlet_temp" , "HPC_outlet_temp" , "LPT_outlet_temp" , "fan_inlet_pressure" , "bypass_duct_pressure" , "HPC_outlet_pressure" , "fan_speed" , "core_speed" , "engine_pressure_ratio" , "HPC_outlet_static_pressure" , "fuel_ps30_ratio" , "corrected_fan_speed" , "corrected_core_speed" , "bypass_ratio" , "burner_fuel_air_ratio" , "bleed_enthalpy" , "demanded_fan_speed" , "demanded_corrected_fan_speed" , "HPT_coolant_bleed" , "LPT_coolant_bleed"]
train_data.columns = column_names
test_data.columns = column_names

In [8]:
train_data['RUL'] = train_data.groupby('engine_id')['time_in_cycles'].transform(max) - train_data['time_in_cycles']

In [9]:
# matrix = train_data.corr()
# print(matrix)

# REMOVING OUTLIERS

In [10]:
from scipy import stats
import numpy as np

# Calculate Z-scores for sensor measurements and operational settings
z_scores = np.abs(stats.zscore(train_data.iloc[:, 2:-1]))

# Set a threshold for Z-score
threshold = 3

# Identify outliers
outliers = (z_scores > threshold).any(axis=1)

# Drop outliers
train_data_cleaned = train_data[~outliers]


In [11]:
# Calculate Z-scores for test data
z_scores_test = np.abs(stats.zscore(test_data.iloc[:, 2:]))

# Identify outliers in test data
outliers_test = (z_scores_test > threshold).any(axis=1)

# Drop outliers from test data
test_data_cleaned = test_data[~outliers_test]

# SCALING

In [13]:
from sklearn.preprocessing import StandardScaler

# Normalize the data
scaler = StandardScaler()
train_data_cleaned.iloc[:, 2:-1] = scaler.fit_transform(train_data_cleaned.iloc[:, 2:-1])
test_data_cleaned.iloc[:, 2:] = scaler.transform(test_data_cleaned.iloc[:, 2:])

# LINEAR REGRESSION

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Select features and target
features = train_data_cleaned.columns[2:-1]
X_train_lr = train_data_cleaned[features]
y_train_lr = train_data_cleaned['RUL']

# Train the Linear Regression model
model_lr = LinearRegression()
model_lr.fit(X_train_lr, y_train_lr)

# Extract the last cycle for each engine in the test set
last_cycle_indices_lr = test_data_cleaned.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_lr = test_data_cleaned.loc[last_cycle_indices_lr, features]

# Make predictions for the last cycles of each engine
y_pred_last_cycles_lr = model_lr.predict(X_test_last_cycles_lr)

# True RUL values from the provided RUL file
true_rul_lr = rul_data.values.flatten()

# Calculate RMSE and R² Score
mse_lr = mean_squared_error(true_rul_lr, y_pred_last_cycles_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(true_rul_lr, y_pred_last_cycles_lr)
mae_lr = mean_absolute_error(true_rul_lr, y_pred_last_cycles_lr)

print(f'Linear Regression - Root Mean Squared Error (RMSE): {rmse_lr}')
print(f'Linear Regression - R² Score: {r2_lr}')
print(f'Linear Regression - Mean Absolute Error (MAE): {mae_lr}')

Linear Regression - Root Mean Squared Error (RMSE): 33.9427252822187
Linear Regression - R² Score: 0.6016440598064444
Linear Regression - Mean Absolute Error (MAE): 27.547646570031702


In [15]:
X_test_last_cycles_lr.shape

(259, 24)

# XGBOOST

In [16]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Train the XGBoost model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
model_xgb.fit(X_train_lr, y_train_lr)

# Make predictions for the last cycles of each engine
y_pred_last_cycles_xgb = model_xgb.predict(X_test_last_cycles_lr)

# Calculate RMSE and R² Score
mse_xgb = mean_squared_error(true_rul_lr, y_pred_last_cycles_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(true_rul_lr, y_pred_last_cycles_xgb)
mae_xgb = mean_absolute_error(true_rul_lr, y_pred_last_cycles_xgb)

print(f'XGBoost - Root Mean Squared Error (RMSE): {rmse_xgb}')
print(f'XGBoost - R² Score: {r2_xgb}')
print(f'XGBoost - Mean Absolute Error (MAE): {mae_xgb}')

XGBoost - Root Mean Squared Error (RMSE): 30.766703389572502
XGBoost - R² Score: 0.6727046699529016
XGBoost - Mean Absolute Error (MAE): 23.045762931287978


# HYPERPARAMETER TUNING

In [17]:
# from sklearn.model_selection import GridSearchCV
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# import numpy as np

# # Hyperparameter tuning for XGBoost
# param_grid_xgb = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.8, 1.0]
# }

# grid_xgb = GridSearchCV(xgb.XGBRegressor(objective='reg:squarederror'), param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
# grid_xgb.fit(X_train_lr, y_train_lr)

# print("Best parameters for XGBoost:", grid_xgb.best_params_)
# print("Best RMSE for XGBoost (Cross-Validation):", np.sqrt(-grid_xgb.best_score_))

# # Train the XGBoost model with best parameters
# best_model_xgb = grid_xgb.best_estimator_

# # Predictions for training data
# y_pred_train_xgb = best_model_xgb.predict(X_train_lr)

# # Make predictions for the last cycles of each engine in test data
# y_pred_last_cycles_xgb = best_model_xgb.predict(X_test_last_cycles_lr)

# # True RUL values from the provided RUL file
# true_rul_lr = rul_data.values.flatten()

# # # Calculate RMSE, R² Score, and MAE for test data
# # mse_test_xgb = mean_squared_error(true_rul_lr, y_pred_last_cycles_xgb)
# # rmse_test_xgb = np.sqrt(mse_test_xgb)
# # r2_test_xgb = r2_score(true_rul_lr, y_pred_last_cycles_xgb)
# # mae_test_xgb = mean_absolute_error(true_rul_lr, y_pred_last_cycles_xgb)

# # Calculate RMSE, R² Score, and MAE for train data
# mse_train_xgb = mean_squared_error(y_train_lr, y_pred_train_xgb)
# rmse_train_xgb = np.sqrt(mse_train_xgb)
# r2_train_xgb = r2_score(y_train_lr, y_pred_train_xgb)
# mae_train_xgb = mean_absolute_error(y_train_lr, y_pred_train_xgb)

# # print(f'XGBoost - Root Mean Squared Error (RMSE) on Test Data: {rmse_test_xgb}')
# # print(f'XGBoost - R² Score on Test Data: {r2_test_xgb}')
# # print(f'XGBoost - Mean Absolute Error (MAE) on Test Data: {mae_test_xgb}')
# print()
# print(f'XGBoost - Root Mean Squared Error (RMSE) on Train Data: {rmse_train_xgb}')
# print(f'XGBoost - R² Score on Train Data: {r2_train_xgb}')
# print(f'XGBoost - Mean Absolute Error (MAE) on Train Data: {mae_train_xgb}')


Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.8}
Best RMSE for XGBoost (Cross-Validation): 44.02973152891171

XGBoost - Root Mean Squared Error (RMSE) on Train Data: 40.46136410097402
XGBoost - R² Score on Train Data: 0.657925370688508
XGBoost - Mean Absolute Error (MAE) on Train Data: 29.61741439976453


Implementing the best parameters

In [18]:
# import numpy as np
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# # Best parameters for XGBoost obtained from cross-validation
# best_params = {
#     'learning_rate': 0.1,
#     'max_depth': 7,
#     'n_estimators': 50,
#     'subsample': 0.8
# }

# # Train the XGBoost model with the best parameters
# model_xgb = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
# model_xgb.fit(X_train_lr, y_train_lr)

# # Predictions for training data
# y_pred_train_xgb = model_xgb.predict(X_train_lr)

# # Make predictions for the last cycles of each engine in test data
# y_pred_last_cycles_xgb = model_xgb.predict(X_test_last_cycles_lr)

# # True RUL values from the provided RUL file
# true_rul_lr = rul_data.values.flatten()

# # Calculate RMSE, R² Score, and MAE for test data
# mse_test_xgb = mean_squared_error(true_rul_lr, y_pred_last_cycles_xgb)
# rmse_test_xgb = np.sqrt(mse_test_xgb)
# r2_test_xgb = r2_score(true_rul_lr, y_pred_last_cycles_xgb)
# mae_test_xgb = mean_absolute_error(true_rul_lr, y_pred_last_cycles_xgb)

# # Calculate RMSE, R² Score, and MAE for train data
# mse_train_xgb = mean_squared_error(y_train_lr, y_pred_train_xgb)
# rmse_train_xgb = np.sqrt(mse_train_xgb)
# r2_train_xgb = r2_score(y_train_lr, y_pred_train_xgb)
# mae_train_xgb = mean_absolute_error(y_train_lr, y_pred_train_xgb)

# print(f'XGBoost - Root Mean Squared Error (RMSE) on Test Data: {rmse_test_xgb}')
# print(f'XGBoost - R² Score on Test Data: {r2_test_xgb}')
# print(f'XGBoost - Mean Absolute Error (MAE) on Test Data: {mae_test_xgb}')
# print()
# print(f'XGBoost - Root Mean Squared Error (RMSE) on Train Data: {rmse_train_xgb}')
# print(f'XGBoost - R² Score on Train Data: {r2_train_xgb}')
# print(f'XGBoost - Mean Absolute Error (MAE) on Train Data: {mae_train_xgb}')

# print()

# # Best RMSE from cross-validation
# best_rmse_cv = 44.02973152891171
# print(f'Best RMSE for XGBoost (Cross-Validation): {best_rmse_cv}')


XGBoost - Root Mean Squared Error (RMSE) on Test Data: 30.214729064790482
XGBoost - R² Score on Test Data: 0.6843430993934015
XGBoost - Mean Absolute Error (MAE) on Test Data: 22.28316208754727

XGBoost - Root Mean Squared Error (RMSE) on Train Data: 40.46136410097402
XGBoost - R² Score on Train Data: 0.657925370688508
XGBoost - Mean Absolute Error (MAE) on Train Data: 29.61741439976453

Best RMSE for XGBoost (Cross-Validation): 44.02973152891171


# BFE with XGBoost

In [19]:
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import xgboost as xgb

In [20]:
# Function to calculate model performance
def calculate_performance(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse= np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    return rmse, mae, r2

In [21]:
# Backward Feature Elimination for XGBoost
def backward_feature_elimination_xgb(X_train, y_train, X_test, y_test):
    features = X_train.columns.tolist()
    best_features = features[:]
    best_rmse = float('inf')
    best_mae = float('inf')
    best_r2 = float('-inf')
    
    while len(features) > 1:
        scores = {}
        for feature in features:
            remaining_features = [f for f in features if f != feature]
            X_train_subset = X_train[remaining_features]
            X_test_subset = X_test[remaining_features]
            
            model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
            rmse, mae, r2 = calculate_performance(model, X_train_subset, y_train, X_test_subset, y_test)
            scores[feature] = (rmse, mae, r2)
        
        # Find the worst feature to remove
        worst_feature = max(scores, key=lambda k: scores[k][0])
        worst_rmse, worst_mae, worst_r2 = scores[worst_feature]
        features.remove(worst_feature)
        
        if worst_rmse < best_rmse:
            best_rmse = worst_rmse
            best_mae = worst_mae
            best_r2 = worst_r2
            best_features = features[:]
        else:
            break
    
    return best_features, best_rmse, best_mae, best_r2

In [22]:
# Prepare the data
features = train_data_cleaned.columns[2:-1]
X_train = train_data_cleaned[features]
y_train = train_data_cleaned['RUL']
X_test = test_data_cleaned.loc[last_cycle_indices_lr, features]
y_test = rul_data.values.flatten()

# Perform BFE with XGBoost
selected_features, best_rmse, best_mae, best_r2 = backward_feature_elimination_xgb(X_train, y_train, X_test, y_test)
print(f'XGBoost - Selected features: {selected_features}')
print(f'XGBoost - Best RMSE: {best_rmse}')
print(f'XGBoost - Best MAE: {best_mae}')
print(f'XGBoost - Best R²: {best_r2}')

XGBoost - Selected features: ['altitude', 'mach_no', 'throttle_angle', 'fan_inlet_temp', 'LPC_outlet_temp', 'HPC_outlet_temp', 'LPT_outlet_temp', 'fan_inlet_pressure', 'bypass_duct_pressure', 'HPC_outlet_pressure', 'fan_speed', 'core_speed', 'engine_pressure_ratio', 'HPC_outlet_static_pressure', 'fuel_ps30_ratio', 'corrected_fan_speed', 'corrected_core_speed', 'burner_fuel_air_ratio', 'bleed_enthalpy', 'demanded_fan_speed', 'demanded_corrected_fan_speed', 'HPT_coolant_bleed', 'LPT_coolant_bleed']
XGBoost - Best RMSE: 31.76500731701249
XGBoost - Best MAE: 24.06389452522786
XGBoost - Best R²: 0.6511202208322515


In [26]:
train_data_cleaned.shape

(53759, 27)

In [25]:
print(len(selected_features))

23
