In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be
# read by the cells below (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# DATASET INITIALIZATION

In [None]:
import pandas as pd
import numpy as np

# Folder on the mounted Drive that holds the FD002 text files.
DATA_DIR = '/content/drive/MyDrive/turbofan dataset'


def load_whitespace_table(file_name, data_dir=DATA_DIR):
    """Read a headerless, whitespace-delimited text file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``data_dir``.
    data_dir : str, optional
        Directory containing the file; defaults to ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        One column per whitespace-separated field, with integer column labels.
    """
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and splits on any run of whitespace.
    return pd.read_csv(f'{data_dir}/{file_name}', sep=r'\s+', header=None)


# Load training data, test data and the RUL file that accompanies the test set
train_data = load_whitespace_table('train_FD002.txt')
test_data = load_whitespace_table('test_FD002.txt')
rul_data = load_whitespace_table('RUL_FD002.txt')

In [None]:
# Labels for the 26 columns of the headerless train/test files, in file order.
column_names = [
    "engine_id",
    "time_in_cycles",
    "altitude",
    "mach_no",
    "throttle_angle",
    "fan_inlet_temp",
    "LPC_outlet_temp",
    "HPC_outlet_temp",
    "LPT_outlet_temp",
    "fan_inlet_pressure",
    "bypass_duct_pressure",
    "HPC_outlet_pressure",
    "fan_speed",
    "core_speed",
    "engine_pressure_ratio",
    "HPC_outlet_static_pressure",
    "fuel_ps30_ratio",
    "corrected_fan_speed",
    "corrected_core_speed",
    "bypass_ratio",
    "burner_fuel_air_ratio",
    "bleed_enthalpy",
    "demanded_fan_speed",
    "demanded_corrected_fan_speed",
    "HPT_coolant_bleed",
    "LPT_coolant_bleed",
]

# Both frames share the same layout, so they receive the same labels.
train_data.columns = column_names
test_data.columns = column_names

In [None]:
# Remaining useful life (RUL) for every training row: the largest
# time_in_cycles recorded for that engine_id minus the row's own time_in_cycles,
# so the last recorded row of each engine gets RUL == 0.
# The aggregation is named with the string 'max' because passing the Python
# builtin `max` to transform() is deprecated in recent pandas releases.
last_cycle_per_engine = train_data.groupby('engine_id')['time_in_cycles'].transform('max')
train_data['RUL'] = last_cycle_per_engine - train_data['time_in_cycles']

In [None]:
# Summary statistics of the training frame, one row per column.
train_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
engine_id,53759.0,131.082981,74.463862,1.0,68.0,131.0,195.0,260.0
time_in_cycles,53759.0,109.154746,69.180569,1.0,52.0,104.0,157.0,378.0
altitude,53759.0,23.998407,14.747376,0.0,10.0046,25.0013,41.998,42.008
mach_no,53759.0,0.572056,0.310016,0.0,0.2507,0.7,0.84,0.842
throttle_angle,53759.0,94.04602,14.237735,60.0,100.0,100.0,100.0,100.0
fan_inlet_temp,53759.0,472.910207,26.389707,445.0,445.0,462.54,491.19,518.67
LPC_outlet_temp,53759.0,579.672399,37.289399,535.53,549.57,555.98,607.34,644.52
HPC_outlet_temp,53759.0,1419.971013,105.946341,1243.73,1352.76,1369.18,1499.37,1612.88
LPT_outlet_temp,53759.0,1205.442024,119.123428,1023.77,1123.655,1138.89,1306.85,1439.23
fan_inlet_pressure,53759.0,8.031986,3.613839,3.91,3.91,7.05,10.52,14.62


# Feature selection: variance threshold (variance > 0.1)

In [None]:
# Display the column labels of the training frame.
train_data.columns

Index(['engine_id', 'time_in_cycles', 'altitude', 'mach_no', 'throttle_angle',
       'fan_inlet_temp', 'LPC_outlet_temp', 'HPC_outlet_temp',
       'LPT_outlet_temp', 'fan_inlet_pressure', 'bypass_duct_pressure',
       'HPC_outlet_pressure', 'fan_speed', 'core_speed',
       'engine_pressure_ratio', 'HPC_outlet_static_pressure',
       'fuel_ps30_ratio', 'corrected_fan_speed', 'corrected_core_speed',
       'bypass_ratio', 'burner_fuel_air_ratio', 'bleed_enthalpy',
       'demanded_fan_speed', 'demanded_corrected_fan_speed',
       'HPT_coolant_bleed', 'LPT_coolant_bleed', 'RUL'],
      dtype='object')

In [None]:
# Display (number of rows, number of columns) of the training frame.
train_data.shape

(53759, 27)

In [None]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

# Absolute cut-off applied to the variance of the raw (unscaled) columns.
# Note: this is a variance value, not a percentage.
VARIANCE_THRESHOLD = 0.1

# Columns that must always be kept, whatever their variance: later cells slice
# the reduced frames positionally (columns[2:-1]) and call .drop('RUL'), so the
# two identifier columns and the target have to be present.
PROTECTED_COLUMNS = ['engine_id', 'time_in_cycles', 'RUL']

# Calculate variance for each column of the training frame
variances = train_data.var()

# Keep a column when its variance exceeds the cut-off or when it is protected.
# Boolean masking preserves the original column order of train_data.
keep_mask = (variances > VARIANCE_THRESHOLD) | variances.index.isin(PROTECTED_COLUMNS)
selected_columns = variances[keep_mask].index

print(selected_columns)

Index(['engine_id', 'time_in_cycles', 'altitude', 'throttle_angle',
       'fan_inlet_temp', 'LPC_outlet_temp', 'HPC_outlet_temp',
       'LPT_outlet_temp', 'fan_inlet_pressure', 'bypass_duct_pressure',
       'HPC_outlet_pressure', 'fan_speed', 'core_speed',
       'HPC_outlet_static_pressure', 'fuel_ps30_ratio', 'corrected_fan_speed',
       'corrected_core_speed', 'bypass_ratio', 'bleed_enthalpy',
       'demanded_fan_speed', 'demanded_corrected_fan_speed',
       'HPT_coolant_bleed', 'LPT_coolant_bleed', 'RUL'],
      dtype='object')


In [None]:
# Display how many columns passed the variance filter.
selected_columns.shape

(24,)

In [None]:
# Keep only the selected columns of the training frame. The explicit .copy()
# makes the result an independent DataFrame, so the column assignment below
# (and later in-place edits) no longer trigger SettingWithCopyWarning.
train_data_reduced = train_data[selected_columns].copy()

# The test frame has no RUL column, so drop that label before selecting.
selected_columns1 = selected_columns.drop('RUL')

test_data_reduced = test_data[selected_columns1].copy()

# (Re)assign the RUL column on the reduced training frame
train_data_reduced['RUL'] = train_data['RUL']

# Describe the filtered DataFrame
train_data_reduced.describe().T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_reduced['RUL'] = train_data['RUL']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
engine_id,53759.0,131.082981,74.463862,1.0,68.0,131.0,195.0,260.0
time_in_cycles,53759.0,109.154746,69.180569,1.0,52.0,104.0,157.0,378.0
altitude,53759.0,23.998407,14.747376,0.0,10.0046,25.0013,41.998,42.008
throttle_angle,53759.0,94.04602,14.237735,60.0,100.0,100.0,100.0,100.0
fan_inlet_temp,53759.0,472.910207,26.389707,445.0,445.0,462.54,491.19,518.67
LPC_outlet_temp,53759.0,579.672399,37.289399,535.53,549.57,555.98,607.34,644.52
HPC_outlet_temp,53759.0,1419.971013,105.946341,1243.73,1352.76,1369.18,1499.37,1612.88
LPT_outlet_temp,53759.0,1205.442024,119.123428,1023.77,1123.655,1138.89,1306.85,1439.23
fan_inlet_pressure,53759.0,8.031986,3.613839,3.91,3.91,7.05,10.52,14.62
bypass_duct_pressure,53759.0,11.600746,5.431802,5.71,5.72,9.03,15.49,21.61


In [None]:
# Display (rows, columns) of the reduced training frame.
train_data_reduced.shape

(53759, 24)

In [None]:
# Display the reduced test frame (pandas truncates the rendering of long frames).
test_data_reduced

Unnamed: 0,engine_id,time_in_cycles,altitude,throttle_angle,fan_inlet_temp,LPC_outlet_temp,HPC_outlet_temp,LPT_outlet_temp,fan_inlet_pressure,bypass_duct_pressure,...,HPC_outlet_static_pressure,fuel_ps30_ratio,corrected_fan_speed,corrected_core_speed,bypass_ratio,bleed_enthalpy,demanded_fan_speed,demanded_corrected_fan_speed,HPT_coolant_bleed,LPT_coolant_bleed
0,1,1,9.9987,100.0,489.05,605.03,1497.17,1304.99,10.52,15.49,...,45.61,371.69,2388.18,8114.10,8.6476,369,2319,100.00,28.42,17.1551
1,1,2,20.0026,100.0,491.19,607.82,1481.20,1246.11,9.35,13.66,...,44.26,315.32,2388.12,8053.06,9.2405,364,2324,100.00,24.29,14.8039
2,1,3,35.0045,100.0,449.44,556.00,1359.08,1128.36,5.48,8.00,...,41.80,183.04,2387.75,8053.04,9.3472,333,2223,100.00,14.98,8.9125
3,1,4,42.0066,100.0,445.00,550.17,1349.69,1127.89,3.91,5.71,...,42.21,130.40,2387.72,8066.90,9.3961,332,2212,100.00,10.35,6.4181
4,1,5,24.9985,60.0,462.54,536.72,1253.18,1050.69,7.05,9.03,...,36.76,164.56,2028.05,7865.66,10.8682,305,1915,84.93,14.31,8.5740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33986,259,119,35.0015,100.0,449.44,555.56,1366.01,1129.47,5.48,8.00,...,41.96,183.05,2388.39,8088.36,9.3215,334,2223,100.00,14.94,8.9065
33987,259,120,42.0066,100.0,445.00,549.42,1351.13,1123.86,3.91,5.72,...,42.00,130.74,2388.31,8108.48,9.3542,332,2212,100.00,10.57,6.4075
33988,259,121,42.0061,100.0,445.00,549.65,1349.14,1118.91,3.91,5.72,...,42.15,130.96,2388.34,8098.77,9.3836,331,2212,100.00,10.57,6.4805
33989,259,122,0.0024,100.0,518.67,642.58,1589.61,1408.16,14.62,21.61,...,47.47,522.25,2388.00,8161.85,8.4279,393,2388,100.00,39.08,23.3589


# SCALING

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature columns: everything after the first two columns and before the last
# column (RUL) of the reduced training frame. The same labels are used for the
# test frame, so a mismatch between the two frames fails loudly with a KeyError
# instead of silently scaling the wrong columns.
feature_columns = train_data_reduced.columns[2:-1]

# Cast the feature columns to float before writing scaled values into them.
# Columns parsed as integers cannot hold the standardized values, and relying
# on an implicit upcast during assignment is deprecated in recent pandas.
# astype() also returns new, independent frames.
float_dtypes = {name: 'float64' for name in feature_columns}
train_data_reduced = train_data_reduced.astype(float_dtypes)
test_data_reduced = test_data_reduced.astype(float_dtypes)

# Normalize the data: fit on the training features only, then apply the same
# transformation to the test features.
scaler = StandardScaler()
train_data_reduced[feature_columns] = scaler.fit_transform(train_data_reduced[feature_columns])
test_data_reduced[feature_columns] = scaler.transform(test_data_reduced[feature_columns])

# LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# --- Training set ------------------------------------------------------------
# Predictors are all columns except the first two and the last one; the target
# is the 'RUL' column.
features = train_data_reduced.columns[2:-1]
X_train_lr = train_data_reduced.loc[:, features]
y_train_lr = train_data_reduced.loc[:, 'RUL']

# Fit an ordinary least-squares model (fit() returns the fitted estimator).
model_lr = LinearRegression().fit(X_train_lr, y_train_lr)

# --- Test set ----------------------------------------------------------------
# Reference RUL values from the RUL file, flattened to a 1-D array.
true_rul_lr = rul_data.to_numpy().flatten()

# For every engine_id, locate the row holding its largest time_in_cycles and
# predict from that row only.
last_cycle_indices_lr = (
    test_data_reduced
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles_lr = test_data_reduced.loc[last_cycle_indices_lr, features]
y_pred_last_cycles_lr = model_lr.predict(X_test_last_cycles_lr)

# --- Metrics -----------------------------------------------------------------
mse_lr = mean_squared_error(y_true=true_rul_lr, y_pred=y_pred_last_cycles_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_true=true_rul_lr, y_pred=y_pred_last_cycles_lr)
mae_lr = mean_absolute_error(y_true=true_rul_lr, y_pred=y_pred_last_cycles_lr)

print(
    f'Linear Regression - Root Mean Squared Error (RMSE): {rmse_lr}',
    f'Linear Regression - R² Score: {r2_lr}',
    f'Linear Regression - Mean Absolute Error (MAE): {mae_lr}',
    sep='\n',
)

Linear Regression - Root Mean Squared Error (RMSE): 33.95305408085314
Linear Regression - R² Score: 0.6014015828523676
Linear Regression - Mean Absolute Error (MAE): 27.552450864534936


# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# --- Training set ------------------------------------------------------------
# Predictors are all columns except the first two and the last one; the target
# is the 'RUL' column.
features = train_data_reduced.columns[2:-1]
X_train_rf = train_data_reduced.loc[:, features]
y_train_rf = train_data_reduced.loc[:, 'RUL']

# Fit a 100-tree forest with a fixed seed (fit() returns the fitted estimator).
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_rf, y_train_rf)

# --- Test set ----------------------------------------------------------------
# Reference RUL values from the RUL file, flattened to a 1-D array.
true_rul_rf = rul_data.to_numpy().flatten()

# For every engine_id, locate the row holding its largest time_in_cycles and
# predict from that row only.
last_cycle_indices_rf = (
    test_data_reduced
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles_rf = test_data_reduced.loc[last_cycle_indices_rf, features]
y_pred_last_cycles_rf = rf_model.predict(X_test_last_cycles_rf)

# --- Metrics -----------------------------------------------------------------
mse_rf = mean_squared_error(y_true=true_rul_rf, y_pred=y_pred_last_cycles_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_true=true_rul_rf, y_pred=y_pred_last_cycles_rf)
mae_rf = mean_absolute_error(y_true=true_rul_rf, y_pred=y_pred_last_cycles_rf)

print(
    f'Random Forest - Root Mean Squared Error (RMSE): {rmse_rf}',
    f'Random Forest - R² Score: {r2_rf}',
    f'Random Forest - Mean Absolute Error (MAE): {mae_rf}',
    sep='\n',
)

Random Forest - Root Mean Squared Error (RMSE): 85.65303481923553
Random Forest - R² Score: -1.5366665959475734
Random Forest - Mean Absolute Error (MAE): 68.08270270270269


# Support Vector Regressor (SVR) model

In [None]:
from sklearn.svm import SVR

# Predictors are all columns except the first two and the last one; the target
# is the 'RUL' column.
features = train_data_reduced.columns[2:-1]
X_train = train_data_reduced.loc[:, features]
y_train = train_data_reduced.loc[:, 'RUL']

# RBF-kernel support vector regressor with fixed hyper-parameters.
svr_model = SVR(
    kernel='rbf',
    C=100,
    gamma=0.1,
    epsilon=0.1,
)

# Kept as the cell's last expression so the fitted estimator is displayed.
svr_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# --- Training-set fit quality ------------------------------------------------
# Predict on the same rows the SVR was fitted on.
y_train_pred = svr_model.predict(X_train)

train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

# --- Test-set quality --------------------------------------------------------
# Reference RUL values from the RUL file, flattened to a 1-D array.
true_rul = rul_data.to_numpy().flatten()

# For every engine_id, locate the row holding its largest time_in_cycles and
# predict from that row only.
last_cycle_indices = (
    test_data_reduced
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles = test_data_reduced.loc[last_cycle_indices, features]
y_pred_last_cycles = svr_model.predict(X_test_last_cycles)

test_mse = mean_squared_error(y_true=true_rul, y_pred=y_pred_last_cycles)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_true=true_rul, y_pred=y_pred_last_cycles)
test_r2 = r2_score(y_true=true_rul, y_pred=y_pred_last_cycles)

# --- Report: training metrics first, then testing metrics --------------------
print(
    f'Training Root Mean Squared Error (RMSE): {train_rmse}',
    f'Training Mean Absolute Error (MAE): {train_mae}',
    f'Training R² Score: {train_r2}',
    sep='\n',
)
print(
    f'Testing Root Mean Squared Error (RMSE): {test_rmse}',
    f'Testing Mean Absolute Error (MAE): {test_mae}',
    f'Testing R² Score: {test_r2}',
    sep='\n',
)

Training Root Mean Squared Error (RMSE): 44.9727518957578
Training Mean Absolute Error (MAE): 32.70686873404441
Training R² Score: 0.5773909916032575
Testing Root Mean Squared Error (RMSE): 295.1509202587355
Testing Mean Absolute Error (MAE): 290.2101084144411
Testing R² Score: -29.120776435758373


# XGBOOST

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# This cell builds its own inputs instead of borrowing the *_lr variables from
# the Linear Regression cell, so it runs correctly even when that cell has not
# been executed or has been changed.

# Select features and target: all columns except the first two and the last
# one are predictors; 'RUL' is the target.
features_xgb = train_data_reduced.columns[2:-1]
X_train_xgb = train_data_reduced[features_xgb]
y_train_xgb = train_data_reduced['RUL']

# Train the XGBoost model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
model_xgb.fit(X_train_xgb, y_train_xgb)

# Extract the last cycle for each engine in the test set
last_cycle_indices_xgb = test_data_reduced.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_xgb = test_data_reduced.loc[last_cycle_indices_xgb, features_xgb]

# Make predictions for the last cycles of each engine
y_pred_last_cycles_xgb = model_xgb.predict(X_test_last_cycles_xgb)

# True RUL values from the provided RUL file
true_rul_xgb = rul_data.values.flatten()

# Calculate RMSE, R² Score and MAE
mse_xgb = mean_squared_error(true_rul_xgb, y_pred_last_cycles_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(true_rul_xgb, y_pred_last_cycles_xgb)
mae_xgb = mean_absolute_error(true_rul_xgb, y_pred_last_cycles_xgb)

print(f'XGBoost - Root Mean Squared Error (RMSE): {rmse_xgb}')
print(f'XGBoost - R² Score: {r2_xgb}')
print(f'XGBoost - Mean Absolute Error (MAE): {mae_xgb}')

XGBoost - Root Mean Squared Error (RMSE): 30.90889727612561
XGBoost - R² Score: 0.6696723697706657
XGBoost - Mean Absolute Error (MAE): 23.087654748938718


# POLYNOMIAL REGRESSION

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# --- Training set ------------------------------------------------------------
# Predictors are all columns except the first two and the last one; the target
# is the 'RUL' column.
features = train_data_reduced.columns[2:-1]
X_train_poly = train_data_reduced.loc[:, features]
y_train_poly = train_data_reduced.loc[:, 'RUL']

# Expand the predictors into degree-2 polynomial terms, then fit a linear
# model on the expanded matrix (fit() returns the fitted estimator).
poly = PolynomialFeatures(degree=2)
X_train_poly_transformed = poly.fit_transform(X_train_poly)
model_poly = LinearRegression().fit(X_train_poly_transformed, y_train_poly)

# --- Test set ----------------------------------------------------------------
# Reference RUL values from the RUL file, flattened to a 1-D array.
true_rul_poly = rul_data.to_numpy().flatten()

# For every engine_id, locate the row holding its largest time_in_cycles.
last_cycle_indices_poly = (
    test_data_reduced
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles_poly = test_data_reduced.loc[last_cycle_indices_poly, features]

# Apply the already-fitted polynomial expansion to those rows and predict.
X_test_last_cycles_poly_transformed = poly.transform(X_test_last_cycles_poly)
y_pred_last_cycles_poly = model_poly.predict(X_test_last_cycles_poly_transformed)

# --- Metrics -----------------------------------------------------------------
mse_poly = mean_squared_error(y_true=true_rul_poly, y_pred=y_pred_last_cycles_poly)
rmse_poly = np.sqrt(mse_poly)
mae_poly = mean_absolute_error(y_true=true_rul_poly, y_pred=y_pred_last_cycles_poly)
r2_poly = r2_score(y_true=true_rul_poly, y_pred=y_pred_last_cycles_poly)

print(
    f'Polynomial Regression - Root Mean Squared Error (RMSE) -> degree-2 : {rmse_poly}',
    f'Polynomial Regression - R² Score -> degree-2 : {r2_poly}',
    f'Polynomial Regression - Mean Absolute Error (MAE) -> degree-2 : {mae_poly}',
    sep='\n',
)

Polynomial Regression - Root Mean Squared Error (RMSE) -> degree-2 : 31.858119105509672
Polynomial Regression - R² Score -> degree-2 : 0.6490719020164502
Polynomial Regression - Mean Absolute Error (MAE) -> degree-2 : 24.11435609463329


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# --- Training set ------------------------------------------------------------
# Predictors are all columns except the first two and the last one; the target
# is the 'RUL' column.
features3 = train_data_reduced.columns[2:-1]
X_train_poly3 = train_data_reduced.loc[:, features3]
y_train_poly3 = train_data_reduced.loc[:, 'RUL']

# Expand the predictors into degree-3 polynomial terms, then fit a linear
# model on the expanded matrix (fit() returns the fitted estimator).
poly3 = PolynomialFeatures(degree=3)
X_train_poly_transformed3 = poly3.fit_transform(X_train_poly3)
model_poly3 = LinearRegression().fit(X_train_poly_transformed3, y_train_poly3)

# --- Test set ----------------------------------------------------------------
# Reference RUL values from the RUL file, flattened to a 1-D array.
true_rul_poly3 = rul_data.to_numpy().flatten()

# For every engine_id, locate the row holding its largest time_in_cycles.
last_cycle_indices_poly3 = (
    test_data_reduced
    .groupby('engine_id')['time_in_cycles']
    .idxmax()
)
X_test_last_cycles_poly3 = test_data_reduced.loc[last_cycle_indices_poly3, features3]

# Apply the already-fitted polynomial expansion to those rows and predict.
X_test_last_cycles_poly_transformed3 = poly3.transform(X_test_last_cycles_poly3)
y_pred_last_cycles_poly3 = model_poly3.predict(X_test_last_cycles_poly_transformed3)

# --- Metrics -----------------------------------------------------------------
mse_poly3 = mean_squared_error(y_true=true_rul_poly3, y_pred=y_pred_last_cycles_poly3)
rmse_poly3 = np.sqrt(mse_poly3)
r2_poly3 = r2_score(y_true=true_rul_poly3, y_pred=y_pred_last_cycles_poly3)
mae_poly3 = mean_absolute_error(y_true=true_rul_poly3, y_pred=y_pred_last_cycles_poly3)

print(
    f'Polynomial Regression - Root Mean Squared Error (RMSE)-> degree-3 : {rmse_poly3}',
    f'Polynomial Regression - R² Score-> degree-3 : {r2_poly3}',
    f'Polynomial Regression - Mean Absolute Error (MAE) -> degree-3 : {mae_poly3}',
    sep='\n',
)

Polynomial Regression - Root Mean Squared Error (RMSE)-> degree-3 : 31.860607503460535
Polynomial Regression - R² Score-> degree-3 : 0.6490170787633124
Polynomial Regression - Mean Absolute Error (MAE) -> degree-3 : 23.742218351272083
