In [1]:
# Colab-only helper used to expose the user's Google Drive to this runtime.
from google.colab import drive
# Mount the Drive at /content/drive; the dataset files read later in this
# notebook are addressed through paths below this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# DATASET INITIALIZATION

In [2]:
import pandas as pd

# Folder on the mounted Drive that holds the three FD002 text files.
DATA_DIR = '/content/drive/MyDrive/turbofan dataset'


def _read_whitespace_table(path):
    """Read a header-less, whitespace-delimited text file into a DataFrame.

    ``sep=r'\\s+'`` is the supported spelling of the deprecated
    ``delim_whitespace=True`` option and parses the files the same way
    without emitting a deprecation warning on recent pandas releases.
    """
    return pd.read_csv(path, sep=r'\s+', header=None)


# Load training and test data
train_data = _read_whitespace_table(f'{DATA_DIR}/train_FD002.txt')
test_data = _read_whitespace_table(f'{DATA_DIR}/test_FD002.txt')
# One value per test engine; used later as the ground-truth RUL.
rul_data = _read_whitespace_table(f'{DATA_DIR}/RUL_FD002.txt')

In [3]:
# Labels for the 26 columns of the header-less train/test files, in file order.
column_names = [
    "engine_id", "time_in_cycles",
    "altitude", "mach_no", "throttle_angle",
    "fan_inlet_temp", "LPC_outlet_temp", "HPC_outlet_temp", "LPT_outlet_temp",
    "fan_inlet_pressure", "bypass_duct_pressure", "HPC_outlet_pressure",
    "fan_speed", "core_speed",
    "engine_pressure_ratio", "HPC_outlet_static_pressure", "fuel_ps30_ratio",
    "corrected_fan_speed", "corrected_core_speed",
    "bypass_ratio", "burner_fuel_air_ratio", "bleed_enthalpy",
    "demanded_fan_speed", "demanded_corrected_fan_speed",
    "HPT_coolant_bleed", "LPT_coolant_bleed",
]
# Both frames share the same layout, so they receive the same labels.
train_data.columns = test_data.columns = column_names

In [4]:
# Remaining useful life for every training row: the engine's largest recorded
# cycle minus the row's own cycle, so the last row of each engine gets RUL 0.
# The aggregation is named by the string 'max' instead of passing the Python
# builtin `max`, which recent pandas versions deprecate for groupby transforms.
train_data['RUL'] = (
    train_data.groupby('engine_id')['time_in_cycles'].transform('max')
    - train_data['time_in_cycles']
)

In [5]:
# Summary statistics of the training frame, flipped so each column is one row.
train_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
engine_id,53759.0,131.082981,74.463862,1.0,68.0,131.0,195.0,260.0
time_in_cycles,53759.0,109.154746,69.180569,1.0,52.0,104.0,157.0,378.0
altitude,53759.0,23.998407,14.747376,0.0,10.0046,25.0013,41.998,42.008
mach_no,53759.0,0.572056,0.310016,0.0,0.2507,0.7,0.84,0.842
throttle_angle,53759.0,94.04602,14.237735,60.0,100.0,100.0,100.0,100.0
fan_inlet_temp,53759.0,472.910207,26.389707,445.0,445.0,462.54,491.19,518.67
LPC_outlet_temp,53759.0,579.672399,37.289399,535.53,549.57,555.98,607.34,644.52
HPC_outlet_temp,53759.0,1419.971013,105.946341,1243.73,1352.76,1369.18,1499.37,1612.88
LPT_outlet_temp,53759.0,1205.442024,119.123428,1023.77,1123.655,1138.89,1306.85,1439.23
fan_inlet_pressure,53759.0,8.031986,3.613839,3.91,3.91,7.05,10.52,14.62


# REMOVING OUTLIERS

In [None]:
# Function to remove outliers using IQR
def remove_outliers(df, columns, factor=1.5):
    """Drop rows whose value in any listed column lies outside the IQR fences.

    Columns are handled one after another. The quartiles of each column are
    computed on the rows that survived the previous columns, so the result can
    depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is never modified.
    columns : iterable of str
        Names of the columns to screen. May be empty.
    factor : float, default 1.5
        Fence width as a multiple of the inter-quartile range; a row is kept
        when ``Q1 - factor*IQR <= value <= Q3 + factor*IQR``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows with their original index
        labels. Rows with a missing value in a screened column are dropped
        because they fail both comparisons.
    """
    kept = df
    for col in columns:
        q1 = kept[col].quantile(0.25)
        q3 = kept[col].quantile(0.75)
        spread = q3 - q1
        low_fence = q1 - factor * spread
        high_fence = q3 + factor * spread
        inside = (kept[col] >= low_fence) & (kept[col] <= high_fence)
        kept = kept[inside]
    # Always hand back an independent copy: with an empty column list the loop
    # never runs and `kept` would otherwise be the caller's own frame, and a
    # filtered slice would raise chained-assignment warnings when the caller
    # later writes into it.
    return kept.copy()

import warnings

# List of columns to check for outliers: every column except the two
# identifier columns and the RUL target.
columns_to_check = ["altitude", "mach_no", "throttle_angle", "fan_inlet_temp", "LPC_outlet_temp",
                    "HPC_outlet_temp", "LPT_outlet_temp", "fan_inlet_pressure", "bypass_duct_pressure",
                    "HPC_outlet_pressure", "fan_speed", "core_speed", "engine_pressure_ratio",
                    "HPC_outlet_static_pressure", "fuel_ps30_ratio", "corrected_fan_speed",
                    "corrected_core_speed", "bypass_ratio", "burner_fuel_air_ratio", "bleed_enthalpy",
                    "demanded_fan_speed", "demanded_corrected_fan_speed", "HPT_coolant_bleed",
                    "LPT_coolant_bleed"]

# Remove outliers from train and test data
train_data_cleaned = remove_outliers(train_data, columns_to_check)
test_data_cleaned = remove_outliers(test_data, columns_to_check)

# The evaluation cells pair each test engine's last remaining row with the
# values of the RUL file. If filtering removed an engine's real final cycle,
# that pairing uses an earlier cycle instead, so make the situation visible
# rather than letting it pass silently.
final_rows_before = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
engines_missing_final = int((~final_rows_before.isin(test_data_cleaned.index)).sum())
if engines_missing_final:
    warnings.warn(
        f"Outlier removal dropped the final recorded cycle of {engines_missing_final} "
        f"of {len(final_rows_before)} test engines; evaluation that pairs each engine's "
        "last remaining row with the RUL file will use an earlier cycle for those engines."
    )

# Check the shape of the datasets before and after removing outliers
print("Train data before removing outliers:", train_data.shape)
print("Train data after removing outliers:", train_data_cleaned.shape)
print("Test data before removing outliers:", test_data.shape)
print("Test data after removing outliers:", test_data_cleaned.shape)

Train data before removing outliers: (53759, 27)
Train data after removing outliers: (41575, 27)
Test data before removing outliers: (33991, 26)
Test data after removing outliers: (26597, 26)


# SCALING

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to scale: everything after the two identifier columns, excluding the
# trailing RUL target of the training frame. The test frame has no RUL column,
# so the same names cover its columns from position 2 onwards.
feature_columns = list(train_data_cleaned.columns[2:-1])

# Work on independent frames so the assignments below never write into a
# filtered slice of the raw data (avoids chained-assignment warnings).
train_data_cleaned = train_data_cleaned.copy()
test_data_cleaned = test_data_cleaned.copy()

# Normalize the data. The scaler is fitted on the training rows only and then
# applied unchanged to the test rows. Whole columns are replaced by name, so
# integer-typed columns become float columns instead of having float values
# forced into an integer block through positional assignment.
# NOTE: this cell overwrites its inputs; re-running it without first re-running
# the outlier-removal cell would scale already-scaled values.
scaler = StandardScaler()
train_data_cleaned[feature_columns] = scaler.fit_transform(train_data_cleaned[feature_columns])
test_data_cleaned[feature_columns] = scaler.transform(test_data_cleaned[feature_columns])

# LINEAR REGRESSION

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Model inputs are all columns between the two identifier columns and the
# trailing RUL column; RUL itself is the regression target.
features = train_data_cleaned.columns[2:-1]
X_train_lr = train_data_cleaned.loc[:, features]
y_train_lr = train_data_cleaned.loc[:, 'RUL']

# Fit the linear model (fit() hands back the fitted estimator).
model_lr = LinearRegression().fit(X_train_lr, y_train_lr)

# Ground truth: the RUL file flattened into a 1-D array.
true_rul_lr = rul_data.values.flatten()

# For every test engine, locate the row with the highest cycle count and keep
# only the feature columns of those rows.
test_cycles_lr = test_data_cleaned.groupby('engine_id')['time_in_cycles']
last_cycle_indices_lr = test_cycles_lr.idxmax()
X_test_last_cycles_lr = test_data_cleaned.loc[last_cycle_indices_lr, features]

# One prediction per engine, taken at that engine's last available row.
y_pred_last_cycles_lr = model_lr.predict(X_test_last_cycles_lr)

# Error metrics of the predictions against the RUL file.
mse_lr = mean_squared_error(y_true=true_rul_lr, y_pred=y_pred_last_cycles_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_true=true_rul_lr, y_pred=y_pred_last_cycles_lr)
mae_lr = mean_absolute_error(y_true=true_rul_lr, y_pred=y_pred_last_cycles_lr)

print(f'Linear Regression - Root Mean Squared Error (RMSE): {rmse_lr}')
print(f'Linear Regression - R² Score: {r2_lr}')
print(f'Linear Regression - Mean Absolute Error (MAE): {mae_lr}')

Linear Regression - Root Mean Squared Error (RMSE): 34.482291999470625
Linear Regression - R² Score: 0.588878556341539
Linear Regression - Mean Absolute Error (MAE): 28.107209230677974


# RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Inputs: every column after the two identifiers except the final RUL column,
# which serves as the target.
features = train_data_cleaned.columns[2:-1]
X_train_rf = train_data_cleaned.loc[:, features]
y_train_rf = train_data_cleaned.loc[:, 'RUL']

# 100 trees with a fixed seed so repeated runs build the same forest.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_rf, y_train_rf)

# Reference values taken from the RUL file, as a flat array.
true_rul_rf = rul_data.values.flatten()

# Row label of the highest cycle per test engine, then the matching features.
test_cycles_rf = test_data_cleaned.groupby('engine_id')['time_in_cycles']
last_cycle_indices_rf = test_cycles_rf.idxmax()
X_test_last_cycles_rf = test_data_cleaned.loc[last_cycle_indices_rf, features]

# Predict once per engine, at its last available row.
y_pred_last_cycles_rf = rf_model.predict(X_test_last_cycles_rf)

# Score the predictions against the RUL file.
mse_rf = mean_squared_error(y_true=true_rul_rf, y_pred=y_pred_last_cycles_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_true=true_rul_rf, y_pred=y_pred_last_cycles_rf)
mae_rf = mean_absolute_error(y_true=true_rul_rf, y_pred=y_pred_last_cycles_rf)

print(f'Random Forest - Root Mean Squared Error (RMSE): {rmse_rf}')
print(f'Random Forest - R² Score: {r2_rf}')
print(f'Random Forest - Mean Absolute Error (MAE): {mae_rf}')

Random Forest - Root Mean Squared Error (RMSE): 31.070951473725835
Random Forest - R² Score: 0.6661994985745447
Random Forest - Mean Absolute Error (MAE): 22.795057915057914


# SUPPORT VECTOR REGRESSION (SVR)

In [None]:
from sklearn.svm import SVR

# Feature matrix (columns between the identifiers and RUL) and RUL target.
features = train_data_cleaned.columns[2:-1]
X_train, y_train = train_data_cleaned.loc[:, features], train_data_cleaned.loc[:, 'RUL']

# RBF-kernel support vector regressor; hyperparameters kept in one place.
svr_hyperparams = {'kernel': 'rbf', 'C': 100, 'gamma': 0.1, 'epsilon': 0.1}
svr_model = SVR(**svr_hyperparams)
svr_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# --- Training-set fit quality (all training rows) ---
y_train_pred = svr_model.predict(X_train)
train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_true=y_train, y_pred=y_train_pred)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)

# --- Test-set quality (one row per engine: its highest remaining cycle) ---
test_cycles = test_data_cleaned.groupby('engine_id')['time_in_cycles']
last_cycle_indices = test_cycles.idxmax()
X_test_last_cycles = test_data_cleaned.loc[last_cycle_indices, features]
y_pred_last_cycles = svr_model.predict(X_test_last_cycles)

# Reference values from the RUL file, flattened to one dimension.
true_rul = rul_data.values.flatten()

test_mse = mean_squared_error(y_true=true_rul, y_pred=y_pred_last_cycles)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_true=true_rul, y_pred=y_pred_last_cycles)
test_r2 = r2_score(y_true=true_rul, y_pred=y_pred_last_cycles)

# Report training metrics first, then testing metrics.
print(f'Training Root Mean Squared Error (RMSE): {train_rmse}')
print(f'Training Mean Absolute Error (MAE): {train_mae}')
print(f'Training R² Score: {train_r2}')

print(f'Testing Root Mean Squared Error (RMSE): {test_rmse}')
print(f'Testing Mean Absolute Error (MAE): {test_mae}')
print(f'Testing R² Score: {test_r2}')

Training Root Mean Squared Error (RMSE): 45.41083859067471
Training Mean Absolute Error (MAE): 32.761686423236554
Training R² Score: 0.5543984229030667
Testing Root Mean Squared Error (RMSE): 30.260517374244355
Testing Mean Absolute Error (MAE): 21.48099538877779
Testing R² Score: 0.6833856625505517


# XGBOOST

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Select features and target. They are built here, the same way every other
# model cell builds them, instead of being borrowed from the Linear Regression
# cell, so this cell keeps working if that cell is skipped, moved or removed.
features_xgb = train_data_cleaned.columns[2:-1]
X_train_xgb = train_data_cleaned[features_xgb]
y_train_xgb = train_data_cleaned['RUL']

# Train the XGBoost model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
model_xgb.fit(X_train_xgb, y_train_xgb)

# Extract the last remaining cycle for each engine in the test set
last_cycle_indices_xgb = test_data_cleaned.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_xgb = test_data_cleaned.loc[last_cycle_indices_xgb, features_xgb]

# Make predictions for the last cycles of each engine
y_pred_last_cycles_xgb = model_xgb.predict(X_test_last_cycles_xgb)

# True RUL values from the provided RUL file
true_rul_xgb = rul_data.values.flatten()

# Calculate RMSE, R² Score and MAE
mse_xgb = mean_squared_error(true_rul_xgb, y_pred_last_cycles_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(true_rul_xgb, y_pred_last_cycles_xgb)
mae_xgb = mean_absolute_error(true_rul_xgb, y_pred_last_cycles_xgb)

print(f'XGBoost - Root Mean Squared Error (RMSE): {rmse_xgb}')
print(f'XGBoost - R² Score: {r2_xgb}')
print(f'XGBoost - Mean Absolute Error (MAE): {mae_xgb}')

XGBoost - Root Mean Squared Error (RMSE): 31.39120568911112
XGBoost - R² Score: 0.6592829455139659
XGBoost - Mean Absolute Error (MAE): 23.234131834739422


# POLYNOMIAL REGRESSION

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Base inputs (columns between the identifiers and RUL) and the RUL target.
features = train_data_cleaned.columns[2:-1]
X_train_poly = train_data_cleaned.loc[:, features]
y_train_poly = train_data_cleaned.loc[:, 'RUL']

# Learn the degree-2 expansion on the training inputs, then expand them.
poly = PolynomialFeatures(degree=2).fit(X_train_poly)
X_train_poly_transformed = poly.transform(X_train_poly)

# Linear model fitted on the expanded inputs.
model_poly = LinearRegression().fit(X_train_poly_transformed, y_train_poly)

# Reference values from the RUL file as a flat array.
true_rul_poly = rul_data.values.flatten()

# Per test engine: the row with the highest cycle count, feature columns only.
test_cycles_poly = test_data_cleaned.groupby('engine_id')['time_in_cycles']
last_cycle_indices_poly = test_cycles_poly.idxmax()
X_test_last_cycles_poly = test_data_cleaned.loc[last_cycle_indices_poly, features]

# Apply the already-fitted expansion to the test rows and predict.
X_test_last_cycles_poly_transformed = poly.transform(X_test_last_cycles_poly)
y_pred_last_cycles_poly = model_poly.predict(X_test_last_cycles_poly_transformed)

# Score the predictions against the RUL file.
mse_poly = mean_squared_error(y_true=true_rul_poly, y_pred=y_pred_last_cycles_poly)
rmse_poly = np.sqrt(mse_poly)
mae_poly = mean_absolute_error(y_true=true_rul_poly, y_pred=y_pred_last_cycles_poly)
r2_poly = r2_score(y_true=true_rul_poly, y_pred=y_pred_last_cycles_poly)

print(f'Polynomial Regression - Root Mean Squared Error (RMSE) -> degree-2 : {rmse_poly}')
print(f'Polynomial Regression - R² Score -> degree-2 : {r2_poly}')
print(f'Polynomial Regression - Mean Absolute Error (MAE) -> degree-2 : {mae_poly}')

Polynomial Regression - Root Mean Squared Error (RMSE) -> degree-2 : 32.16514841863376
Polynomial Regression - R² Score -> degree-2 : 0.6422752418514863
Polynomial Regression - Mean Absolute Error (MAE) -> degree-2 : 23.970641533236478


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Base inputs (columns between the identifiers and RUL) and the RUL target.
features3 = train_data_cleaned.columns[2:-1]
X_train_poly3 = train_data_cleaned.loc[:, features3]
y_train_poly3 = train_data_cleaned.loc[:, 'RUL']

# Learn the degree-3 expansion on the training inputs, then expand them.
poly3 = PolynomialFeatures(degree=3).fit(X_train_poly3)
X_train_poly_transformed3 = poly3.transform(X_train_poly3)

# Linear model fitted on the expanded inputs.
model_poly3 = LinearRegression().fit(X_train_poly_transformed3, y_train_poly3)

# Reference values from the RUL file as a flat array.
true_rul_poly3 = rul_data.values.flatten()

# Per test engine: the row with the highest cycle count, feature columns only.
test_cycles_poly3 = test_data_cleaned.groupby('engine_id')['time_in_cycles']
last_cycle_indices_poly3 = test_cycles_poly3.idxmax()
X_test_last_cycles_poly3 = test_data_cleaned.loc[last_cycle_indices_poly3, features3]

# Apply the already-fitted expansion to the test rows and predict.
X_test_last_cycles_poly_transformed3 = poly3.transform(X_test_last_cycles_poly3)
y_pred_last_cycles_poly3 = model_poly3.predict(X_test_last_cycles_poly_transformed3)

# Score the predictions against the RUL file.
mse_poly3 = mean_squared_error(y_true=true_rul_poly3, y_pred=y_pred_last_cycles_poly3)
rmse_poly3 = np.sqrt(mse_poly3)
r2_poly3 = r2_score(y_true=true_rul_poly3, y_pred=y_pred_last_cycles_poly3)
mae_poly3 = mean_absolute_error(y_true=true_rul_poly3, y_pred=y_pred_last_cycles_poly3)

print(f'Polynomial Regression - Root Mean Squared Error (RMSE)-> degree-3 : {rmse_poly3}')
print(f'Polynomial Regression - R² Score-> degree-3 : {r2_poly3}')
print(f'Polynomial Regression - Mean Absolute Error (MAE) -> degree-3 : {mae_poly3}')

Polynomial Regression - Root Mean Squared Error (RMSE)-> degree-3 : 32.25978802788874
Polynomial Regression - R² Score-> degree-3 : 0.6401670757941166
Polynomial Regression - Mean Absolute Error (MAE) -> degree-3 : 24.564060166995958
