In [1]:
# Mount Google Drive into the Colab runtime; the data-loading cell reads its
# files from paths under '/content/drive'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# DATASET INITIALIZATION

In [2]:
import pandas as pd

# Directory on the mounted Drive that holds the three FD001 text files.
DATA_DIR = '/content/drive/MyDrive/CMAPSSData'

# Load training data, test data and the RUL file that accompanies the test set.
# The files are whitespace-delimited and have no header row. `sep=r'\s+'` is the
# supported spelling of the deprecated `delim_whitespace=True` and parses the
# files the same way.
train_data = pd.read_csv(f'{DATA_DIR}/train_FD001.txt', sep=r'\s+', header=None)
test_data = pd.read_csv(f'{DATA_DIR}/test_FD001.txt', sep=r'\s+', header=None)
rul_data = pd.read_csv(f'{DATA_DIR}/RUL_FD001.txt', sep=r'\s+', header=None)

In [3]:
# Quick (rows, columns) check of the raw training table before column names are assigned.
train_data.shape

(20631, 26)

In [4]:
# Column labels for the header-less data files, in file order:
# two identifier columns, three operating-setting columns, then 21 sensor columns.
column_names = (
    ['engine_id', 'time_in_cycles']
    + ['altitude', 'mach_no', 'throttle_angle']
    + [
        'fan_inlet_temp',
        'lpc_outlet_temp',
        'hpc_outlet_temp',
        'lpt_outlet_temp',
        'fan_inlet_pressure',
        'bypass_duct_pressure',
        'hpc_outlet_pressure',
        'fan_speed',
        'core_speed',
        'engine_pressure_ratio',
        'hpc_outlet_static_pressure',
        'fuel_ps30_ratio',
        'corrected_fan_speed',
        'corrected_core_speed',
        'bypass_ratio',
        'burner_fuel_air_ratio',
        'bleed_enthalpy',
        'demanded_fan_speed',
        'demanded_corrected_fan_speed',
        'hpt_coolant_bleed',
        'lpt_coolant_bleed',
    ]
)

# Label both frames with the same names so train and test columns line up.
train_data.columns = test_data.columns = column_names

In [5]:
# Remaining Useful Life target: for every row, the number of cycles left until
# that engine's last recorded cycle (per-engine max cycle minus current cycle).
# The aggregation is given by name ('max'); passing the builtin `max` to
# groupby.transform is deprecated in pandas.
train_data['RUL'] = (
    train_data.groupby('engine_id')['time_in_cycles'].transform('max')
    - train_data['time_in_cycles']
)

In [6]:
# Confirm the column labels of the training frame, including the derived 'RUL' column.
train_data.columns

Index(['engine_id', 'time_in_cycles', 'altitude', 'mach_no', 'throttle_angle',
       'fan_inlet_temp', 'lpc_outlet_temp', 'hpc_outlet_temp',
       'lpt_outlet_temp', 'fan_inlet_pressure', 'bypass_duct_pressure',
       'hpc_outlet_pressure', 'fan_speed', 'core_speed',
       'engine_pressure_ratio', 'hpc_outlet_static_pressure',
       'fuel_ps30_ratio', 'corrected_fan_speed', 'corrected_core_speed',
       'bypass_ratio', 'burner_fuel_air_ratio', 'bleed_enthalpy',
       'demanded_fan_speed', 'demanded_corrected_fan_speed',
       'hpt_coolant_bleed', 'lpt_coolant_bleed', 'RUL'],
      dtype='object')

In [7]:
# Plotting imports, followed by the column names (now including the derived 'RUL' target)
import matplotlib.pyplot as plt
import numpy as np

# Same labels as the data files, with the derived 'RUL' target appended last:
# identifiers, operating settings, sensors, target.
column_names = (
    ['engine_id', 'time_in_cycles']
    + ['altitude', 'mach_no', 'throttle_angle']
    + [
        'fan_inlet_temp',
        'lpc_outlet_temp',
        'hpc_outlet_temp',
        'lpt_outlet_temp',
        'fan_inlet_pressure',
        'bypass_duct_pressure',
        'hpc_outlet_pressure',
        'fan_speed',
        'core_speed',
        'engine_pressure_ratio',
        'hpc_outlet_static_pressure',
        'fuel_ps30_ratio',
        'corrected_fan_speed',
        'corrected_core_speed',
        'bypass_ratio',
        'burner_fuel_air_ratio',
        'bleed_enthalpy',
        'demanded_fan_speed',
        'demanded_corrected_fan_speed',
        'hpt_coolant_bleed',
        'lpt_coolant_bleed',
    ]
    + ['RUL']
)

# Axis labels for the sensor plots: maps each sensor column name to a
# human-readable description (with units where the label gives them).
Sensor_dictionary = dict(
    fan_inlet_temp="(Fan inlet temperature) (◦R)",
    lpc_outlet_temp="(LPC outlet temperature) (◦R)",
    hpc_outlet_temp="(HPC outlet temperature) (◦R)",
    lpt_outlet_temp="(LPT outlet temperature) (◦R)",
    fan_inlet_pressure="(Fan inlet Pressure) (psia)",
    bypass_duct_pressure="(bypass-duct pressure) (psia)",
    hpc_outlet_pressure="(HPC outlet pressure) (psia)",
    fan_speed="(Physical fan speed) (rpm)",
    core_speed="(Physical core speed) (rpm)",
    engine_pressure_ratio="(Engine pressure ratio(P50/P2))",
    hpc_outlet_static_pressure="(HPC outlet Static pressure) (psia)",
    fuel_ps30_ratio="(Ratio of fuel flow to Ps30) (pps/psia)",
    corrected_fan_speed="(Corrected fan speed) (rpm)",
    corrected_core_speed="(Corrected core speed) (rpm)",
    bypass_ratio="(Bypass Ratio)",
    burner_fuel_air_ratio="(Burner fuel-air ratio)",
    bleed_enthalpy="(Bleed Enthalpy)",
    demanded_fan_speed="(Required fan speed)",
    demanded_corrected_fan_speed="(Required fan conversion speed)",
    hpt_coolant_bleed="(High-pressure turbines Cool air flow)",
    lpt_coolant_bleed="(Low-pressure turbines Cool air flow)",
)

def plot_signal(df, Sensor_dic, signal_name, unit_step=10, window=10, max_rul=250):
    """Plot one smoothed sensor signal against RUL for a subset of engines.

    One line is drawn per selected engine on a single figure. Each selected
    engine's rows are smoothed with a trailing rolling mean before plotting,
    and the x-axis is reversed so RUL counts down to zero from left to right.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'engine_id', 'RUL' and ``signal_name`` columns.
    Sensor_dic : mapping
        Maps column names to y-axis labels. When ``signal_name`` is missing,
        the column name itself is used as the label.
    signal_name : str
        Column plotted on the y-axis.
    unit_step : int, default 10
        Only engines whose id is a multiple of this value are drawn, which
        keeps the figure readable (every tenth engine by default).
    window : int, default 10
        Number of consecutive rows in the rolling-mean window.
    max_rul : int, default 250
        Left edge of the reversed x-axis; ticks are placed every 25 cycles
        from 0 up to this value.
    """
    plt.figure(figsize=(13, 5))
    # sort=False keeps engines in order of first appearance, like unique().
    for unit_id, unit_df in df.groupby('engine_id', sort=False):
        if unit_id % unit_step != 0:
            continue
        # The rolling mean is taken over the whole per-engine frame, so both
        # the signal and the 'RUL' column it is plotted against are smoothed.
        plt.plot('RUL', signal_name, data=unit_df.rolling(window).mean())

    plt.xlim(max_rul, 0)  # Reverse the x-axis so RUL counts down to zero
    # Ticks stop at max_rul: a tick beyond the limit would widen the view
    # again and silently override the xlim set above.
    plt.xticks(np.arange(0, max_rul + 1, 25))
    plt.ylabel(Sensor_dic.get(signal_name, signal_name))
    plt.xlabel('Remaining Useful Life')
    plt.title(f'{signal_name} vs. Remaining Useful Life')
    plt.show()

# Draw one figure per sensor listed in Sensor_dictionary, using the training frame.
# A failure on one signal is reported and the loop continues with the next one.
for signal in Sensor_dictionary:
    try:
        plot_signal(train_data, Sensor_dictionary, signal)
    except Exception as plot_error:
        print(f"Could not plot signal {signal}: {plot_error}")

Output hidden; open in https://colab.research.google.com to view.

In [8]:
# matrix = train_data.corr()
# print(matrix)

In [9]:
# Sensor columns removed from both the training and the test frame.
# NOTE(review): presumably chosen because they carry no useful trend in the
# sensor plots - confirm the selection criterion.
dropping = [
    'fan_inlet_temp',
    'fan_inlet_pressure',
    'engine_pressure_ratio',
    'burner_fuel_air_ratio',
    'demanded_fan_speed',
    'demanded_corrected_fan_speed',
]

In [10]:
# Remove the columns listed in `dropping` from the training frame.
train_data = train_data.drop(columns=dropping)

In [11]:
# (rows, columns) of the training frame after removing the columns listed in `dropping`.
train_data.shape

(20631, 21)

In [12]:
# Remove the same columns from the test frame so it matches the training features.
test_data = test_data.drop(columns=dropping)

# REMOVING OUTLIERS

In [13]:
from scipy import stats
import numpy as np

# Absolute Z-scores for the operational settings and sensor measurements,
# i.e. every column except engine_id, time_in_cycles and the trailing RUL.
z_scores = np.abs(stats.zscore(train_data.iloc[:, 2:-1]))

# A value more than `threshold` standard deviations from its column mean
# counts as an outlier.
threshold = 3

# Flag a row when any of its features is an outlier. NaN z-scores (e.g. from
# a zero-variance column) compare False and therefore never flag a row.
outliers = (z_scores > threshold).any(axis=1)

# Keep only the non-outlier rows. The explicit .copy() makes the result an
# independent frame, so the in-place scaling done in the next cell does not
# write into a slice of train_data (SettingWithCopyWarning).
train_data_cleaned = train_data[~outliers].copy()


# SCALING

In [14]:
from sklearn.preprocessing import StandardScaler

# Feature columns: everything between the two identifier columns and the
# trailing 'RUL' target. Selecting them by name keeps train and test aligned
# even though test_data has no 'RUL' column.
feature_cols = list(train_data_cleaned.columns[2:-1])

# Work on an independent frame so the assignment below never writes into a
# slice of train_data.
train_data_cleaned = train_data_cleaned.copy()

# Standardize the features: fit on the cleaned training data only, then apply
# the same transform to the test data. Assigning by column label replaces the
# columns outright, which avoids the deprecated in-place write of floats into
# integer columns that positional `.iloc[...] = ...` assignment performs.
# NOTE: re-running this cell scales already-scaled data; re-run from the
# outlier-removal cell instead.
scaler = StandardScaler()
train_data_cleaned[feature_cols] = scaler.fit_transform(train_data_cleaned[feature_cols])
test_data[feature_cols] = scaler.transform(test_data[feature_cols])

# SPLITTING

In [15]:
# # Select features and target
# features = train_data_cleaned.columns[2:-1]
# X_train = train_data_cleaned[features]
# y_train = train_data_cleaned['RUL']

# LINEAR REGRESSION

In [16]:
# from sklearn.linear_model import LinearRegression

# model = LinearRegression()
# model.fit(X_train, y_train)

In [17]:
# X_test = test_data[features]
# y_pred = model.predict(X_test)

In [18]:
# import numpy as np

# # Extract the last cycle for each engine in the test set
# last_cycle_indices = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
# X_test_last_cycles = test_data.loc[last_cycle_indices, features]

# # Make predictions for the last cycles of each engine
# y_pred_last_cycles = model.predict(X_test_last_cycles)

# # True RUL values from the provided RUL file
# true_rul = rul_data.values.flatten()

In [19]:
# # Calculate Mean Squared Error
# from sklearn.metrics import mean_squared_error, r2_score

# mse = mean_squared_error(true_rul, y_pred_last_cycles)
# print(f'Training Set - Mean Squared Error: {mse}')

# # Calculate Mean Squared Error
# mse = mean_squared_error(true_rul, y_pred_last_cycles)

# # Calculate Root Mean Squared Error (RMSE)
# rmse = np.sqrt(mse)

# # Calculate R² Score
# r2 = r2_score(true_rul, y_pred_last_cycles)

# print(f'Testing Set - Root Mean Squared Error (RMSE): {rmse}')
# print(f'Testing Set - R² Score: {r2}')


In [20]:
# Column layout of the cleaned training frame: the model cells take columns[2:-1] as features.
train_data_cleaned.columns

Index(['engine_id', 'time_in_cycles', 'altitude', 'mach_no', 'throttle_angle',
       'lpc_outlet_temp', 'hpc_outlet_temp', 'lpt_outlet_temp',
       'bypass_duct_pressure', 'hpc_outlet_pressure', 'fan_speed',
       'core_speed', 'hpc_outlet_static_pressure', 'fuel_ps30_ratio',
       'corrected_fan_speed', 'corrected_core_speed', 'bypass_ratio',
       'bleed_enthalpy', 'hpt_coolant_bleed', 'lpt_coolant_bleed', 'RUL'],
      dtype='object')

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Features are every column between the identifiers and the trailing 'RUL' target.
features = train_data_cleaned.columns[2:-1]
y_train_lr = train_data_cleaned['RUL']
X_train_lr = train_data_cleaned.loc[:, features]

# Fit the linear model (fit() returns the estimator itself).
model_lr = LinearRegression().fit(X_train_lr, y_train_lr)

# Evaluation uses one row per test engine: the row with its highest cycle count.
last_cycle_indices_lr = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_lr = test_data.loc[last_cycle_indices_lr, features]
y_pred_last_cycles_lr = model_lr.predict(X_test_last_cycles_lr)

# Ground-truth RUL from the provided RUL file, flattened to 1-D.
# NOTE(review): assumes the file lists engines in engine_id order - confirm.
true_rul_lr = rul_data.to_numpy().flatten()

# Error metrics on the last-cycle predictions.
r2_lr = r2_score(true_rul_lr, y_pred_last_cycles_lr)
mae_lr = mean_absolute_error(true_rul_lr, y_pred_last_cycles_lr)
mse_lr = mean_squared_error(true_rul_lr, y_pred_last_cycles_lr)
rmse_lr = np.sqrt(mse_lr)

print(f'Linear Regression - Root Mean Squared Error (RMSE): {rmse_lr}')
print(f'Linear Regression - R² Score: {r2_lr}')
print(f'Linear Regression - Mean Absolute Error (MAE) : {mae_lr}')

Linear Regression - Root Mean Squared Error (RMSE): 32.75020198402294
Linear Regression - R² Score: 0.3788901431849282
Linear Regression - Mean Absolute Error (MAE) : 26.4359432332933


# RANDOM FOREST

In [22]:
# from sklearn.ensemble import RandomForestRegressor

# # Select features and target
# features = train_data_cleaned.columns[2:-1]
# X_train = train_data_cleaned[features]
# y_train = train_data_cleaned['RUL']

# # Train the Random Forest Regressor model
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

In [23]:
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Extract the last cycle for each engine in the test set
# last_cycle_indices = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
# X_test_last_cycles = test_data.loc[last_cycle_indices, features]

# # Make predictions for the last cycles of each engine
# y_pred_last_cycles = rf_model.predict(X_test_last_cycles)

# # True RUL values from the provided RUL file
# true_rul = rul_data.values.flatten()

# # Calculate RMSE and R² Score
# mse = mean_squared_error(true_rul, y_pred_last_cycles)
# rmse = np.sqrt(mse)
# r2 = r2_score(true_rul, y_pred_last_cycles)

# print(f'Root Mean Squared Error (RMSE): {rmse}')
# print(f'R² Score: {r2}')

In [24]:
from sklearn.ensemble import RandomForestRegressor
# mean_absolute_error is imported here because this cell uses it; previously it
# only worked when the linear-regression cell had been run first.
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Select features (all columns between the identifiers and 'RUL') and target
features = train_data_cleaned.columns[2:-1]
X_train_rf = train_data_cleaned[features]
y_train_rf = train_data_cleaned['RUL']

# Train the Random Forest Regressor model (seeded for reproducible results)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Extract the last cycle for each engine in the test set
last_cycle_indices_rf = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_rf = test_data.loc[last_cycle_indices_rf, features]

# Make predictions for the last cycles of each engine
y_pred_last_cycles_rf = rf_model.predict(X_test_last_cycles_rf)

# True RUL values from the provided RUL file, flattened to 1-D
true_rul_rf = rul_data.values.flatten()

# Calculate RMSE, R² Score and MAE on the last-cycle predictions
mse_rf = mean_squared_error(true_rul_rf, y_pred_last_cycles_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(true_rul_rf, y_pred_last_cycles_rf)
mae_rf = mean_absolute_error(true_rul_rf, y_pred_last_cycles_rf)


print(f'Random Forest - Root Mean Squared Error (RMSE): {rmse_rf}')
print(f'Random Forest - R² Score: {r2_rf}')
print(f'Random Forest - Mean Absolute Error (MAE) : {mae_rf}')

Random Forest - Root Mean Squared Error (RMSE): 33.90076686448258
Random Forest - R² Score: 0.3344824681608848
Random Forest - Mean Absolute Error (MAE) : 25.132199999999994


# Support Vector Regressor (SVR) model

In [25]:
# from sklearn.svm import SVR

# # Select features and target
# features = train_data_cleaned.columns[2:-1]
# X_train = train_data_cleaned[features]
# y_train = train_data_cleaned['RUL']

# # Train the SVR model
# svr_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
# svr_model.fit(X_train, y_train)

In [26]:
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Extract the last cycle for each engine in the test set
# last_cycle_indices = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
# X_test_last_cycles = test_data.loc[last_cycle_indices, features]

# # Make predictions for the last cycles of each engine
# y_pred_last_cycles = svr_model.predict(X_test_last_cycles)

# # True RUL values from the provided RUL file
# true_rul = rul_data.values.flatten()

# # Calculate RMSE and R² Score
# mse = mean_squared_error(true_rul, y_pred_last_cycles)
# rmse = np.sqrt(mse)
# r2 = r2_score(true_rul, y_pred_last_cycles)

# print(f'Root Mean Squared Error (RMSE): {rmse}')
# print(f'R² Score: {r2}')

In [27]:
from sklearn.svm import SVR
# mean_absolute_error is imported here because this cell uses it; previously it
# only worked when the linear-regression cell had been run first.
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Select features (all columns between the identifiers and 'RUL') and target
features = train_data_cleaned.columns[2:-1]
X_train_svr = train_data_cleaned[features]
y_train_svr = train_data_cleaned['RUL']

# Train the SVR model with an RBF kernel
svr_model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_model.fit(X_train_svr, y_train_svr)

# Extract the last cycle for each engine in the test set
last_cycle_indices_svr = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_svr = test_data.loc[last_cycle_indices_svr, features]

# Make predictions for the last cycles of each engine
y_pred_last_cycles_svr = svr_model.predict(X_test_last_cycles_svr)

# True RUL values from the provided RUL file, flattened to 1-D
true_rul_svr = rul_data.values.flatten()

# Calculate RMSE, MAE and R² Score on the last-cycle predictions
mse_svr = mean_squared_error(true_rul_svr, y_pred_last_cycles_svr)
rmse_svr = np.sqrt(mse_svr)
mae_svr = mean_absolute_error(true_rul_svr, y_pred_last_cycles_svr)
r2_svr = r2_score(true_rul_svr, y_pred_last_cycles_svr)

print(f'SVR - Root Mean Squared Error (RMSE): {rmse_svr}')
print(f'SVR - R² Score: {r2_svr}')
print(f'SVR - Mean Absolute Error (MAE) : {mae_svr}')

SVR - Root Mean Squared Error (RMSE): 29.67212937512941
SVR - R² Score: 0.4901553298208401
SVR - Mean Absolute Error (MAE) : 21.23657448494454


# DECISION TREE

In [28]:
from sklearn.tree import DecisionTreeRegressor
# The metrics and numpy are imported here because this cell uses them;
# previously they were only available from earlier cells.
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# This cell reuses the train/test matrices and true RUL values built in the
# linear-regression cell (X_train_lr, y_train_lr, X_test_last_cycles_lr,
# true_rul_lr), so that cell must be run first.

# Train the Decision Tree Regressor model. random_state is pinned (same seed
# as the random forest) so repeated runs give the same tree and metrics.
model_dt = DecisionTreeRegressor(random_state=42)
model_dt.fit(X_train_lr, y_train_lr)

# Make predictions for the last cycles of each engine
y_pred_last_cycles_dt = model_dt.predict(X_test_last_cycles_lr)

# Calculate RMSE, R² Score and MAE on the last-cycle predictions
mse_dt = mean_squared_error(true_rul_lr, y_pred_last_cycles_dt)
rmse_dt = np.sqrt(mse_dt)
r2_dt = r2_score(true_rul_lr, y_pred_last_cycles_dt)
mae_dt = mean_absolute_error(true_rul_lr, y_pred_last_cycles_dt)

print(f'Decision Tree Regressor - Root Mean Squared Error (RMSE): {rmse_dt}')
print(f'Decision Tree Regressor - R² Score: {r2_dt}')
print(f'Decision Tree Regressor - Mean Absolute Error (MAE): {mae_dt}')

Decision Tree Regressor - Root Mean Squared Error (RMSE): 46.66519045284183
Decision Tree Regressor - R² Score: -0.26103325925709764
Decision Tree Regressor - Mean Absolute Error (MAE): 32.6


# XGBOOST

In [29]:
import xgboost as xgb
# mean_absolute_error and numpy are imported here because this cell uses them;
# previously they were only available from earlier cells.
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# This cell reuses the train/test matrices and true RUL values built in the
# linear-regression cell (X_train_lr, y_train_lr, X_test_last_cycles_lr,
# true_rul_lr), so that cell must be run first.

# Train the XGBoost model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)
model_xgb.fit(X_train_lr, y_train_lr)

# Make predictions for the last cycles of each engine
y_pred_last_cycles_xgb = model_xgb.predict(X_test_last_cycles_lr)

# Calculate RMSE, R² Score and MAE on the last-cycle predictions
mse_xgb = mean_squared_error(true_rul_lr, y_pred_last_cycles_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(true_rul_lr, y_pred_last_cycles_xgb)
mae_xgb = mean_absolute_error(true_rul_lr, y_pred_last_cycles_xgb)

print(f'XGBoost - Root Mean Squared Error (RMSE): {rmse_xgb}')
print(f'XGBoost - R² Score: {r2_xgb}')
print(f'XGBoost - Mean Absolute Error (MAE): {mae_xgb}')

XGBoost - Root Mean Squared Error (RMSE): 32.93609282092198
XGBoost - R² Score: 0.3718192674719687
XGBoost - Mean Absolute Error (MAE): 23.648133163452147


In [30]:
# Column labels of train_data (the frame before outlier removal), ending with 'RUL'.
train_data.columns

Index(['engine_id', 'time_in_cycles', 'altitude', 'mach_no', 'throttle_angle',
       'lpc_outlet_temp', 'hpc_outlet_temp', 'lpt_outlet_temp',
       'bypass_duct_pressure', 'hpc_outlet_pressure', 'fan_speed',
       'core_speed', 'hpc_outlet_static_pressure', 'fuel_ps30_ratio',
       'corrected_fan_speed', 'corrected_core_speed', 'bypass_ratio',
       'bleed_enthalpy', 'hpt_coolant_bleed', 'lpt_coolant_bleed', 'RUL'],
      dtype='object')

In [31]:
# Column labels of test_data; it has no 'RUL' column.
test_data.columns

Index(['engine_id', 'time_in_cycles', 'altitude', 'mach_no', 'throttle_angle',
       'lpc_outlet_temp', 'hpc_outlet_temp', 'lpt_outlet_temp',
       'bypass_duct_pressure', 'hpc_outlet_pressure', 'fan_speed',
       'core_speed', 'hpc_outlet_static_pressure', 'fuel_ps30_ratio',
       'corrected_fan_speed', 'corrected_core_speed', 'bypass_ratio',
       'bleed_enthalpy', 'hpt_coolant_bleed', 'lpt_coolant_bleed'],
      dtype='object')

# POLYNOMIAL REGRESSION

In [32]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Select features and target
# features = train_data_cleaned.columns[2:-1]
# X_train_poly = train_data_cleaned[features]
# y_train_poly = train_data_cleaned['RUL']

# # Generate polynomial features (degree 2 for this example)
# poly = PolynomialFeatures(degree=2)
# X_train_poly_transformed = poly.fit_transform(X_train_poly)

# # Train the Polynomial Regression model
# model_poly = LinearRegression()
# model_poly.fit(X_train_poly_transformed, y_train_poly)

# # Extract the last cycle for each engine in the test set
# last_cycle_indices_poly = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
# X_test_last_cycles_poly = test_data.loc[last_cycle_indices_poly, features]

# # Transform test data to polynomial features
# X_test_last_cycles_poly_transformed = poly.transform(X_test_last_cycles_poly)

# # Make predictions for the last cycles of each engine
# y_pred_last_cycles_poly = model_poly.predict(X_test_last_cycles_poly_transformed)

# # True RUL values from the provided RUL file
# true_rul_poly = rul_data.values.flatten()

# # Calculate RMSE and R² Score
# mse_poly = mean_squared_error(true_rul_poly, y_pred_last_cycles_poly)
# rmse_poly = np.sqrt(mse_poly)
# r2_poly = r2_score(true_rul_poly, y_pred_last_cycles_poly)

# print(f'Polynomial Regression - Root Mean Squared Error (RMSE): {rmse_poly}')
# print(f'Polynomial Regression - R² Score: {r2_poly}')

In [33]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Select features and target
# features3 = train_data_cleaned.columns[2:-1]
# X_train_poly3 = train_data_cleaned[features3]
# y_train_poly3 = train_data_cleaned['RUL']

# # Generate polynomial features (degree 3 for this example)
# poly3 = PolynomialFeatures(degree=3)
# X_train_poly_transformed3 = poly3.fit_transform(X_train_poly3)

# # Train the Polynomial Regression model
# model_poly3 = LinearRegression()
# model_poly3.fit(X_train_poly_transformed3, y_train_poly3)

# # Extract the last cycle for each engine in the test set
# last_cycle_indices_poly3 = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
# X_test_last_cycles_poly3 = test_data.loc[last_cycle_indices_poly3, features3]

# # Transform test data to polynomial features
# X_test_last_cycles_poly_transformed3 = poly3.transform(X_test_last_cycles_poly3)

# # Make predictions for the last cycles of each engine
# y_pred_last_cycles_poly3 = model_poly3.predict(X_test_last_cycles_poly_transformed3)

# # True RUL values from the provided RUL file
# true_rul_poly3 = rul_data.values.flatten()

# # Calculate RMSE and R² Score
# mse_poly3 = mean_squared_error(true_rul_poly3, y_pred_last_cycles_poly3)
# rmse_poly3 = np.sqrt(mse_poly3)
# r2_poly3 = r2_score(true_rul_poly3, y_pred_last_cycles_poly3)

# print(f'Polynomial Regression - Root Mean Squared Error (RMSE): {rmse_poly3}')
# print(f'Polynomial Regression - R² Score: {r2_poly3}')

In [34]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Function to train and evaluate polynomial regression models
# def train_and_evaluate_polynomial_model(degree, X_train, y_train, X_test, y_test):
#     # Generate polynomial features
#     poly_transformer = PolynomialFeatures(degree=degree)
#     X_train_transformed = poly_transformer.fit_transform(X_train)
#     X_test_transformed = poly_transformer.transform(X_test)

#     # Train the Polynomial Regression model
#     polynomial_model = LinearRegression()
#     polynomial_model.fit(X_train_transformed, y_train)

#     # Make predictions for the test set
#     y_pred = polynomial_model.predict(X_test_transformed)

#     # Calculate RMSE and R² Score
#     mse = mean_squared_error(y_test, y_pred)
#     rmse = np.sqrt(mse)
#     r2 = r2_score(y_test, y_pred)

#     return rmse, r2

# # Select features and target
# feature_columns = train_data_cleaned.columns[2:-1]
# X_train = train_data_cleaned[feature_columns]
# y_train = train_data_cleaned['RUL']

# # Extract the last cycle for each engine in the test set
# last_cycle_indices = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
# X_test_last_cycles = test_data.loc[last_cycle_indices, feature_columns]

# # True RUL values from the provided RUL file
# true_rul = rul_data.values.flatten()

# # Train and evaluate polynomial regression models of degrees 4, 5, and 6
# degrees = [4, 5, 6]
# results = {}

# for degree in degrees:
#     rmse, r2 = train_and_evaluate_polynomial_model(degree, X_train, y_train, X_test_last_cycles, true_rul)
#     results[degree] = {'RMSE': rmse, 'R2': r2}

# # Print results
# for degree, metrics in results.items():
#     print(f'Polynomial Regression (Degree {degree}) - Root Mean Squared Error (RMSE): {metrics["RMSE"]}')
#     print(f'Polynomial Regression (Degree {degree}) - R² Score: {metrics["R2"]}')


# FEATURE SELECTION

## Recursive Feature Elimination (RFE)

In [35]:
# # Select features and target
# features = train_data_cleaned.columns[2:-1]
# X_train_rf = train_data_cleaned[features]
# y_train_rf = train_data_cleaned['RUL']

# # Extract the last cycle for each engine in the test set
# last_cycle_indices_rf = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
# X_test_last_cycles_rf = test_data.loc[last_cycle_indices_rf, features]

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.feature_selection import RFE

# # Initialize the model for RFE
# rf_model_for_rfe = RandomForestRegressor(n_estimators=100, random_state=42)

# # Initialize the model for RFE
# rf_model_for_rfe = RandomForestRegressor(n_estimators=100, random_state=42)

# # Initialize RFE
# rfe = RFE(estimator=rf_model_for_rfe, n_features_to_select=10, step=1)

# # Fit RFE on the training data
# rfe.fit(X_train_rf, y_train_rf)

# # Transform training and test sets
# X_train_rf_selected_rfe = rfe.transform(X_train_rf)
# X_test_last_cycles_rf_selected_rfe = rfe.transform(X_test_last_cycles_rf)


In [36]:
# # Print selected features
# selected_features = X_train_rf.columns[rfe.support_]
# print("Selected Features:")
# print(selected_features)

In [37]:
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Train Linear Regression with selected features from RFE
# model_lr_rfe = LinearRegression()
# model_lr_rfe.fit(X_train_rf_selected_rfe, y_train_rf)
# y_pred_lr_rfe = model_lr_rfe.predict(X_test_last_cycles_rf_selected_rfe)

# # Evaluate Linear Regression with RFE selected features
# rmse_lr_rfe = np.sqrt(mean_squared_error(true_rul_rf, y_pred_lr_rfe))
# r2_lr_rfe = r2_score(true_rul_rf, y_pred_lr_rfe)
# print(f'Linear Regression with RFE selected features - RMSE: {rmse_lr_rfe}')
# print(f'Linear Regression with RFE selected features - R² Score: {r2_lr_rfe}')

In [38]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score

# # Train the Random Forest Regressor model with selected features from RFE
# rf_model_selected = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model_selected.fit(X_train_rf_selected_rfe, y_train_rf)

# # Make predictions for the last cycles of each engine using selected features
# y_pred_last_cycles_rf_selected = rf_model_selected.predict(X_test_last_cycles_rf_selected_rfe)

# # Calculate RMSE and R² Score with selected features
# mse_rf_selected = mean_squared_error(true_rul_rf, y_pred_last_cycles_rf_selected)
# rmse_rf_selected = np.sqrt(mse_rf_selected)
# r2_rf_selected = r2_score(true_rul_rf, y_pred_last_cycles_rf_selected)

# print(f'Random Forest with Selected Features - Root Mean Squared Error (RMSE): {rmse_rf_selected}')
# print(f'Random Forest with Selected Features - R² Score: {r2_rf_selected}')

## PCA

In [39]:
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

# # Standardize the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_rf)
# X_test_scaled = scaler.transform(X_test_last_cycles_rf)

# # Apply PCA
# pca = PCA(n_components=10)  # Specify the number of principal components (features) to retain
# X_train_pca = pca.fit_transform(X_train_scaled)
# X_test_pca = pca.transform(X_test_scaled)

# # Print explained variance ratio
# print("Explained variance ratio:", pca.explained_variance_ratio_)

# # # Optionally, print the components (principal axes in feature space)
# # print("Principal components (eigenvectors):")
# # print(pca.components_)

In [40]:
# feature_names = X_train_rf.columns

# print("Principal components (eigenvectors):")
# for i, component in enumerate(pca.components_):
#     print(f"Component {i+1}:")
#     print(f"Explained variance: {pca.explained_variance_ratio_[i]:.4f}")
#     print("Top 5 features contributing to this component:")
#     top_features_indices = component.argsort()[-5:][::-1]  # Top 5 features contributing to the component
#     top_features = feature_names[top_features_indices]
#     print(top_features)
#     print()

In [41]:
# # Train Linear Regression with PCA transformed features
# model_lr_pca = LinearRegression()
# model_lr_pca.fit(X_train_pca, y_train_rf)  # Use y_train_rf or appropriate target variable

# # Transform test set features with PCA
# X_test_pca_transformed = pca.transform(X_test_last_cycles_rf)

# # Predict with PCA transformed features
# y_pred_lr_pca = model_lr_pca.predict(X_test_pca_transformed)

# # Evaluate Linear Regression with PCA transformed features
# rmse_lr_pca = np.sqrt(mean_squared_error(true_rul_rf, y_pred_lr_pca))
# r2_lr_pca = r2_score(true_rul_rf, y_pred_lr_pca)
# print(f'Linear Regression with PCA transformed features - RMSE: {rmse_lr_pca}')
# print(f'Linear Regression with PCA transformed features - R² Score: {r2_lr_pca}')

# # Similarly, repeat for Random Forest and SVR with PCA transformed features

In [42]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.svm import SVR
# from sklearn.metrics import mean_squared_error, r2_score
# import numpy as np

# # Train Random Forest with PCA transformed features
# model_rf_pca = RandomForestRegressor(n_estimators=100, random_state=42)
# model_rf_pca.fit(X_train_pca, y_train_rf)

# # Predict with PCA transformed features using Random Forest
# y_pred_rf_pca = model_rf_pca.predict(X_test_pca_transformed)

# # Evaluate Random Forest with PCA transformed features
# rmse_rf_pca = np.sqrt(mean_squared_error(true_rul_rf, y_pred_rf_pca))
# r2_rf_pca = r2_score(true_rul_rf, y_pred_rf_pca)
# print(f'Random Forest with PCA transformed features - RMSE: {rmse_rf_pca}')
# print(f'Random Forest with PCA transformed features - R² Score: {r2_rf_pca}')

In [43]:
# # Train SVR with PCA transformed features
# model_svr_pca = SVR(kernel='rbf', C=1.0, epsilon=0.2)
# model_svr_pca.fit(X_train_pca, y_train_rf)

# # Predict with PCA transformed features using SVR
# y_pred_svr_pca = model_svr_pca.predict(X_test_pca_transformed)

# # Evaluate SVR with PCA transformed features
# rmse_svr_pca = np.sqrt(mean_squared_error(true_rul_rf, y_pred_svr_pca))
# r2_svr_pca = r2_score(true_rul_rf, y_pred_svr_pca)
# print(f'SVR with PCA transformed features - RMSE: {rmse_svr_pca}')
# print(f'SVR with PCA transformed features - R² Score: {r2_svr_pca}')


# RANDOM FOREST FEATURE SELECTION

In [44]:
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor
# import numpy as np

# # Train the Random Forest Regressor model
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train_rf, y_train_rf)

# # Get feature importances
# importances = rf_model.feature_importances_

# # Create a DataFrame for better visualization
# feature_importances = pd.DataFrame({
#     'feature': features,
#     'importance': importances
# })

# # Sort the features by their importance scores
# feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# print(feature_importances)

In [45]:
# # Select the top 10 most important features
# top_n = 10
# selected_features = feature_importances.head(top_n)['feature'].values

# print("Selected features:", selected_features)

In [46]:
# # Select features from the training and test sets
# X_train_selected = X_train_rf[selected_features]
# X_test_selected = X_test_last_cycles_rf[selected_features]

# # Linear Regression with selected features
# model_lr_selected = LinearRegression()
# model_lr_selected.fit(X_train_selected, y_train_rf)

# y_pred_last_cycles_lr_selected = model_lr_selected.predict(X_test_selected)

# mse_lr_selected = mean_squared_error(true_rul_lr, y_pred_last_cycles_lr_selected)
# rmse_lr_selected = np.sqrt(mse_lr_selected)
# r2_lr_selected = r2_score(true_rul_lr, y_pred_last_cycles_lr_selected)

# print(f'\nLinear Regression with selected features - RMSE: {rmse_lr_selected}')
# print(f'\nLinear Regression with selected features - R² Score: {r2_lr_selected}')

# # Random Forest with selected features
# rf_model_selected = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model_selected.fit(X_train_selected, y_train_rf)

# y_pred_last_cycles_rf_selected = rf_model_selected.predict(X_test_selected)

# mse_rf_selected = mean_squared_error(true_rul_rf, y_pred_last_cycles_rf_selected)
# rmse_rf_selected = np.sqrt(mse_rf_selected)
# r2_rf_selected = r2_score(true_rul_rf, y_pred_last_cycles_rf_selected)

# print(f'\nRandom Forest with selected features - RMSE: {rmse_rf_selected}')
# print(f'\nRandom Forest with selected features - R² Score: {r2_rf_selected}')

# # SVR with selected features
# svr_model_selected = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
# svr_model_selected.fit(X_train_selected, y_train_rf)

# y_pred_last_cycles_svr_selected = svr_model_selected.predict(X_test_selected)

# mse_svr_selected = mean_squared_error(true_rul_svr, y_pred_last_cycles_svr_selected)
# rmse_svr_selected = np.sqrt(mse_svr_selected)
# r2_svr_selected = r2_score(true_rul_svr, y_pred_last_cycles_svr_selected)

# print(f'\nSVR with selected features - RMSE: {rmse_svr_selected}')
# print(f'\nSVR with selected features - R² Score: {r2_svr_selected}')

# FEATURE ELIMINATION USING RFECV (BACKWARD ELIMINATION WITH CROSS-VALIDATION)

In [47]:
# from sklearn.feature_selection import RFECV
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import cross_val_score

# # Select features and target
# features = train_data_cleaned.columns[2:-1]
# X_train = train_data_cleaned[features]
# y_train = train_data_cleaned['RUL']

# # Define the model
# model = LinearRegression()

# # Define the RFECV (Recursive Feature Elimination with Cross-Validation) with the model
# rfecv = RFECV(estimator=model, step=1, cv=5, scoring='neg_mean_squared_error')

# # Fit the RFECV
# rfecv.fit(X_train, y_train)

# # Get the optimal number of features
# optimal_num_features = rfecv.n_features_

# # Get the ranking of features
# ranking = rfecv.ranking_

# # Get the support (selected features)
# support = rfecv.support_

# # Get the selected features
# selected_features = features[support]

# print(f'Optimal number of features: {optimal_num_features}')
# print(f'Selected features: {selected_features.tolist()}')

# # Train the model with selected features
# X_train_selected = X_train[selected_features]
# model.fit(X_train_selected, y_train)

# # Extract the last cycle for each engine in the test set with selected features
# X_test_selected = X_test_last_cycles_lr[selected_features]

# # Make predictions for the last cycles of each engine
# y_pred_last_cycles = model.predict(X_test_selected)

# # Calculate RMSE and R² Score
# mse = mean_squared_error(true_rul_lr, y_pred_last_cycles)
# rmse = np.sqrt(mse)
# r2 = r2_score(true_rul_lr, y_pred_last_cycles)

# print(f'Linear Regression with selected features - Root Mean Squared Error (RMSE): {rmse}')
# print(f'Linear Regression with selected features - R² Score: {r2}')


In [48]:
# from sklearn.feature_selection import RFECV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error, r2_score

# # Select features and target
# features = train_data_cleaned.columns[2:-1]
# X_train = train_data_cleaned[features]
# y_train = train_data_cleaned['RUL']

# # Define the model
# model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

# # Define the RFECV (Recursive Feature Elimination with Cross-Validation) with the model
# rfecv_rf = RFECV(estimator=model_rf, step=1, cv=5, scoring='neg_mean_squared_error')

# # Fit the RFECV
# rfecv_rf.fit(X_train, y_train)

# # Get the optimal number of features
# optimal_num_features_rf = rfecv_rf.n_features_

# # Get the ranking of features
# ranking_rf = rfecv_rf.ranking_

# # Get the support (selected features)
# support_rf = rfecv_rf.support_

# # Get the selected features
# selected_features_rf = features[support_rf]

# print(f'Optimal number of features: {optimal_num_features_rf}')
# print(f'Selected features: {selected_features_rf.tolist()}')

# # Train the model with selected features
# X_train_selected_rf = X_train[selected_features_rf]
# model_rf.fit(X_train_selected_rf, y_train)

# # Extract the last cycle for each engine in the test set with selected features
# X_test_selected_rf = X_test_last_cycles_rf[selected_features_rf]

# # Make predictions for the last cycles of each engine
# y_pred_last_cycles_rf = model_rf.predict(X_test_selected_rf)

# # Calculate RMSE and R² Score
# mse_rf = mean_squared_error(true_rul_rf, y_pred_last_cycles_rf)
# rmse_rf = np.sqrt(mse_rf)
# r2_rf = r2_score(true_rul_rf, y_pred_last_cycles_rf)

# print(f'Random Forest with selected features - Root Mean Squared Error (RMSE): {rmse_rf}')
# print(f'Random Forest with selected features - R² Score: {r2_rf}')
