In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import warnings
warnings.filterwarnings("ignore")

# DATASET INITIALIZATION

In [4]:
import pandas as pd

# Load training and test data
train_data = pd.read_csv('/content/drive/MyDrive/train_FD002.txt', delim_whitespace=True, header=None)
test_data = pd.read_csv('/content/drive/MyDrive/test_FD002.txt', delim_whitespace=True, header=None)
rul_data = pd.read_csv('/content/drive/MyDrive/RUL_FD002.txt', delim_whitespace=True, header=None)

In [5]:
train_data.shape

(53759, 26)

In [6]:
column_names = ["engine_id" , "time_in_cycles" , "altitude" , "mach_no" , "throttle_angle" , "fan_inlet_temp" , "LPC_outlet_temp" , "HPC_outlet_temp" , "LPT_outlet_temp" , "fan_inlet_pressure" , "bypass_duct_pressure" , "HPC_outlet_pressure" , "fan_speed" , "core_speed" , "engine_pressure_ratio" , "HPC_outlet_static_pressure" , "fuel_ps30_ratio" , "corrected_fan_speed" , "corrected_core_speed" , "bypass_ratio" , "burner_fuel_air_ratio" , "bleed_enthalpy" , "demanded_fan_speed" , "demanded_corrected_fan_speed" , "HPT_coolant_bleed" , "LPT_coolant_bleed"]
train_data.columns = column_names
test_data.columns = column_names

In [7]:
train_data['RUL'] = train_data.groupby('engine_id')['time_in_cycles'].transform(max) - train_data['time_in_cycles']

# REMOVING OUTLIERS

In [8]:
from scipy import stats
import numpy as np

# Calculate Z-scores for sensor measurements and operational settings
z_scores = np.abs(stats.zscore(train_data.iloc[:, 2:-1]))

# Set a threshold for Z-score
threshold = 3

# Identify outliers
outliers = (z_scores > threshold).any(axis=1)

# Drop outliers
train_data_cleaned = train_data[~outliers]


In [9]:
# Calculate Z-scores for test data
z_scores_test = np.abs(stats.zscore(test_data.iloc[:, 2:]))

# Identify outliers in test data
outliers_test = (z_scores_test > threshold).any(axis=1)

# Drop outliers from test data
test_data_cleaned = test_data[~outliers_test]

# SCALING

In [10]:
from sklearn.preprocessing import StandardScaler

# Normalize the data
scaler = StandardScaler()
train_data_cleaned.iloc[:, 2:-1] = scaler.fit_transform(train_data_cleaned.iloc[:, 2:-1])
test_data.iloc[:, 2:] = scaler.transform(test_data.iloc[:, 2:])

# RANDOM FOREST

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Select features and target
features = train_data_cleaned.columns[2:-1]
X_train_rf = train_data_cleaned[features]
y_train_rf = train_data_cleaned['RUL']

# Train the Random Forest Regressor model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Extract the last cycle for each engine in the test set
last_cycle_indices_rf = test_data_cleaned.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_rf = test_data_cleaned.loc[last_cycle_indices_rf, features]

# Make predictions for the last cycles of each engine
y_pred_last_cycles_rf = rf_model.predict(X_test_last_cycles_rf)

# True RUL values from the provided RUL file
true_rul_rf = rul_data.values.flatten()

# Calculate RMSE and R² Score
mse_rf = mean_squared_error(true_rul_rf, y_pred_last_cycles_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(true_rul_rf, y_pred_last_cycles_rf)
mae_rf = mean_absolute_error(true_rul_rf, y_pred_last_cycles_rf)

print(f'Random Forest - Root Mean Squared Error (RMSE): {rmse_rf}')
print(f'Random Forest - R² Score: {r2_rf}')
print(f'Random Forest - Mean Absolute Error (MAE): {mae_rf}')

Random Forest - Root Mean Squared Error (RMSE): 87.56068274593548
Random Forest - R² Score: -1.650917168978507
Random Forest - Mean Absolute Error (MAE): 69.86169884169884


# HYPERPARAMETER TUNING

In [12]:
# from sklearn.model_selection import GridSearchCV
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# import numpy as np

# # Hyperparameter tuning for Random Forest
# param_grid_rf = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, 30],
#     'min_samples_split': [2, 5, 10]
# }
# grid_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=5, scoring='neg_mean_squared_error')
# grid_rf.fit(X_train_rf, y_train_rf)
# print("Best parameters for Random Forest:", grid_rf.best_params_)
# print("Best RMSE for Random Forest:", np.sqrt(-grid_rf.best_score_))


In [13]:
# print('completed')

WHATEVER BEST HYPERPARAMETERS ARE RETURNED IN THE ABOVE CELL, REPLACE THEM IN THE BELOW CELL AND THEN RUN THIS ONE

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Select features and target
features = train_data_cleaned.columns[2:-1]
X_train_rf = train_data_cleaned[features]
y_train_rf = train_data_cleaned['RUL']

# Train the Random Forest Regressor model with updated parameters
rf_model = RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_split=10, random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

# Predictions for training data
y_pred_train_rf = rf_model.predict(X_train_rf)

# Extract the last cycle for each engine in the test set
last_cycle_indices_rf = test_data.groupby('engine_id')['time_in_cycles'].idxmax()
X_test_last_cycles_rf = test_data.loc[last_cycle_indices_rf, features]

# Make predictions for the last cycles of each engine
y_pred_last_cycles_rf = rf_model.predict(X_test_last_cycles_rf)

# True RUL values from the provided RUL file
true_rul_rf = rul_data.values.flatten()

# Calculate RMSE and R² Score for test data
mse_test_rf = mean_squared_error(true_rul_rf, y_pred_last_cycles_rf)
rmse_test_rf = np.sqrt(mse_test_rf)
r2_test_rf = r2_score(true_rul_rf, y_pred_last_cycles_rf)
mae_test_rf = mean_absolute_error(true_rul_rf, y_pred_last_cycles_rf)

# Calculate RMSE and R² Score for train data
mse_train_rf = mean_squared_error(y_train_rf, y_pred_train_rf)
rmse_train_rf = np.sqrt(mse_train_rf)
r2_train_rf = r2_score(y_train_rf, y_pred_train_rf)
mae_train_rf = mean_absolute_error(y_train_rf, y_pred_train_rf)

print(f'Random Forest - Root Mean Squared Error (RMSE) on Test Data: {rmse_test_rf}')
print(f'Random Forest - R² Score on Test Data: {r2_test_rf}')
print(f'Random Forest - Mean Absolute Error (MAE) on Test Data: {mae_test_rf}')
print()
print(f'Random Forest - Root Mean Squared Error (RMSE) on Train Data: {rmse_train_rf}')
print(f'Random Forest - R² Score on Train Data: {r2_train_rf}')
print(f'Random Forest - Mean Absolute Error (MAE) on Train Data: {mae_train_rf}')

Random Forest - Root Mean Squared Error (RMSE) on Test Data: 30.883041318024155
Random Forest - R² Score on Test Data: 0.6702247909684644
Random Forest - Mean Absolute Error (MAE) on Test Data: 22.689182144064738

Random Forest - Root Mean Squared Error (RMSE) on Train Data: 28.38485396256991
Random Forest - R² Score on Train Data: 0.8316499876231283
Random Forest - Mean Absolute Error (MAE) on Train Data: 20.187268638252302
