In [1]:
import psycopg2
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE
import configparser
import warnings
warnings.filterwarnings('ignore')

In [2]:
def fetch_data():
    config = configparser.ConfigParser()
    config.read('db_config.ini')

    host = config['database']['host']
    port = config['database'].getint('port')
    user = config['database']['user']
    password = config['database']['password']
    database = config['database']['database']

    connection = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        database=database
    )

    df1 = pd.read_sql('SELECT * FROM turbofan_engine_data', con=connection)
    df2 = pd.read_sql('SELECT * FROM turbofan_rul_data', con=connection)

    connection.close()

    return df1, df2

In [3]:
df1, df2 = fetch_data()

In [4]:
def clean_data(df):
    # Handling missing values
    na_counts_before = df.isna().sum()
    if na_counts_before.sum() > 0:
        print(f"Rows with missing values before: {na_counts_before.sum()}")
        df = df.dropna(axis=0)
        na_counts_after = df.isna().sum()
        print(f"Rows with missing values after: {na_counts_after.sum()}")
    else:
        print("No missing values found. Proceeding with the original DataFrame.")

    # Handling duplicates
    duplicates_before = df[df.duplicated(keep='first')]
    if len(duplicates_before) > 0:
        print(f"Duplicates before: {len(duplicates_before)}")
        df = df.drop_duplicates(keep='first')
        duplicates_after = df[df.duplicated(keep='first')]
        print(f"Duplicates after: {len(duplicates_after)}")
    else:
        print("No duplicate rows found. Proceeding with the original DataFrame.")

    return df

In [5]:
df1 = clean_data(df1)
df2 = clean_data(df2)

No missing values found. Proceeding with the original DataFrame.
No duplicate rows found. Proceeding with the original DataFrame.
No missing values found. Proceeding with the original DataFrame.
No duplicate rows found. Proceeding with the original DataFrame.


In [6]:
train_df = df1[df1["source"] == 0].copy()
test_df = df1[df1["source"] == 1].copy()

# **Deriving RUL Column**

In [7]:
def add_rul_train(df):
    max_cycles_train = df.groupby('engine')['cycle'].max().reset_index()
    max_cycles_train.columns = ['engine', 'max_cycle_train']
    df = df.merge(max_cycles_train, on='engine', how='left')
    df['RUL'] = df['max_cycle_train'] - df['cycle']
    df.drop(['max_cycle_train'], axis=1, inplace=True)
    return df

In [8]:
def add_rul_test(df, external_rul_df):
    df = df.merge(external_rul_df[['engine', 'RUL']], on='engine', how='left')
    return df

In [9]:
train_df_with_rul = add_rul_train(train_df)
test_df_with_rul = add_rul_test(test_df, df2)

In [10]:
test_last_cycle = test_df_with_rul.groupby('engine')['cycle'].idxmax()
test_last_cycle_df = test_df_with_rul.loc[test_last_cycle]

# **Performing Recursive Feature Elimination (RFE)**

In [11]:
X = train_df_with_rul.drop(['RUL','index','engine','cycle','source'], axis=1)
y = train_df_with_rul['RUL']

lr = LinearRegression()
rfe = RFE(estimator=lr, n_features_to_select=15)
rfe = rfe.fit(X, y)

selected_features = X.columns[rfe.support_]
print(f"Selected features: {selected_features}")

Selected features: Index(['LPC_outlet_temperature (T24)', 'fan_inlet_pressure (P2)',
       'bypass_duct_pressure (P15)', 'HPC_outlet_static_pressure (Ps30)',
       'mach_number (mach)', 'throttle_resolver_angle (TRA)',
       'fan_inlet_temperature (T2)', 'engine_pressure_ratio (epr)',
       'corrected_fan_speed (NRf)', 'bypass_ratio (BPR)',
       'burner_fuel_air_ratio (farB)', 'demanded_fan_speed (Nf_dmd)',
       'demanded_corrected_fan_speed (PCNfR_dmd)', 'HPT_coolant_bleed (W31)',
       'LPT_coolant_bleed (W32)'],
      dtype='object')


In [12]:
X_train = train_df_with_rul[selected_features]
y_train = train_df_with_rul['RUL']

In [13]:
X_test = test_last_cycle_df[selected_features]
y_test = test_last_cycle_df['RUL']

In [14]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    rmse = int(np.sqrt(mean_squared_error(y, y_pred)))
    mae = int(mean_absolute_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    return rmse, mae, round(r2, 4)

# **Model Training**

In [16]:
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)

train_rmse, train_mae, train_r2 = evaluate_model(model_lr, X_train_scaled, y_train)
print(f'Train RMSE: {train_rmse}, Train MAE: {train_mae}, Train R²: {train_r2}')

test_rmse, test_mae, test_r2 = evaluate_model(model_lr, X_test_scaled, y_test)
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}, Test R²: {test_r2}')

Train RMSE: 46, Train MAE: 35, Train R²: 0.5517
Test RMSE: 33, Test MAE: 27, Test R²: 0.6062


In [17]:
model_svr = SVR()
model_svr.fit(X_train_scaled, y_train)

train_rmse, train_mae, train_r2 = evaluate_model(model_svr, X_train_scaled, y_train)
print(f'Train RMSE: {train_rmse}, Train MAE: {train_mae}, Train R²: {train_r2}')

test_rmse, test_mae, test_r2 = evaluate_model(model_svr, X_test_scaled, y_test)
print(f'Test RMSE: {test_rmse}, Test MAE: {test_mae}, Test R²: {test_r2}')

Train RMSE: 64, Train MAE: 51, Train R²: 0.1362
Test RMSE: 51, Test MAE: 42, Test R²: 0.0902


In [18]:
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train_scaled, y_train)

train_rmse_rf, train_mae_rf, train_r2_rf = evaluate_model(model_rf, X_train_scaled, y_train)
print(f'Train RMSE (RF): {train_rmse_rf}, Train MAE (RF): {train_mae_rf}, Train R² (RF): {train_r2_rf}')

test_rmse_rf, test_mae_rf, test_r2_rf = evaluate_model(model_rf, X_test_scaled, y_test)
print(f'Test RMSE (RF): {test_rmse_rf}, Test MAE (RF): {test_mae_rf}, Test R² (RF): {test_r2_rf}')


Train RMSE (RF): 17, Train MAE (RF): 12, Train R² (RF): 0.9387
Test RMSE (RF): 33, Test MAE (RF): 24, Test R² (RF): 0.6149


In [19]:
model_xgb = XGBRegressor(random_state=42)
model_xgb.fit(X_train_scaled, y_train)

train_rmse_xgb, train_mae_xgb, train_r2_xgb = evaluate_model(model_xgb, X_train_scaled, y_train)
print(f'Train RMSE (XGB): {train_rmse_xgb}, Train MAE (XGB): {train_mae_xgb}, Train R² (XGB): {train_r2_xgb}')

test_rmse_xgb, test_mae_xgb, test_r2_xgb = evaluate_model(model_xgb, X_test_scaled, y_test)
print(f'Test RMSE (XGB): {test_rmse_xgb}, Test MAE (XGB): {test_mae_xgb}, Test R² (XGB): {test_r2_xgb}')

Train RMSE (XGB): 40, Train MAE (XGB): 29, Train R² (XGB): 0.6622
Test RMSE (XGB): 31, Test MAE (XGB): 23, Test R² (XGB): 0.6608
