In [2]:
import pandas as pd
import numpy as np
# Read the pre-cleaned equipment records from disk into the working DataFrame
df = pd.read_csv(filepath_or_buffer="cleaned_dataset.csv")



In [3]:



import numpy as np

# Fix the global NumPy seed so the noise drawn below is repeatable
np.random.seed(42)

equipment_age = df['age']

# Per-row noise standard deviation: 0.3 per unit of age, capped at 3
noise_scale = (0.3 * equipment_age).clip(upper=3)
noise = np.random.normal(loc=0, scale=noise_scale)

# Deterministic components of the synthetic RUL target
age_effect = 30 * np.exp(-0.1 * equipment_age)      # exponential decay in age
usage_effect = 0.0005 * df['production_hours']      # subtracted: more hours, lower RUL
maintenance_effect = 0.3 * df['replacement_parts']  # subtracted: more parts, lower RUL

# Combine the effects, floor at zero, then round to whole integers
raw_rul = age_effect - usage_effect - maintenance_effect + noise
df['RUL'] = raw_rul.clip(lower=0).round().astype(int)

df.head()


Unnamed: 0,plant,process,master_data_name,location,immobilization_number,serial_number,year,equipment_commercialized,pdr_commercialized,age,degradability,production_hours,hp_coefficient,replacement_parts,maintenance_mttr_mtbf,technology,aging_result,equipment_status,RUL
0,LTN3B,Cutting Machine Tube,Ulmer--SM 15/2P,coupe,immo648572,9063,2008,True,True,17,0.7,5107,1.0,0,0,0,0.52,Moyen,4
1,LTN3B,Cutting Machine Wire < 6mm2,Komax--Alpha 355,coupe,4141150502,2622,2015,False,True,10,0.4,5107,1.0,0,0,1,0.39,Bon,8
2,LTN3B,Cutting Machine Wire < 6mm2,Komax--Gamma 333,coupe,2141230701,1978,2011,False,True,14,0.4,5107,1.0,0,0,1,0.39,Bon,7
3,LTN3B,Cutting Machine Hot,HSGM--HSG-G2S,coupe,2141050303,205112,2005,True,True,20,0.7,5107,1.0,0,0,0,0.52,Moyen,6
4,LTN3B,Cutting Machine Hot,HSGM--HSG-G2S,coupe,2141200801,620275,2020,True,True,5,0.0,5107,1.0,0,0,0,0.1,Excellent,15


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# --- Column roles: the categorical inputs come first, then the remaining inputs ---
categorical_features = ['plant', 'process', 'master_data_name', 'location']
features = categorical_features + [
    'year', 'equipment_commercialized', 'pdr_commercialized',
    'age', 'production_hours', 'hp_coefficient',
    'replacement_parts', 'maintenance_mttr_mtbf',
]

# --- Discard unnecessary/calculated columns (absent ones are skipped) ---
drop_cols = [
    'immobilization_number', 'serial_number',
    'degradability', 'aging_result', 'equipment_status', 'technology',
]
df.drop(columns=drop_cols, errors='ignore', inplace=True)

# --- Keep only the model inputs plus the target; remove rows with missing values ---
df = df[[*features, 'RUL']].dropna()

# --- Integer-encode each categorical column and keep its fitted encoder ---
encoders = {}
for col in categorical_features:
    encoder = encoders[col] = LabelEncoder()  # stored so inference can reuse it
    df[col] = encoder.fit_transform(df[col])

# --- Hold out 20% of the rows as a test set ---
X, y = df[features], df['RUL']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# Base estimator handed to the search
model = xgb.XGBRegressor(random_state=42)

# Candidate values for each tuned hyperparameter
param_dist = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[3, 5, 6, 8, 10],
    learning_rate=[0.01, 0.03, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Randomized search: 30 sampled settings, 3-fold CV, scored by negated MAE
# ('r2' is an alternative scoring choice), run on all CPU cores.
random_search = RandomizedSearchCV(
    model,
    param_dist,
    n_iter=30,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# The score is a negated MAE, so flip the sign before reporting it
best_cv_mae = -random_search.best_score_
print("Best parameters found:", random_search.best_params_)
print("Best CV MAE:", best_cv_mae)

# Estimator configured with the winning parameters
best_model = random_search.best_estimator_


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best parameters found: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.03, 'colsample_bytree': 0.6}
Best CV MAE: 1.5223281780878704


In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Build the final regressor from the hyperparameters selected by the fitted
# `random_search` instead of a hand-copied set of values, so this cell cannot
# silently drift from the search result when the data or search space changes.
model = xgb.XGBRegressor(random_state=42, **random_search.best_params_)

# Train on the training split only
model.fit(X_train, y_train)

# Predict on both splits so train and test fit quality can be compared
y_test_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Test-set error metrics, plus R^2 on both splits as an overfitting check
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)

# The superscript two is written as an escape so the label survives any
# re-encoding of the notebook file.
print("Model Evaluation with best parameters:")
print(f" RMSE: {rmse:.2f}")
print(f"Test MAE: {mae:.2f}")
print(f"Test R\u00b2: {r2_test:.2f}")
print(f" R\u00b2 train Score: {r2_train:.2f}")


Model Evaluation with best parameters:
 RMSE: 2.04
Test MAE: 1.50
Test R²: 0.89
 R² train Score: 0.91


In [6]:
import pandas as pd
import numpy as np

# A single piece of equipment to score; keys must match the training feature names
sample_equipment = {
    'plant': 'LTN3B',
    'process': 'Cutting Machine Wire < 6mm2',
    'master_data_name': 'Komax--Alpha 355 S',
    'location': 'coupe',
    'year': 2015,
    'equipment_commercialized':0,
    'pdr_commercialized': 1,
    'age': 10,
    'production_hours':5107,
    'hp_coefficient': 1,
    'replacement_parts': 0,
    'maintenance_mttr_mtbf': 0
}

# One-row frame holding the sample
df_real = pd.DataFrame([sample_equipment])

# Encode every categorical column with the encoder fitted at training time.
# Iterating over `encoders` (instead of a re-typed column list) keeps this cell
# in sync with whatever columns were actually encoded during training.
for col, col_encoder in encoders.items():
    unseen = sorted(set(df_real[col]) - set(col_encoder.classes_))
    if unseen:
        # Same exception type LabelEncoder.transform would raise, but naming
        # the offending column so the bad input is easy to locate.
        raise ValueError(
            f"Column '{col}' contains value(s) not seen during training: {unseen}"
        )
    df_real[col] = col_encoder.transform(df_real[col])

# Take the column order straight from the training matrix so it cannot drift
# from what the model was fitted on.
predicted_rul = model.predict(df_real[X_train.columns])[0]

# The emoji is written as an escape so it survives re-encoding of the notebook file
print(f"\U0001F4CA Predicted Remaining Useful Life (RUL): {predicted_rul:.2f} years")


📊 Predicted Remaining Useful Life (RUL): 8.80 years
