In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# =============================
# Read the raw runtime dataset
# =============================
df = pd.read_csv("runtime_dataset_final.csv")

# =============================
# Keep batch_id aside: it is dropped from the feature table
# below, but the train / test split is done per batch
# =============================
batch_id = df["batch_id"].copy()

# =============================
# Remove columns that are not used for modelling
# =============================
NON_FEATURE_COLUMNS = ["run_id", "batch_id", "workload_name", "disk_type"]
df = df.drop(columns=NON_FEATURE_COLUMNS)

# =============================
# One-hot encode workload_type (first level dropped)
# =============================
df = pd.get_dummies(df, columns=["workload_type"], drop_first=True)

# =============================
# Target (runtime_sec) vs. feature matrix
# =============================
y = df["runtime_sec"]
X = df.drop(columns=["runtime_sec"])

# =============================
# Working frame: features + target + batch_id, so that rows
# can be selected by batch membership
# =============================
data = X.assign(runtime_sec=y, batch_id=batch_id)

# =============================
# Split by batch: all rows of a batch land in the same set
# =============================
unique_batches = data["batch_id"].unique()
np.random.seed(42)  # fixed seed -> the same shuffle on every run
np.random.shuffle(unique_batches)

n = len(unique_batches)
train_cutoff = int(0.7 * n)
test_cutoff = int(0.85 * n)

# First 70% of the shuffled batches train the model, the last 15% test it.
# NOTE(review): batches between the two cutoffs belong to neither set --
# presumably reserved as a validation split; confirm.
train_batches = unique_batches[:train_cutoff]
test_batches = unique_batches[test_cutoff:]

in_train = data["batch_id"].isin(train_batches)
in_test = data["batch_id"].isin(test_batches)
train_df = data[in_train]
test_df = data[in_test]

# runtime_sec is the target and batch_id was only needed for the split,
# so neither may remain among the model inputs.
split_only_columns = ["runtime_sec", "batch_id"]

X_train = train_df.drop(columns=split_only_columns)
y_train = train_df["runtime_sec"]

X_test = test_df.drop(columns=split_only_columns)
y_test = test_df["runtime_sec"]

# =============================
# Fit the Random Forest one tree at a time so that tqdm can
# report training progress
# =============================
TOTAL_TREES = 300

# The forest starts with a single tree; warm_start=True lets every
# later fit() call keep the trees already built and grow only the
# newly requested ones.
forest_params = {
    "n_estimators": 1,
    "max_depth": 20,
    "min_samples_leaf": 5,
    "random_state": 42,
    "n_jobs": -1,
    "warm_start": True,
}
rf = RandomForestRegressor(**forest_params)

progress = tqdm(range(1, TOTAL_TREES + 1), desc="Training Random Forest")
for i in progress:
    # Raise the target forest size by one, then fit to add that tree.
    rf.set_params(n_estimators=i)
    rf.fit(X_train, y_train)

# =============================
# Score the held-out test batches
# =============================
test_preds = rf.predict(X_test)

# =============================
# Regression metrics on the test set
# =============================
mae = mean_absolute_error(y_test, test_preds)
mse = mean_squared_error(y_test, test_preds)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, test_preds)
# "accuracy" is simply R2 expressed as a percentage, not a
# classification-style accuracy.
accuracy = r2 * 100

metric_report = [
    ("Test MAE      :", mae),
    ("Test RMSE     :", rmse),
    ("Test R2 Score :", r2),
    ("Accuracy (%)  :", accuracy),
]
for label, value in metric_report:
    print(label, value)

print(
    "\nInterpretation:",
    f"MAE  -> On average, prediction is off by {mae:.2f} seconds",
    "RMSE -> Penalizes large runtime prediction errors",
    "R2   -> Goodness of fit",
    f"Accuracy -> Model explains {accuracy:.2f}% of runtime variance",
    sep="\n",
)

# =============================
# Side-by-side look at the first 10 test rows
# =============================
actual_head = y_test.values[:10]
predicted_head = test_preds[:10]

comparison = pd.DataFrame({
    "Actual_Runtime_sec": actual_head,
    "Predicted_Runtime_sec": predicted_head,
    "Absolute_Error_sec": np.abs(actual_head - predicted_head),
})

print("\nSample Test Predictions:")
print(comparison)

# =============================
# Persist the fitted forest to disk
# =============================
model_path = "random_forest_runtime_predictor.pkl"
joblib.dump(rf, model_path)


Training Random Forest: 100%|██████████| 300/300 [00:11<00:00, 25.98it/s]


Test MAE      : 4.926990788022755
Test RMSE     : 7.910429008056317
Test R2 Score : 0.9846588694056813
Accuracy (%)  : 98.46588694056813

Interpretation:
MAE  -> On average, prediction is off by 4.93 seconds
RMSE -> Penalizes large runtime prediction errors
R2   -> Goodness of fit
Accuracy -> Model explains 98.47% of runtime variance

Sample Test Predictions:
   Actual_Runtime_sec  Predicted_Runtime_sec  Absolute_Error_sec
0              286.62             226.392695           60.227305
1              162.04             166.909068            4.869068
2              155.60             169.639761           14.039761
3              210.49             201.565549            8.924451
4              192.91             201.215996            8.305996
5              169.35             166.606396            2.743604
6              136.86             145.965185            9.105185
7              226.09             215.516966           10.573034
8              227.48             211.456998           16.023002

['random_forest_runtime_predictor.pkl']