In [None]:
import pandas as pd

# Load the run table from the CSV file (path is relative to the notebook's
# working directory).
df = pd.read_csv(filepath_or_buffer="runtime_dataset_final.csv")

# Bare trailing expression: the notebook renders the first five rows.
df.head(n=5)


Unnamed: 0,run_id,batch_id,workload_type,workload_name,workload_complexity,cpu_cores,memory_total_gb,cpu_avg_pct,effective_cpu,memory_avg_gb,memory_pressure,disk_read_mb,disk_write_mb,io_intensity,disk_type,disk_speed_class,runtime_sec
0,0,batch_1,ML,ml_resnet,5,2,4,95.0,1.9,2.02,0.505,43.57,135.91,179.48,NVMe,3,179.06
1,1,batch_1,ML,ml_resnet,5,2,4,95.0,1.9,2.48,0.62,26.85,47.17,74.02,NVMe,3,176.76
2,2,batch_1,ML,ml_resnet,5,2,4,95.0,1.9,2.92,0.73,116.49,86.37,202.86,NVMe,3,152.71
3,3,batch_1,ML,ml_resnet,5,2,4,95.0,1.9,1.81,0.453,205.34,128.68,334.02,SSD,2,190.32
4,4,batch_1,ML,ml_resnet,5,2,4,95.0,1.9,1.96,0.49,107.38,71.03,178.41,SSD,2,199.42


In [None]:
# Regression target: the "runtime_sec" column of the loaded frame.
y = df.loc[:, "runtime_sec"]


In [None]:
# Columns used as-is (numeric). Order matters: it determines the column order
# of the feature matrix built from these lists.
numeric_features = [
    "cpu_cores", "memory_total_gb",
    "cpu_avg_pct", "effective_cpu",
    "memory_avg_gb",
    "disk_speed_class",
]

# String-valued columns that need encoding before they reach the model.
#   workload_type -> ML / DB / WEB
#   workload_name -> ml_bert, ml_resnet, tpch_q3, wrk_high, etc.
categorical_features = ["workload_type", "workload_name"]

# Feature matrix: numeric columns first, then the categorical ones.
X = df[[*numeric_features, *categorical_features]]


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing:
#   "num" - numeric columns are forwarded untouched ("passthrough").
#   "cat" - categorical columns are one-hot encoded; handle_unknown="ignore"
#           means a category not seen during fit does not raise at transform
#           time.
preprocessor = ColumnTransformer([
    ("num", "passthrough", numeric_features),
    (
        "cat",
        OneHotEncoder(handle_unknown="ignore"),
        categorical_features,
    ),
])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# End-to-end estimator: the column preprocessing step followed by a random
# forest regressor. Step names ("preprocess", "rf") are the keys under which
# the steps can be looked up on the pipeline.
model = Pipeline([
    ("preprocess", preprocessor),
    (
        "rf",
        RandomForestRegressor(
            n_estimators=400,      # number of trees
            random_state=42,       # fixed seed for the forest
            n_jobs=-1,             # -1: use all available processors
            # Tree-growth limits, spelled out explicitly:
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
        ),
    ),
])


In [None]:
# Fit the preprocessing step and the forest on the training split only.
model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the fitted pipeline on the held-out split.
y_pred = model.predict(X=X_test)

# Mean absolute error is in the target's unit (seconds); R² is unitless.
print("MAE (seconds):", mean_absolute_error(y_true=y_test, y_pred=y_pred))
print("R² Score:", r2_score(y_true=y_test, y_pred=y_pred))


MAE (seconds): 4.437878677817039
R² Score: 0.9844780303972568


In [None]:
# A single hypothetical run, expressed as a one-row frame carrying the same
# column names the pipeline was fitted on.
new_run = pd.DataFrame(data=[
    {
        # numeric inputs
        "cpu_cores": 4, "memory_total_gb": 8,
        "cpu_avg_pct": 80.0, "effective_cpu": 3.2,
        "memory_avg_gb": 6.0,
        "disk_speed_class": 3,
        # categorical inputs
        "workload_type": "ML", "workload_name": "ml_bert",
    },
])

# predict() returns one value per input row; there is exactly one row here.
predicted_runtime = model.predict(X=new_run)
print("Predicted runtime (seconds):", predicted_runtime[0])


Predicted runtime (seconds): 95.79692500000004
