In [2]:
import pandas as pd

# Load the raw dataset into `df`; every later cell derives its features and labels from this frame.
df = pd.read_csv("../data/ai4i2020.csv")   # relative path: resolved against the kernel's working directory



In [4]:
# List the column names so the feature / identifier / label columns used below can be checked against the file.
print(df.columns)


Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF'],
      dtype='object')


In [16]:
# Row identifiers: excluded from the feature matrix.
id_cols = ["UDI", "Product ID"]
# The five failure-reason flags form the multi-label target.
targets = ["TWF", "HDF", "PWF", "OSF", "RNF"]

# Features = every column except the overall failure flag, the identifiers,
# and the failure-reason columns themselves.
X = df.drop(columns=["Machine failure", *id_cols, *targets])
y = df.loc[:, targets]


In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column groups routed to each preprocessing step.
categorical_features = ["Type"]
numeric_features = [
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]

# Standardise the numeric columns and one-hot encode the categorical one.
# Categories not seen during fit are ignored at transform time instead of raising.
preprocess = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (X and y come from the cell above).
#
# y contains only the failure-reason columns, so there is no "Machine failure"
# column in it to stratify on. Stratify instead on whether a row has *any*
# failure reason set, so the rare positive rows are spread proportionally
# across the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,        # 20% for testing
    random_state=42,      # fixed seed for a reproducible split
    stratify=y.any(axis=1),
)


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# Multi-label target: one binary column per failure reason.
targets = ["TWF", "HDF", "PWF", "OSF", "RNF"]

# Keep the label columns out of the feature frame explicitly. Leaving them in X
# would only be safe as long as the preprocessing step happens to discard
# columns it was not told about, which is an easy source of target leakage.
X = df.drop(columns=["Machine failure"] + id_cols + targets)
y = df[targets]   # multi-label

# Preprocessing + one random forest per target column.
# class_weight="balanced" re-weights classes inversely to their frequency.
clf = Pipeline([
    ("prep", preprocess),    # the ColumnTransformer defined earlier
    ("model", MultiOutputClassifier(
        RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)
    ))
])
# Fit on the train split produced by the preceding train_test_split cell.
clf.fit(X_train, y_train)

In [13]:
import joblib
# Persist the fitted pipeline `clf` to disk; the path is relative to the kernel's working directory.
joblib.dump(clf, "failure_reason_model.joblib")


['failure_reason_model.joblib']

In [20]:
import pandas as pd, joblib
# Reload the persisted pipeline and score one hand-built record as a smoke test.
# joblib.load unpickles the file, so only load model files you created yourself.
rmodel = joblib.load("failure_reason_model.joblib")

# NOTE(review): "Temp diff [K]" is not among the feature columns listed for the
# preprocessing step in this notebook; presumably it is ignored by the saved
# pipeline. Confirm against the model that was actually saved.
row = pd.DataFrame(
    [
        {
            "Type": "L",
            "Air temperature [K]": 280,
            "Process temperature [K]": 360,
            "Rotational speed [rpm]": 1900,
            "Torque [Nm]": 75,
            "Tool wear [min]": 280,
            "Temp diff [K]": 360 - 280,
        }
    ]
)
print(rmodel.predict(row))


[[0 0 0 0 0]]


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

# Re-split, stratifying on whether a row has any failure reason set, so the
# positive rows are spread proportionally across train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y.any(axis=1),
)

# Same preprocessing as before; one class-weighted random forest per target
# column, trained using all available cores (n_jobs=-1).
clf = Pipeline(
    steps=[
        ("prep", preprocess),  # the ColumnTransformer defined earlier
        (
            "model",
            MultiOutputClassifier(
                estimator=RandomForestClassifier(
                    n_estimators=300,
                    class_weight="balanced",
                    random_state=42,
                    n_jobs=-1,
                )
            ),
        ),
    ]
)
clf.fit(X_train, y_train)


In [22]:
# Spot-check: predicted failure-reason flags for the first five held-out rows.
print(clf.predict(X_test.iloc[:5]))


[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Larger forest; "balanced_subsample" recomputes the class weights on each
# tree's bootstrap sample rather than once on the full training set.
base = RandomForestClassifier(
    n_estimators=500,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1,
)

# Wrap the base forest so one copy is fitted per target column, after the
# shared preprocessing step.
clf = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", MultiOutputClassifier(estimator=base)),
    ]
)
clf.fit(X_train, y_train)


In [24]:
from sklearn.metrics import classification_report
# Score the held-out rows and print per-label precision / recall / F1.
# zero_division=0 reports 0 (without a warning) for labels that were never predicted.
pred = clf.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=pred,
        target_names=targets,
        zero_division=0,
    )
)


              precision    recall  f1-score   support

         TWF       0.00      0.00      0.00         7
         HDF       0.93      0.52      0.67        27
         PWF       0.85      0.79      0.81        14
         OSF       1.00      0.65      0.79        23
         RNF       0.00      0.00      0.00         2

   micro avg       0.93      0.55      0.69        73
   macro avg       0.56      0.39      0.45        73
weighted avg       0.82      0.55      0.65        73
 samples avg       0.02      0.02      0.02        73



In [25]:
# Spot-check the refitted model on the first five held-out rows.
print(clf.predict(X_test.head(5)))


[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]


In [26]:
import joblib
# Save the most recently fitted pipeline; this reuses the same file name as the earlier dump, so that file is overwritten.
joblib.dump(clf, "failure_reason_model.joblib")

['failure_reason_model.joblib']