In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib


In [9]:
# Read the raw dataset from the sibling data/ folder (path is relative to the
# kernel's working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/ai4i2020.csv")
df.head(n=5)


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [10]:
# Per-failure-mode flag columns; together they form the multi-label target.
targets = ["TWF", "HDF", "PWF", "OSF", "RNF"]
# Row identifiers, excluded from the model inputs.
id_cols = ["UDI", "Product ID"]

# Everything that must not reach the model as a feature: the aggregate
# "Machine failure" flag, the identifiers, and the target flags themselves.
non_feature_cols = ["Machine failure", *id_cols, *targets]

# Features: whatever remains once the columns above are removed.
X = df.drop(columns=non_feature_cols)
# Multi-label target: one column per failure flag.
y = df[targets]


In [11]:
# Stratification key: True for rows where at least one target flag is set,
# so the share of failing rows is kept similar in both partitions.
any_failure = y.any(axis=1)

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=any_failure
)

In [12]:
# Split the feature columns by kind: "Type" is the only categorical input,
# every other column of X goes through the numeric branch.
cat_cols = ["Type"]
num_cols = [col for col in X.columns if col not in cat_cols]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown="ignore" avoids errors on categories unseen in fit).
categorical_steps = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: median imputation followed by standardisation.
numeric_steps = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Route each column group through its own branch.
preprocess = ColumnTransformer(transformers=[
    ("cat", categorical_steps, cat_cols),
    ("num", numeric_steps, num_cols),
])

In [13]:
# Forest configuration shared by all target columns: 400 trees, per-bootstrap
# class re-weighting, fixed seed, and all available cores.
base_forest = RandomForestClassifier(
    n_estimators=400,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1,
)

# End-to-end model: preprocessing followed by the forest wrapped in
# MultiOutputClassifier so it can be trained against the multi-column target.
reason_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", MultiOutputClassifier(estimator=base_forest)),
])


In [14]:
# Fit the preprocessing steps and the classifier on the training partition only.
reason_pipe.fit(X=X_train, y=y_train)

In [15]:
# Predict every failure flag for the held-out rows.
y_pred = reason_pipe.predict(X_test)

# Per-flag precision/recall/F1; zero_division=0 reports 0 (instead of warning)
# for flags that have no predicted or no true positives.
report_text = classification_report(
    y_test, y_pred, target_names=targets, zero_division=0
)
print(report_text)

              precision    recall  f1-score   support

         TWF       0.00      0.00      0.00         7
         HDF       0.93      0.48      0.63        27
         PWF       0.85      0.79      0.81        14
         OSF       1.00      0.65      0.79        23
         RNF       0.00      0.00      0.00         2

   micro avg       0.93      0.53      0.68        73
   macro avg       0.55      0.38      0.45        73
weighted avg       0.82      0.53      0.64        73
 samples avg       0.02      0.02      0.02        73



In [16]:
# Persist the fitted pipeline (preprocessing + classifier) as a single file.
# NOTE(review): the file is written relative to the kernel's working directory,
# while the message below hard-codes "notebooks/" — confirm the notebook is
# always run from that folder.
joblib.dump(value=reason_pipe, filename="failure_reason_model.joblib")
print("Model saved to notebooks/failure_reason_model.joblib")


Model saved to notebooks/failure_reason_model.joblib


In [17]:
# First row where the machine failed and the HDF flag is set.
# Uses the `df` already loaded near the top of the notebook: re-importing
# pandas and re-reading the same CSV here was redundant and silently rebound
# `df` in the middle of the notebook.
hdf_failure_mask = (df["Machine failure"] == 1) & (df["HDF"] == 1)
sample = df.loc[hdf_failure_mask].head(1)
# Bare last expression: rendered as a rich table rather than wrapped print() text.
sample


       UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
3236  3237     M18096    M                300.8                    309.4   

      Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  \
3236                    1342         62.4              113                1   

      TWF  HDF  PWF  OSF  RNF  
3236    0    1    0    0    0  


In [18]:
# First row where the machine failed and the TWF flag is set.
# Uses the `df` already loaded near the top of the notebook: re-importing
# pandas and re-reading the same CSV here was redundant and silently rebound
# `df` in the middle of the notebook.
twf_failure_mask = (df["Machine failure"] == 1) & (df["TWF"] == 1)
sample = df.loc[twf_failure_mask].head(1)
# Bare last expression: rendered as a rich table rather than wrapped print() text.
sample

    UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
77   78     L47257    L                298.8                    308.9   

    Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  \
77                    1455         41.3              208                1   

    TWF  HDF  PWF  OSF  RNF  
77    1    0    0    0    0  
