In [120]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [121]:
df = pd.read_csv("ai4i2020.csv")
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [122]:
df = df.drop(columns=['UDI'])

In [123]:
df.columns = df.columns.str.replace(r"\[.*?\]", "", regex=True).str.strip()

In [124]:
df.columns

Index(['Product ID', 'Type', 'Air temperature', 'Process temperature',
       'Rotational speed', 'Torque', 'Tool wear', 'Machine failure', 'TWF',
       'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [125]:
df['Machine failure'].value_counts()

Machine failure
0    9661
1     339
Name: count, dtype: int64

In [126]:
failure_flags = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
print(df[failure_flags].sum())

TWF     46
HDF    115
PWF     95
OSF     98
RNF     19
dtype: int64


In [127]:
def build_failure_type(row):
    flags = [f for f in failure_flags if row.get(f, 0) == 1]
    if len(flags) == 0:
        return 'no_failure'
    if len(flags) == 1:
        return flags[0]
    return 'multiple'

df['failure_type'] = df.apply(build_failure_type, axis=1)
print(df['failure_type'].value_counts())

failure_type
no_failure    9652
HDF            106
PWF             80
OSF             78
TWF             42
multiple        24
RNF             18
Name: count, dtype: int64


## Model 1: Failure Prediction

In [128]:
features = ['Product ID', 'Type', 'Air temperature', 'Process temperature', 'Rotational speed', 'Torque', 'Tool wear']
X = df[features]

In [129]:
X

Unnamed: 0,Product ID,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear
0,M14860,M,298.1,308.6,1551,42.8,0
1,L47181,L,298.2,308.7,1408,46.3,3
2,L47182,L,298.1,308.5,1498,49.4,5
3,L47183,L,298.2,308.6,1433,39.5,7
4,L47184,L,298.2,308.7,1408,40.0,9
...,...,...,...,...,...,...,...
9995,M24855,M,298.8,308.4,1604,29.5,14
9996,H39410,H,298.9,308.4,1632,31.8,17
9997,M24857,M,299.0,308.6,1645,33.4,22
9998,H39412,H,299.0,308.7,1408,48.5,25


In [130]:
cat_cols = ['Product ID', 'Type']
encoder = LabelEncoder()
for col in cat_cols:
     X[col] = encoder.fit_transform(X[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = encoder.fit_transform(X[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = encoder.fit_transform(X[col])


In [131]:
num_cols = [c for c in X.columns if c not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[num_cols] = scaler.fit_transform(X[num_cols])


In [132]:
y = df['Machine failure']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [133]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print("Random Forest Precision:", precision_score(y_test, y_pred))
print("Random Forest Recall:", recall_score(y_test, y_pred))

Random Forest Precision: 0.8260869565217391
Random Forest Recall: 0.6229508196721312


In [134]:
gradient_boost = GradientBoostingClassifier()
gradient_boost.fit(X_train, y_train)
y_pred = gradient_boost.predict(X_test)
print("Gradient Boosting Precision:", precision_score(y_test, y_pred))
print("Gradient Boosting Recall:", recall_score(y_test, y_pred))

Gradient Boosting Precision: 0.7906976744186046
Gradient Boosting Recall: 0.5573770491803278


In [135]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
y_pred= logistic.predict(X_test)
print("Logistic Regression Precision:", precision_score(y_test, y_pred))
print("Logistic Regression Recall:", recall_score(y_test, y_pred))

Logistic Regression Precision: 0.6111111111111112
Logistic Regression Recall: 0.18032786885245902


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model 2: Failure Type Classification

In [136]:
y = df['failure_type']
l_encoder = LabelEncoder()
y = l_encoder.fit_transform(y)
y

array([6, 6, 6, ..., 6, 6, 6], shape=(10000,))

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [138]:
d_tree = DecisionTreeClassifier()
d_tree.fit(X_train, y_train)
y_pred_dt = d_tree.predict(X_test)
print("Decision Tree F1 Score:", f1_score(y_test, y_pred_dt, average='weighted'))

Decision Tree F1 Score: 0.9768543130206755


In [139]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM F1 Score:", f1_score(y_test, y_pred_svm, average='weighted'))

SVM F1 Score: 0.9478117048346056


In [140]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print("Naive Bayes F1 Score:", f1_score(y_test, y_pred_nb, average='weighted'))

Naive Bayes F1 Score: 0.9537763621958775


In [141]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
print("XGBoost F1 Score:", f1_score(y_test, y_pred_xgb, average='weighted'))

XGBoost F1 Score: 0.977184757654464


In [142]:
cm = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[  20    0    0    0    0    0    1]
 [   0    8    0    0    0    2    6]
 [   0    0   11    0    0    0    5]
 [   0    0    0    0    0    0    4]
 [   0    0    0    0    0    0    8]
 [   0    4    1    0    0    0    0]
 [   2    2    2    0    0    1 1923]]
