In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import losses
from sklearn.metrics import precision_score, recall_score, classification_report

import warnings
warnings.filterwarnings('ignore')

2024-02-17 08:44:43.067165: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-17 08:44:43.067295: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-17 08:44:43.212244: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
data_set = pd.read_csv('../input/machine-predictive-maintenance-classification/predictive_maintenance.csv')

In [3]:
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [4]:
# ワンホットエンコーディング
data = pd.get_dummies(data_set.loc[:, 'Type':'Target'], dtype='float64')

# スケール化
data.loc[:, 'Air temperature [K]':'Tool wear [min]'] = MinMaxScaler().fit_transform(data.loc[:, 'Air temperature [K]':'Tool wear [min]'])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Air temperature [K]      10000 non-null  float64
 1   Process temperature [K]  10000 non-null  float64
 2   Rotational speed [rpm]   10000 non-null  float64
 3   Torque [Nm]              10000 non-null  float64
 4   Tool wear [min]          10000 non-null  float64
 5   Target                   10000 non-null  int64  
 6   Type_H                   10000 non-null  float64
 7   Type_L                   10000 non-null  float64
 8   Type_M                   10000 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 703.2 KB


In [5]:
# Target==0/1データに分割
data_0 = data[data.Target == 0]
data_1 = data[data.Target == 1]

# 訓練/テストに分割
train, test = train_test_split(data_0, train_size=0.9, random_state=42)
test = pd.concat([test, data_1])

print('train: {}'.format(np.bincount(np.array(train['Target']))))
print('test:  {}'.format(np.bincount(np.array(test['Target']))))

train: [8694]
test:  [967 339]


In [6]:
# 訓練データを特徴量/ターゲットに分割
X_train = train.drop(['Target'], axis=1)
y_train = pd.DataFrame(train.Target)

# テストデータを特徴量/ターゲットに分割
X_test = test.drop(['Target'], axis=1)
y_test = pd.DataFrame(test.Target)

# オートエンコーダ 実装～予測(異常検知)

In [7]:
# モデル定義
inp = Input(shape=(X_train.shape[1],))
encoded = Dense(4, activation='relu')(inp)
decoded = Dense(8, activation='relu')(encoded)

In [8]:
# モデルコンパイル
autoencoder = Model(inp, decoded)
autoencoder.compile(optimizer='adam', loss=losses.MeanSquaredError()) # Optimizers:Adamアルゴリズム, 損失関数:平均二乗誤差

In [9]:
# モデル学習
autoencoder.fit(X_train, y_train, epochs=50, batch_size=256, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7b75375b8e80>

In [10]:
# 訓練データに対する処理
decoded_train = autoencoder.predict(X_train) # 訓練データ再構築
error_train = np.mean(np.abs(X_train - decoded_train), axis=1) # 構築前データと再構築データとの誤差に対し, 各データポイントごとに誤差平均を求める
threshold = np.mean(error_train) # しきい値をすべてのデータポイントの誤差平均の平均とおく



In [11]:
# テストデータに対する処理
decoded_test = autoencoder.predict(X_test) # テストデータ再構築
error_test = np.mean(np.abs(X_test - decoded_test), axis=1) # 構築前データと再構築データとの誤差に対し, 各データポイントごとに(すべての特徴量の)平均値を求める



In [12]:
# 異常検知
anomalies = error_test > threshold # 再構築したテストデータの誤差がスレッショルド以上なら異常
print('recall:     {:.3f}'.format(recall_score(np.array(y_test), np.array(anomalies))))
print('precision:  {:.3f}'.format(precision_score(np.array(y_test), np.array(anomalies))))

recall:     0.844
precision:  0.383


In [13]:
# Classification Report
print(classification_report(np.array(y_test), np.array(anomalies)))

              precision    recall  f1-score   support

           0       0.91      0.52      0.66       967
           1       0.38      0.84      0.53       339

    accuracy                           0.61      1306
   macro avg       0.64      0.68      0.59      1306
weighted avg       0.77      0.61      0.63      1306



In [14]:
"""
ランダムフォレスト(重み付け)

              precision    recall  f1-score   support

           0       0.99      0.85      0.92      2907
           1       0.15      0.81      0.25        93

    accuracy                           0.85      3000
   macro avg       0.57      0.83      0.59      3000
weighted avg       0.97      0.85      0.90      3000
"""

'\nランダムフォレスト(重み付け)\n\n              precision    recall  f1-score   support\n\n           0       0.99      0.85      0.92      2907\n           1       0.15      0.81      0.25        93\n\n    accuracy                           0.85      3000\n   macro avg       0.57      0.83      0.59      3000\nweighted avg       0.97      0.85      0.90      3000\n'