According to notebook - https://www.kaggle.com/code/c1trus/azure-maint/notebook

# Load & Create dataset


In [1]:
import pandas as pd
import os


dataset_path = os.path.join("..","dataset","Azure Predictive Maintenance")
# Load files
telemetry_df = pd.read_csv(os.path.join(dataset_path,'PdM_telemetry.csv'))
errors_df = pd.read_csv(os.path.join(dataset_path,'PdM_errors.csv'))
maint_df = pd.read_csv(os.path.join(dataset_path,'PdM_maint.csv'))
failures_df = pd.read_csv(os.path.join(dataset_path,'PdM_failures.csv'))
machines_df = pd.read_csv(os.path.join(dataset_path,'PdM_machines.csv'))

In [3]:
#Convert datetime columns to datetime objects
telemetry_df['datetime'] = pd.to_datetime(telemetry_df['datetime'])
errors_df['datetime'] = pd.to_datetime(errors_df['datetime'])
failures_df['datetime'] = pd.to_datetime(failures_df['datetime'])
maint_df['datetime'] = pd.to_datetime(maint_df['datetime'])

In [5]:
df = telemetry_df

original_datetime = df['datetime']
original_machineID = df['machineID']

df_numeric = df.drop(columns=['datetime'])

# Create 24 hours rolling window
rolling_df = df_numeric.rolling(window=24, min_periods=1, closed='both').mean()


rolling_df['datetime'] = original_datetime
rolling_df['machineID'] = original_machineID

combined_df = pd.merge(rolling_df, maint_df, on=['datetime', 'machineID'], how='outer')
combined_df.sort_values(by=['machineID', 'datetime'], inplace=True)

In [6]:
def maint_comp(combined_df):
    for comp in ['comp1', 'comp2', 'comp3', 'comp4']:
        combined_df[f'time_since_last_{comp}'] = 0

    # 初始化上次维护时间的字典
    last_maint_time = {comp: {machine: None for machine in combined_df['machineID'].unique()} for comp in ['comp1', 'comp2', 'comp3', 'comp4']}

    for index, row in combined_df.iterrows():
        machine_id = row['machineID']
        current_time = row['datetime']
        current_comp = row['comp']

        for comp in ['comp1', 'comp2', 'comp3', 'comp4']:
            if last_maint_time[comp].get(machine_id):
                time_diff = (current_time - last_maint_time[comp][machine_id]).total_seconds() / (3600*24)
                combined_df.at[index, f'time_since_last_{comp}'] = time_diff

            if comp == current_comp:
                last_maint_time[comp][machine_id] = current_time

    combined_df = combined_df[combined_df['datetime'] >= '2015-01-01']

    combined_df = combined_df[combined_df['datetime'].dt.hour == 6]

    return combined_df

In [7]:
rolling_combined_df = maint_comp(combined_df)
rolling_combined_df.info()

  combined_df.at[index, f'time_since_last_{comp}'] = time_diff
  combined_df.at[index, f'time_since_last_{comp}'] = time_diff
  combined_df.at[index, f'time_since_last_{comp}'] = time_diff
  combined_df.at[index, f'time_since_last_{comp}'] = time_diff


<class 'pandas.core.frame.DataFrame'>
Index: 37323 entries, 400 to 877222
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   machineID              37323 non-null  int64         
 1   volt                   37323 non-null  float64       
 2   rotate                 37323 non-null  float64       
 3   pressure               37323 non-null  float64       
 4   vibration              37323 non-null  float64       
 5   datetime               37323 non-null  datetime64[ns]
 6   comp                   2886 non-null   object        
 7   time_since_last_comp1  37323 non-null  float64       
 8   time_since_last_comp2  37323 non-null  float64       
 9   time_since_last_comp3  37323 non-null  float64       
 10  time_since_last_comp4  37323 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int64(1), object(1)
memory usage: 3.4+ MB


One hot Encode error

In [8]:
all_dates = telemetry_df['datetime'].unique()
all_machines = telemetry_df['machineID'].unique()
all_combinations = pd.MultiIndex.from_product([all_dates, all_machines], names=['datetime', 'machineID']).to_frame(index=False)

one_hot_encoded_errors = pd.get_dummies(errors_df, columns=['errorID'])

daily_error_counts = one_hot_encoded_errors.groupby(['machineID', pd.Grouper(key='datetime', freq='D')]).sum().reset_index()

combined_errors = pd.merge(all_combinations, daily_error_counts, on=['datetime', 'machineID'], how='left').fillna(0)

combined_errors.set_index('datetime', inplace=True)

windowed_error_counts = combined_errors.groupby('machineID').rolling(window='3D').sum()

windowed_error_counts.reset_index(inplace=True)

error_with_ID = pd.merge(telemetry_df, windowed_error_counts, on=['machineID', 'datetime'], how='left')

error_with_ID = error_with_ID[error_with_ID['datetime'].dt.hour == 6]

error_with_ID = error_with_ID.drop(['volt','rotate','pressure','vibration'],axis = 1)
error_with_ID.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36600 entries, 0 to 876099
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   datetime        36600 non-null  datetime64[ns]
 1   machineID       36600 non-null  int64         
 2   errorID_error1  36600 non-null  float64       
 3   errorID_error2  36600 non-null  float64       
 4   errorID_error3  36600 non-null  float64       
 5   errorID_error4  36600 non-null  float64       
 6   errorID_error5  36600 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 2.2 MB


In [9]:
error_features_df = pd.merge(telemetry_df, errors_df, on=['datetime', 'machineID'], how='outer')

# 初始化错误时间列
def maint_error(error_features_df):

    error_features_df.drop(['volt','rotate','pressure','vibration'],axis = 1,inplace = True)

    error_types = ['error1', 'error2', 'error3', 'error4', 'error5']
    for error in error_types:
        error_features_df[f'time_since_last_{error}'] = 0

    # 初始化上次错误时间的字典
    last_error_time = {error: {machine: None for machine in error_features_df['machineID'].unique()} for error in error_types}

    # 遍历合并后的记录
    for index, row in error_features_df.iterrows():
        machine_id = row['machineID']
        current_time = row['datetime']
        current_error = row['errorID']

        # 更新所有错误类型的时间
        for error in error_types:
            # 如果存在上次错误时间，则计算时间差
            if last_error_time[error].get(machine_id):
                time_diff = (current_time - last_error_time[error][machine_id]).total_seconds() / (3600 * 24)
                error_features_df.at[index, f'time_since_last_{error}'] = time_diff

            # 对当前出现的错误更新上次错误时间
            if error == current_error:
                last_error_time[error][machine_id] = current_time

    error_features_df = error_features_df[error_features_df['datetime'].dt.hour == 6]

    return error_features_df

In [10]:
error_featured_df = maint_error(error_features_df)
error_featured_df.info()

  error_features_df.at[index, f'time_since_last_{error}'] = time_diff
  error_features_df.at[index, f'time_since_last_{error}'] = time_diff
  error_features_df.at[index, f'time_since_last_{error}'] = time_diff
  error_features_df.at[index, f'time_since_last_{error}'] = time_diff
  error_features_df.at[index, f'time_since_last_{error}'] = time_diff


<class 'pandas.core.frame.DataFrame'>
Index: 36901 entries, 0 to 876402
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   datetime                36901 non-null  datetime64[ns]
 1   machineID               36901 non-null  int64         
 2   errorID                 1122 non-null   object        
 3   time_since_last_error1  36901 non-null  float64       
 4   time_since_last_error2  36901 non-null  float64       
 5   time_since_last_error3  36901 non-null  float64       
 6   time_since_last_error4  36901 non-null  float64       
 7   time_since_last_error5  36901 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1), object(1)
memory usage: 2.5+ MB


Combined all Dataframes

In [11]:
combined_df = pd.merge(rolling_combined_df, error_featured_df, on=['datetime', 'machineID'], how='outer')
combined_df = pd.merge(combined_df, error_with_ID, on=['datetime', 'machineID'], how='outer')
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37624 entries, 0 to 37623
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   machineID               37624 non-null  int64         
 1   volt                    37624 non-null  float64       
 2   rotate                  37624 non-null  float64       
 3   pressure                37624 non-null  float64       
 4   vibration               37624 non-null  float64       
 5   datetime                37624 non-null  datetime64[ns]
 6   comp                    2886 non-null   object        
 7   time_since_last_comp1   37624 non-null  float64       
 8   time_since_last_comp2   37624 non-null  float64       
 9   time_since_last_comp3   37624 non-null  float64       
 10  time_since_last_comp4   37624 non-null  float64       
 11  errorID                 1125 non-null   object        
 12  time_since_last_error1  37624 non-null  float6

In [12]:
machine_model_df = pd.merge(errors_df, machines_df, on='machineID')

machine_model_df = machine_model_df.drop(['datetime','age','model'],axis=1)

machine_model_df = pd.get_dummies(machine_model_df, columns=['errorID'])

error_features = machine_model_df.groupby(['machineID']).sum().reset_index()

combined_df = pd.merge(combined_df,error_features, on=['machineID'], how='left')

In [13]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37624 entries, 0 to 37623
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   machineID               37624 non-null  int64         
 1   volt                    37624 non-null  float64       
 2   rotate                  37624 non-null  float64       
 3   pressure                37624 non-null  float64       
 4   vibration               37624 non-null  float64       
 5   datetime                37624 non-null  datetime64[ns]
 6   comp                    2886 non-null   object        
 7   time_since_last_comp1   37624 non-null  float64       
 8   time_since_last_comp2   37624 non-null  float64       
 9   time_since_last_comp3   37624 non-null  float64       
 10  time_since_last_comp4   37624 non-null  float64       
 11  errorID                 1125 non-null   object        
 12  time_since_last_error1  37624 non-null  float6

In [14]:
combined_df = pd.merge(combined_df, failures_df, on=['datetime', 'machineID'], how='left')


Label encode all categorical

In [15]:
from sklearn.preprocessing import LabelEncoder

category_mapping = {
    'comp1': 1, 'comp2': 2, 'comp3': 3, 'comp4': 4,
    'error1': 5, 'error2': 6, 'error3': 6, 'error4': 7, 'error5': 8,
    'NoProb': 0
}

# 将NaN替换为特定的标签
combined_df['failure'].fillna('NoProb', inplace=True)
combined_df['errorID'].fillna('NoProb', inplace=True)

combined_df['failure'] = combined_df['failure'].map(category_mapping)
combined_df['errorID'] = combined_df['errorID'].map(category_mapping)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['failure'].fillna('NoProb', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['errorID'].fillna('NoProb', inplace=True)


In [16]:
def merge_labels(row):
    # 获取'NoProb'对应的编码值
    no_prob_code = category_mapping['NoProb']

    # 使用映射后的值来判断
    if row['failure'] != no_prob_code:
        return row['failure']
    elif row['errorID'] != no_prob_code:
        return row['errorID']
    else:
        return no_prob_code  # 返回'NoProb'的编码值

combined_df['label'] = combined_df.apply(merge_labels, axis=1)

In [17]:
new_df = combined_df.drop(['failure','errorID'],axis = 1)
print(combined_df['failure'].unique(),combined_df['errorID'].unique())

[0 1 3 2 4] [0 5 7 6 8]


In [18]:
new_df.describe()


Unnamed: 0,machineID,volt,rotate,pressure,vibration,datetime,time_since_last_comp1,time_since_last_comp2,time_since_last_comp3,time_since_last_comp4,...,errorID_error2_x,errorID_error3_x,errorID_error4_x,errorID_error5_x,errorID_error1_y,errorID_error2_y,errorID_error3_y,errorID_error4_y,errorID_error5_y,label
count,37706.0,37706.0,37706.0,37706.0,37706.0,37706,37706.0,37706.0,37706.0,37706.0,...,37706.0,37706.0,37706.0,37706.0,37706.0,37706.0,37706.0,37706.0,37706.0,37706.0
mean,50.46295,170.863961,445.630498,100.938893,40.425556,2015-07-02 19:44:38.470269952,54.099507,52.3701,53.357927,54.403198,...,0.092293,0.080438,0.06198,0.032833,10.102212,9.886252,8.385138,7.273617,3.563226,0.255556
min,1.0,158.481822,278.093902,91.096429,36.232311,2015-01-01 06:00:00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,5.0,1.0,1.0,0.0,0.0
25%,25.0,168.117352,441.47674,98.706969,39.371864,2015-04-02 06:00:00,14.0,13.0,14.0,14.0,...,0.0,0.0,0.0,0.0,8.0,8.0,7.0,5.0,2.0,0.0
50%,50.0,170.221099,449.117712,100.097619,40.075267,2015-07-03 06:00:00,33.0,30.0,33.0,33.0,...,0.0,0.0,0.0,0.0,10.0,10.0,8.0,7.0,3.0,0.0
75%,75.0,172.431948,456.127018,101.580287,40.821494,2015-10-02 06:00:00,69.0,68.0,68.0,71.0,...,0.0,0.0,0.0,0.0,12.0,11.0,10.0,9.0,5.0,0.0
max,100.0,218.73838,489.699178,152.259183,60.852392,2016-01-01 06:00:00,492.0,349.0,371.0,395.0,...,3.0,3.0,3.0,2.0,20.0,17.0,16.0,18.0,12.0,8.0
std,28.880175,4.937475,20.197592,4.994207,2.170466,,62.626806,59.497942,59.003197,59.721663,...,0.301911,0.281933,0.248809,0.18042,3.299665,2.704141,3.006076,3.174232,2.48509,1.152495


In [19]:
from sklearn.preprocessing import LabelEncoder

models = ['model1','model2','model3','model4']
label_encoder_1 = LabelEncoder()
label_encoder_1.fit(models)

machines_df['model'] = label_encoder_1.fit_transform(machines_df['model'])

merged_df = pd.merge(new_df, machines_df, on='machineID', how='left')

In [20]:
from sklearn.utils import resample

def undersample_non_events(data, label_column, no_event_code, undersample_ratio=1.0):

    positive_class = data[data[label_column] != no_event_code]

    negative_class = data[data[label_column] == no_event_code]

    negative_class_downsampled = resample(negative_class,
                                          replace=False,  # 不进行放回抽样
                                          n_samples=int(len(positive_class) * undersample_ratio),
                                          random_state=42)

    downsampled_data = pd.concat([positive_class, negative_class_downsampled])

    return downsampled_data

no_event_code = 0
undersampled_df = undersample_non_events(merged_df, 'label', no_event_code, undersample_ratio=5)

In [21]:
undersampled_df.drop(['datetime'], axis=1, inplace=True)
undersampled_df.drop(['comp'], axis=1, inplace=True)
undersampled_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13344 entries, 23 to 31066
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   machineID               13344 non-null  int64  
 1   volt                    13344 non-null  float64
 2   rotate                  13344 non-null  float64
 3   pressure                13344 non-null  float64
 4   vibration               13344 non-null  float64
 5   time_since_last_comp1   13344 non-null  float64
 6   time_since_last_comp2   13344 non-null  float64
 7   time_since_last_comp3   13344 non-null  float64
 8   time_since_last_comp4   13344 non-null  float64
 9   time_since_last_error1  13344 non-null  float64
 10  time_since_last_error2  13344 non-null  float64
 11  time_since_last_error3  13344 non-null  float64
 12  time_since_last_error4  13344 non-null  float64
 13  time_since_last_error5  13344 non-null  float64
 14  errorID_error1_x        13344 non-null  fl

# Model Training

In [23]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE


X = undersampled_df.drop(['label',],axis = 1)
y = undersampled_df['label']

In [24]:
from sklearn.preprocessing import MultiLabelBinarizer

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

num_class = len(y.unique())

class_counts = {label: sum(y_train == label) for label in range(num_class)}
print(class_counts)

{0: 7789, 1: 179, 2: 270, 3: 122, 4: 188, 5: 155, 6: 403, 7: 105, 8: 129}


In [25]:
sampling_strategy = {label:3*sum(y_train == label) for label in range(1, 9)}

smote = SMOTE(sampling_strategy=sampling_strategy, random_state=30)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
class_counts = {label: sum(y_train_smote == label) for label in range(num_class)}
print(class_counts)

{0: 7789, 1: 537, 2: 810, 3: 366, 4: 564, 5: 465, 6: 1209, 7: 315, 8: 387}


In [26]:
model = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=num_class,
    max_depth = 4,
    n_estimators = 300,
    eta = 0.03,
)

model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)


[0]	validation_0-mlogloss:2.07137	validation_1-mlogloss:2.07204
[1]	validation_0-mlogloss:1.96126	validation_1-mlogloss:1.96250
[2]	validation_0-mlogloss:1.86286	validation_1-mlogloss:1.86461
[3]	validation_0-mlogloss:1.77440	validation_1-mlogloss:1.77662
[4]	validation_0-mlogloss:1.69389	validation_1-mlogloss:1.69645
[5]	validation_0-mlogloss:1.61985	validation_1-mlogloss:1.62278
[6]	validation_0-mlogloss:1.55175	validation_1-mlogloss:1.55501
[7]	validation_0-mlogloss:1.48841	validation_1-mlogloss:1.49201
[8]	validation_0-mlogloss:1.42954	validation_1-mlogloss:1.43352
[9]	validation_0-mlogloss:1.37433	validation_1-mlogloss:1.37851
[10]	validation_0-mlogloss:1.32264	validation_1-mlogloss:1.32714
[11]	validation_0-mlogloss:1.27387	validation_1-mlogloss:1.27860
[12]	validation_0-mlogloss:1.22799	validation_1-mlogloss:1.23290
[13]	validation_0-mlogloss:1.18445	validation_1-mlogloss:1.18950
[14]	validation_0-mlogloss:1.14318	validation_1-mlogloss:1.14846
[15]	validation_0-mlogloss:1.10410	

In [30]:
MODELS_PATH = os.path.join("..","store","models")
model.save_model(os.path.join(MODELS_PATH,"xgboost.json"))

# Evaluation

In [27]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3331
           1       0.92      0.86      0.89       103
           2       0.87      0.88      0.87       113
           3       0.91      0.91      0.91        65
           4       0.82      0.88      0.85        64
           5       0.94      0.73      0.82        64
           6       0.94      0.91      0.93       165
           7       0.92      0.79      0.85        43
           8       0.94      0.82      0.88        56

    accuracy                           0.98      4004
   macro avg       0.92      0.86      0.89      4004
weighted avg       0.98      0.98      0.97      4004



In [32]:
import numpy as np

y_pred_proba = model.predict_proba(X_test)

categories = ['NoProb', 'comp1', 'comp2', 'comp3', 'comp4', 'error1', 'error2', 'error3&4', 'error5']

for i in range(10):
    sample_probs = y_pred_proba[i]

    max_prob_index = np.argmax(sample_probs)

    predicted_category = categories[max_prob_index]

    max_prob = sample_probs[max_prob_index]

    print(f"sample{i} ：{predicted_category}，：{max_prob:.2f}")

sample0 ：NoProb，：1.00
sample1 ：NoProb，：1.00
sample2 ：NoProb，：1.00
sample3 ：NoProb，：0.98
sample4 ：NoProb，：0.89
sample5 ：NoProb，：1.00
sample6 ：NoProb，：0.96
sample7 ：NoProb，：1.00
sample8 ：NoProb，：1.00
sample9 ：NoProb，：1.00


In [34]:
from sklearn.metrics import accuracy_score
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



Accuracy: 97.55%


In [35]:
X_test.head()

Unnamed: 0,machineID,volt,rotate,pressure,vibration,time_since_last_comp1,time_since_last_comp2,time_since_last_comp3,time_since_last_comp4,time_since_last_error1,...,errorID_error3_x,errorID_error4_x,errorID_error5_x,errorID_error1_y,errorID_error2_y,errorID_error3_y,errorID_error4_y,errorID_error5_y,model,age
5587,33,164.490083,434.203971,101.690046,39.978717,9.0,178.0,9.0,24.0,25.0,...,0.0,0.0,0.0,9,10,7,4,7,2,14
9986,17,167.210122,452.690682,98.532404,40.550367,52.0,7.0,22.0,7.0,17.0,...,0.0,0.0,0.0,11,15,10,9,4,0,14
2542,68,173.705672,441.318502,101.880228,41.682257,8.0,43.0,58.0,8.0,7.666667,...,0.0,0.0,0.0,11,8,13,5,1,2,10
4556,30,166.926244,457.042173,124.905513,39.71375,228.0,6.0,198.0,6.0,0.833333,...,0.0,0.0,0.0,12,11,5,7,5,2,20
24840,7,170.21952,463.474487,101.377489,49.593206,113.0,8.0,38.0,113.0,18.75,...,0.0,1.0,0.0,8,13,11,4,3,2,20
