In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE
from matplotlib.ticker import FuncFormatter

In [24]:
# Load data
file_path = 'output.xlsx' # use your own path
data = pd.read_excel(file_path)
data['Time'] = data['Time'].dt.hour # Only retain the hourly data in the time data as features

In [3]:
# Due to the large number of fault free (F0) samples, only 10% of the samples labeled as' F0 'are retained
df_f0 = data[data['Fault'] == 'F0']
df_other = data[data['Fault'] != 'F0']
df_f0_sampled = df_f0.sample(frac=0.1, random_state=42)
df_balanced = pd.concat([df_f0_sampled, df_other])

In [4]:
#Remove categories with low sample size (not exceed 20)
category_counts = df_balanced['Fault'].value_counts() # Calculate the sample size for each category
valid_categories = category_counts[category_counts > 20].index # Filter out categories with a sample size greater than 20
filtered_df = df_balanced[df_balanced['Fault'].isin(valid_categories)] # Filter the original DataFrame and retain categories with a sample size greater than 20

In [5]:
df_balanced = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True) # Disrupt data
# Feature and label separation
X = df_balanced.drop(columns=['Fault'])
y = df_balanced['Fault']

In [6]:
# Evaluation indicators and parameter settings
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='macro', zero_division=0)
    recall = recall_score(y, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y, y_pred, average='macro', zero_division=0)
    return accuracy, precision, recall, f1

In [7]:
# K-fold cross validation training and evaluation
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB()
}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Set K-fold cross validation
results = {model: {'accuracy': [], 'precision': [], 'recall': [], 'f1': []} for model in models} # Store the evaluation results of each model

In [8]:
# Model Training and Validation
for model_name, model in models.items():
    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train) # Train model
        # validate model
        accuracy, precision, recall, f1 = evaluate_model(model, X_val, y_val)
        results[model_name]['accuracy'].append(accuracy)
        results[model_name]['precision'].append(precision)
        results[model_name]['recall'].append(recall)
        results[model_name]['f1'].append(f1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [9]:
# Calculate the average evaluation metrics for each model
avg_results = {model: {metric: np.mean(results[model][metric]) for metric in results[model]} for model in models}
avg_results_df = pd.DataFrame(avg_results)
# Output the DataFrame to an Excel file
avg_results_df.to_excel('average_result.xlsx', index=False)

In [10]:
# Create an ExcelWriter object to save multiple sheets
with pd.ExcelWriter('all_result.xlsx') as writer:
    # Traverse every sheet in the dictionary
    for sheet_name, data in results.items():
        # Convert internal dictionary to DataFrame
        results_df = pd.DataFrame(data)
        # Write the DataFrame to the corresponding sheet in Excel
        results_df.to_excel(writer, sheet_name=sheet_name, index=False)

In [11]:
# Draw a confusion matrix heatmap
with pd.ExcelWriter('confusion_matrix.xlsx', engine='openpyxl') as writer:
    for model_name, model in models.items():
        # Select the validation set for the last fold
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        cm = confusion_matrix(y_val, y_pred, labels=model.classes_)
        # Convert the confusion matrix into a DataFrame
        cm_df = pd.DataFrame(cm, index=model.classes_, columns=model.classes_) 
        # Save the confusion matrix of each model to the corresponding sheet, which is named the model name
        cm_df.to_excel(writer, sheet_name=model_name)
        
        print(f"Confusion matrix for {model_name} has been saved to 'confusion_matrix.xlsx' sheet.")   

Confusion matrix for Random Forest has been saved to 'confusion_matrix.xlsx' sheet.
Confusion matrix for SVM has been saved to 'confusion_matrix.xlsx' sheet.
Confusion matrix for KNN has been saved to 'confusion_matrix.xlsx' sheet.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion matrix for Logistic Regression has been saved to 'confusion_matrix.xlsx' sheet.
Confusion matrix for Decision Tree has been saved to 'confusion_matrix.xlsx' sheet.
Confusion matrix for Naive Bayes has been saved to 'confusion_matrix.xlsx' sheet.
