Optimizing Equipment Performance through Predictive Maintenance Strategies

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score, f1_score

In [2]:
# Silence every warning category for the rest of the session
import warnings
warnings.simplefilter('ignore')

In [3]:
# Location of the machine-failure CSV in the project's GitHub repository.
# NOTE(review): the last recorded run of this cell ended with
# "HTTP Error 404: Not Found" - confirm the file still exists at this path.
url = 'https://raw.githubusercontent.com/kueyram/Predictive-Maintenance/main/Data/machine_failure.csv'

# Load the CSV straight from the URL (a comma is read_csv's default delimiter)
pred_maintenance_df = pd.read_csv(url)

# Show the column labels that were loaded
pred_maintenance_df.columns

HTTPError: HTTP Error 404: Not Found

In [None]:
# Let's inspect the dataset and check its structure and contents:
# info() lists every column with its dtype and non-null count
pred_maintenance_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
pred_maintenance_df.describe()

In [None]:
# Displaying 5 randomly sampled rows from the dataset
pred_maintenance_df.sample(n=5)

In [None]:
# Look for rows that repeat an earlier row in every column
duplicates = pred_maintenance_df.loc[pred_maintenance_df.duplicated()]

# Report the outcome, listing the repeated rows when there are any
if not duplicates.empty:
    print("Duplicates found:")
    print(duplicates)
else:
    print("No duplicates found.")

In [None]:
# Flag each column that holds at least one missing value
missing_values = pred_maintenance_df.isna().any()

# Names of the flagged columns (empty list when the data is complete)
columns_with_nan = missing_values[missing_values].index.tolist()

if columns_with_nan:
    print("NaN values found in the following columns:")
    print(columns_with_nan)
else:
    print("No missing values found in the Dataset.")

Creating visualizations

In [None]:
# Let's extract the columns we need for the chart
air_temperature = pred_maintenance_df['Air temperature [K]']
process_temperature = pred_maintenance_df['Process temperature [K]']

# Scatter chart: the rows are not sorted by temperature, so joining consecutive
# points with a line would draw a meaningless zigzag. linestyle='none' keeps the
# markers only, and a low alpha keeps overlapping points readable.
plt.plot(air_temperature, process_temperature, marker='o', linestyle='none', alpha=0.3)
plt.xlabel('Air Temperature (K)')
plt.ylabel('Process Temperature (K)')
plt.title('Air Temperature vs. Process Temperature')
plt.grid(True)
plt.show()

The chart shows a positive correlation between the air temperature and the process temperature. This suggests that environmental conditions affect the operating conditions of the equipment.

In [None]:
# Histogram (30 bins, with a KDE curve) of the process temperature
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(
    pred_maintenance_df['Process temperature [K]'],
    bins=30,
    kde=True,
    color='orange',
    ax=ax,
)
ax.set_xlabel('Process Temperature [K]')
ax.set_ylabel('Frequency')
ax.set_title('Process Temperature')
fig.tight_layout()
plt.show()

The process temperature is approximately normally distributed.

In [None]:
# Removing columns not needed
# To predict maintenance, we do not need Product ID. Therefore, we are dropping the column.

# Reassign the result instead of dropping in place, and tolerate an already-missing
# column, so that re-running this cell does not raise a KeyError.
pred_maintenance_df = pred_maintenance_df.drop(columns=['Product ID'], errors='ignore')

# Print list of columns
pred_maintenance_df.columns

Feature transformation: We will convert Air temperature and Process temperature from Kelvin to Fahrenheit

In [None]:
# Kelvin -> Fahrenheit conversion helper
def kelvin_to_f(temperature_kelvin):
    """Convert a temperature from Kelvin to degrees Fahrenheit.

    Args:
        temperature_kelvin: Temperature in Kelvin.

    Returns:
        The same temperature expressed in degrees Fahrenheit.
    """
    celsius = temperature_kelvin - 273.15
    return (celsius * 9 / 5) + 32

# Each Kelvin column and the Fahrenheit column that replaces it
kelvin_to_fahrenheit_columns = {
    'Air temperature [K]': 'Air temperature [F]',
    'Process temperature [K]': 'Process temperature [F]',
}

# Create the Fahrenheit columns by converting every value of the Kelvin ones
for kelvin_column, fahrenheit_column in kelvin_to_fahrenheit_columns.items():
    pred_maintenance_df[fahrenheit_column] = pred_maintenance_df[kelvin_column].apply(kelvin_to_f)

# The Kelvin columns are no longer needed once converted
pred_maintenance_df.drop(columns=list(kelvin_to_fahrenheit_columns), inplace=True)

# Display the first 5 rows of the dataset
pred_maintenance_df.head(5)

Scaling the numerical columns using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler for the numeric measurements
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# made later in the notebook - consider fitting it on the training rows only.
scaler = MinMaxScaler()

# Columns to scale
numeric_columns = [
    'Process temperature [F]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Air temperature [F]',
]

# Fit the scaler and rescale the columns in a single step
pred_maintenance_df[numeric_columns] = scaler.fit_transform(pred_maintenance_df[numeric_columns])

# Display the first 5 rows of the dataset
pred_maintenance_df.head(5)

Creating new features:
- Power consumption: Energy used by equipment (rotational speed * torque)
- Temperature difference: Temperature variation between air temperature and process temperature
- Temperature ratio: Relationship between air temperature and process temperature   

In [None]:
# NOTE(review): the input columns were min-max scaled in an earlier cell, so the
# features below combine scaled values rather than physical units.

# Power consumption
pred_maintenance_df['Power consumption [W]'] = pred_maintenance_df['Rotational speed [rpm]'] * pred_maintenance_df['Torque [Nm]']

# Difference between temperatures
pred_maintenance_df['Temperature difference [F]'] = pred_maintenance_df['Air temperature [F]'] - pred_maintenance_df['Process temperature [F]']

# Ratio of temperatures.
# The scaled process temperature is exactly 0 on its minimum row(s), so the division
# can produce NaN (0/0) or +/-inf (x/0). Infinite values are not reported by isnull()
# and are rejected by scikit-learn estimators, so they are mapped to NaN here and
# show up in the missing-value checks that follow.
temperature_ratio = pred_maintenance_df['Air temperature [F]'] / pred_maintenance_df['Process temperature [F]']
pred_maintenance_df['Temperature ratio'] = temperature_ratio.replace([np.inf, -np.inf], np.nan)

# Printing the first 5 rows of the dataset
pred_maintenance_df.head(5)

In [None]:
# Boolean mask of missing cells, then the number of missing cells in each column
missing_values = pred_maintenance_df.isna()
total_missing_per_column = missing_values.sum(axis=0)
print(total_missing_per_column)

The column Temperature ratio has 2 missing values.

In [None]:
# Mark the rows that contain at least one missing value
has_missing_value = pred_maintenance_df.isna().any(axis='columns')

# Keep only those rows and display them
rows_with_missing_values = pred_maintenance_df.loc[has_missing_value]
rows_with_missing_values

In [17]:
# Replacing the NaN values with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# pred_maintenance_df['Temperature ratio'] acts on an intermediate object (chained
# assignment), which leaves the DataFrame unchanged under pandas Copy-on-Write.
pred_maintenance_df['Temperature ratio'] = pred_maintenance_df['Temperature ratio'].fillna(0)

##### Creating dummy variables
The Type column is categorical and takes several distinct values. We will create dummy variables for it. This will help the model capture the relationship between the product type and potential failures.

In [18]:
# One indicator column per product Type value, each named with the 'Type' prefix
dummy_variables = pd.get_dummies(pred_maintenance_df['Type'], prefix='Type')

# Place the indicator columns next to the existing dataset columns
predictive_maintenance_with_dummies = pd.concat(
    [pred_maintenance_df, dummy_variables],
    axis='columns',
)

In [None]:
# Displaying the first 5 rows of the dataset that includes the Type dummy variables
predictive_maintenance_with_dummies.head(5)

In [None]:
# Same preview as the previous cell (duplicate display)
predictive_maintenance_with_dummies.head(5)

In [None]:
# Histogram (with KDE curve) of each key feature, laid out on a 2x3 grid
features = [
    'Process temperature [F]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Air temperature [F]',
]

plt.figure(figsize=(14, 10))
for position, feature in enumerate(features, start=1):
    axis = plt.subplot(2, 3, position)
    sns.histplot(pred_maintenance_df[feature], kde=True, bins=30, ax=axis)
    axis.set_title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

These graphs show that the key features are relatively normally distributed.

In [None]:
# Keep only the rows whose failure type is something other than "No Failure"
is_failure_row = pred_maintenance_df['Failure Type'].ne('No Failure')
filtered_df = pred_maintenance_df.loc[is_failure_row]

# Horizontal bar chart counting the rows of each remaining failure type
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y='Failure Type', data=filtered_df, palette='viridis', ax=ax)
ax.set_title('Distribution of failure types')
ax.set_xlabel('Count')
ax.set_ylabel('Failure Type')
plt.show()

The graph shows that power and heat dissipation failures are the main causes of equipment failure.

#### Creating the models
We are going to build a two-stage model. We will first predict whether or not a failure will occur and then, if a failure occurs, we will predict its type.
Let's add a new column called Failure. Its value will be 0 if the failure type is No Failure and 1 otherwise.

In [23]:
# Binary label: 0 when the failure type is 'No Failure', 1 for anything else
pred_maintenance_df['Failure'] = pred_maintenance_df['Failure Type'].ne('No Failure').astype('int64')

We want to be able to predict if a failure will occur and what type of failure. We will use a combination of Logistic Regression for binary classification (failure/non-failure) and Random Forest for multiclass classification (failure types). Logistic regression will help us with the probability of failure and the Random Forest will help with predicting the type of failures

In [24]:
# Let's drop the target columns before creating the dummy variables
pred_maintenance_df_dropped = pred_maintenance_df.drop(columns=['Failure Type', 'Failure'])

# Also drop the dataset's own 'Target' column when it is present, otherwise the
# label leaks into the features.
# NOTE(review): a later cell filters a feature named 'Target' out of the importance
# plot, which indicates this column exists; presumably it duplicates 'Failure' -
# confirm against the dataset.
pred_maintenance_df_dropped = pred_maintenance_df_dropped.drop(columns=['Target'], errors='ignore')

# Let's convert categorical variables to dummy variables
pred_maintenance_df_converted = pd.get_dummies(pred_maintenance_df_dropped)

In [None]:
# Column labels of the working dataset (target columns still included here)
pred_maintenance_df.columns

In [26]:
# Binary target: the 0/1 'Failure' flag
target = pred_maintenance_df['Failure']

# Model inputs: the dummy-encoded frame built without the target columns
features = pred_maintenance_df_converted

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

Logistic Regression

In [None]:
# Create the logistic regression model (up to 1000 solver iterations) and fit it
# on the training split in one expression
logreg_model = LogisticRegression(max_iter=1000).fit(features_train, target_train)

# Predict the held-out rows
logreg_predictions = logreg_model.predict(features_test)

# Per-class precision / recall / F1 report
print("Logistic Regression")
logreg_report = classification_report(target_test, logreg_predictions)
print(logreg_report)

In [None]:
def _weighted(metric):
    """Apply a scikit-learn metric to the logistic regression predictions with weighted averaging."""
    return metric(target_test, logreg_predictions, average='weighted')


# Share of test rows predicted correctly
accuracy = accuracy_score(target_test, logreg_predictions)
print("Accuracy:", accuracy)

# Weighted-average precision
precision = _weighted(precision_score)
print("Precision:", precision)

# Weighted-average recall
recall = _weighted(recall_score)
print("Recall:", recall)

# Weighted-average F1
f1 = _weighted(f1_score)
print("F1-Score:", f1)

Multiclass Classification

In [34]:
# Multiclass target: the failure type label of every row
# NOTE(review): 'No Failure' rows are included here, so this model is trained on all
# rows rather than only on failures - confirm this matches the intended two-stage design.
target_multiclass = pred_maintenance_df['Failure Type']

# Same 80/20 split settings as the binary model (fixed seed)
features_train_mc, features_test_mc, target_train_mc, target_test_mc = train_test_split(
    features,
    target_multiclass,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Let's train and evaluate a random forest model RFM
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=0).fit(
    features_train_mc,
    target_train_mc,
)

# Predict the held-out rows and print the per-class report
rf_predictions = rf_model.predict(features_test_mc)
print("Random Forest Model")
rf_report = classification_report(target_test_mc, rf_predictions)
print(rf_report)

In [36]:
# Importance score of each input column, as reported by the fitted forest
importances = rf_model.feature_importances_
feature_names = features.columns

# One row per feature, ordered from most to least important
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Leave any feature named 'Target' out of the chart
is_not_target = importance_df['Feature'].ne('Target')
importance_df_no_target = importance_df.loc[is_not_target]

# Horizontal bar chart of the remaining feature importances
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df_no_target, palette='viridis', ax=ax)
ax.set_title('Random Forest Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()


In [None]:
# Predict the held-out rows with the fitted random forest
rf_predictions = rf_model.predict(features_test_mc)


def _macro(metric):
    """Apply a scikit-learn metric to the random forest predictions with macro averaging."""
    return metric(target_test_mc, rf_predictions, average='macro')


# Share of test rows predicted correctly
rf_accuracy = accuracy_score(target_test_mc, rf_predictions)
print("Accuracy:", rf_accuracy)

# Macro-average precision
rf_precision = _macro(precision_score)
print("Precision:", rf_precision)

# Macro-average recall
rf_recall = _macro(recall_score)
print("Recall:", rf_recall)

# Macro-average F1
rf_f1 = _macro(f1_score)
print("F1-Score:", rf_f1)
