Machine Learning Classification Models for Multi-Product Pipeline Lubricant Oil Flush Categorization Into Pass or Failed Flush Scenarios

In [None]:
#Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential
from sklearn.model_selection import StratifiedKFold, KFold
from xgboost import XGBClassifier
import xgboost as xgb
from xgboost import cv
from xgboost import XGBClassifier, plot_importance

# Load the flush records from an Excel workbook.
# `link` holds the workbook location; it is blank here and has to be filled in
# before this step can run.
link = ""
data = pd.read_excel(io=link)

# Preview: column labels of the loaded frame.
data.columns

# Missing-value count per column.
data.isna().sum()

# Label encoding of the production line identifier.
#
# Codes: "DF" -> 1, "1" -> 2, "2" -> 3, "3" -> 4, "4" -> 5. Any value that is
# not one of these exact strings is left untouched.
#
# The mapping is applied to the whole column at once rather than row by row
# through `data.loc[i, ...]` with `i` in `range(len(...))`: positional counters
# only address the right rows when the index is exactly 0..n-1, and the
# per-cell assignment is slow on large sheets.
# NOTE(review): only string values are matched. If the sheet stores the line
# numbers as numeric cells they are not recoded -- confirm the column dtype.
prod_line_codes = {"DF": 1, "1": 2, "2": 3, "3": 4, "4": 5}
data['Prod. Line'] = data['Prod. Line'].map(
    lambda value: prod_line_codes.get(value, value)
)

# One-hot encode 'System Type', then discard the rows that have no value in
# the 'Source \nTank' column.
data = pd.get_dummies(data, columns=['System Type'], prefix='System Type').dropna(
    subset=['Source \nTank'], axis=0
)

# Data categorization into family types: keep only the rows whose
# 'Compatibility' text contains 'AE' (missing values count as no match).
has_ae_compatibility = data['Compatibility'].str.contains('AE', na=False)
data1 = data[has_ae_compatibility]
data1

# Feature matrix (X) and target (Y) used for training.
feature_columns = [
    'Documented \nFlush',
    'Flush KV40',
    'Flush KV100',
    'Avg Ambient Temp (degF) ',
    'Prod. Line',
    'System Type_2.0',
    'System Type_3.0',
]
X = (
    data1[feature_columns]
    .rename(columns={"Documented \nFlush": "Documented Flush", "Prod. Line": "Drum_Fill"})
    .astype({'Drum_Fill': 'int'})
)
Y = data1['Failed']

# Null count per feature column.
X.isna().sum()

# Data transformation: min-max scale every column except the last two to
# [0, 1]. The last two columns are the 'System Type_*' indicator columns and
# are carried over unscaled.
continuous_part = X.iloc[:, :-2]
sc = MinMaxScaler(feature_range=(0, 1))
scaled_data = sc.fit_transform(continuous_part)

# Keep X's own index on the scaled frame. X comes from a row-filtered frame,
# so its index is not 0..n-1; a fresh RangeIndex would misalign against the
# indicator columns and fill the combined frame with NaN.
data_to_scale = pd.DataFrame(
    scaled_data, columns=continuous_part.columns, index=X.index
)
frames = [data_to_scale, X.iloc[:, -2:]]
X_scaled = pd.concat(frames, axis=1)
# NOTE(review): the models in this script are fitted on the unscaled X, so
# X_scaled is only made available here. If it is meant to feed the models,
# fit the scaler on the training rows only -- it is currently fitted on all
# rows, before any train/test split.

# Hold out 20% of the rows for testing; the remainder is the training split
# for the Random Forest (RF).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
RF = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=42)

# 10-fold cross-validation on the training split, then the mean fold score.
clf_RF = cross_val_score(estimator=RF, X=X_train, y=y_train, cv=10)
np.mean(clf_RF)

# Stratified 10-fold cross-validation over the training split.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Permutation feature importance, one plot (and one PNG) per fold.
#
# The scorer is the negated mean squared error between predicted and true
# labels. It does not depend on the fold, so it is built once.
scorer = make_scorer(mean_squared_error, greater_is_better=False)

for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # A new forest for every fold, fitted on that fold's training rows only.
    RF = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=42)
    model = RF.fit(x_train_fold, y_train_fold)

    # Importance is measured on the fold's held-out rows.
    perm = permutation_importance(
        model, x_test_fold, y_test_fold, n_repeats=5, random_state=42, scoring=scorer
    )
    importances = perm.importances_mean
    idxx = np.argsort(importances)

    fold_ = fold + 1
    title = f"Feature Importance for Fold {fold_}"
    filename = f'/content/drive/MyDrive/fold_{fold_}.png'

    # Own figure per fold, so bars from earlier folds can never pile up on the
    # same axes when plt.show() does not clear them (non-interactive backends).
    plt.figure()
    plt.barh(range(len(idxx)), importances[idxx])
    # The bars are drawn in ascending-importance order, so the tick labels must
    # be reordered with the same permutation; using the columns in their
    # original order attaches the wrong feature name to each bar.
    plt.yticks(range(len(idxx)), x_test_fold.columns[idxx])
    plt.title(title)
    plt.savefig(filename, dpi=1400, bbox_inches="tight")
    plt.show()
    plt.close()


# Model predictions: refit the forest on the whole training split, then
# predict on both the training rows and the held-out test rows.
model = RF.fit(X_train, y_train)
Y1P_HH = model.predict(X_train)
Y2P_HH = model.predict(X_test)

# Print model metrics. Both values are computed with accuracy_score, so both
# are labelled as accuracy (the test value is not an R2 score).
print("Accuracy Score (training) = ", accuracy_score(y_train, Y1P_HH), "\t||  Accuracy Score (testing) = ", accuracy_score(y_test, Y2P_HH))

# Permutation importance of the fitted model on the held-out test rows,
# scored with the negated mean squared error between predicted and true labels.
scorer = make_scorer(mean_squared_error, greater_is_better=False)
perm = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=42, scoring=scorer)
importances = perm.importances_mean
idxx = np.argsort(importances)


# Feature importance plot: horizontal bars in ascending order of importance
# (smallest at the bottom).
# NOTE(review): a 10x8 inch figure at dpi=2000 is a 20000x16000 pixel canvas;
# lower the dpi if rendering is slow or runs out of memory.
plt.figure(figsize=(10,8), dpi=2000)
plt.barh(range(len(idxx)),importances[idxx])
plt.title("Random Forest: Feature Importance",fontsize=40)
plt.xlabel("Importance",fontsize=35)
plt.xticks(fontsize=35)
# The bars follow the argsort order, so the feature names are reordered with
# the same permutation; otherwise each bar carries another feature's label.
plt.yticks(range(len(idxx)), X_test.columns[idxx], fontsize=35)
#plt.savefig("c://downloads", dpi=1400, bbox_inches="tight")
plt.show()


#******************************************#
#XGBoost
#******************************************#
# Data matrix over all rows of X / Y, used by the native cross-validation.
data_dmatrix = xgb.DMatrix(data=X,label=Y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Hyperparameters, written once and shared by the scikit-learn wrapper and the
# native cross-validation so the two settings cannot drift apart.
# NOTE(review): both "alpha" (10) and "reg_alpha" (0) are set. XGBoost documents
# these as aliases of the same L1 regularisation term, so the values conflict;
# confirm which one is intended and drop the other.
params = {
    "alpha":10,
    "base_score":0.5,
    "booster":"gbtree",
    "colsample_bylevel":1,
    "colsample_bynode":1,
    "colsample_bytree":1,
    "gamma":0,
    "learning_rate":1.0,
    "max_delta_step":0,
    "max_depth":4,
    "min_child_weight":1,
    "n_estimators":100,
    "n_jobs":1,
    "objective":'binary:logistic',
    "random_state":42,
    "reg_alpha":0,
    "reg_lambda":1,
    "scale_pos_weight":1,
    "subsample":1,
    "verbosity":1
}

# Initialize the model with the shared hyperparameters and train it.
model = XGBClassifier(**params)
model.fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = model.predict(X_test)

# "n_estimators" belongs to the scikit-learn wrapper; the native API takes the
# number of boosting rounds from `num_boost_round`, so the key is left out of
# the dictionary handed to `cv`.
cv_params = {name: value for name, value in params.items() if name != "n_estimators"}

# 10-fold cross-validation, AUC metric, at most 50 rounds with early stopping
# after 10 rounds; the result is returned as a DataFrame.
xgb_cv = cv(dtrain=data_dmatrix, params=cv_params, nfold=10,
            num_boost_round=50, early_stopping_rounds=10, metrics="auc", as_pandas=True, seed=123)




# Test-set metrics for the XGBoost predictions.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


# Confusion matrix, drawn with scikit-learn's ConfusionMatrixDisplay (already
# imported at the top of the file) because `sns` is never imported in this
# file and the heatmap call would raise NameError.
conf_matrix = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(6, 4))
# confusion_matrix orders rows and columns by sorted label value.
# NOTE(review): assumes the 'Failed' target is coded 0 = passed, 1 = failed, so
# the first row/column is 'Pass' and the second is 'Fail' -- confirm the coding.
ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix, display_labels=['Pass', 'Fail']
).plot(ax=ax_cm, cmap='Blues', values_format='d')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
ax_cm.set_title('Confusion Matrix')
plt.show()


from xgboost import XGBClassifier, plot_importance

# Plot the ten most important features of the fitted XGBoost model.
# The axes are created here and passed in: without `ax`, plot_importance opens
# a figure of its own, which leaves a blank 10x6 figure behind and ignores the
# requested size.
fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
plot_importance(model, max_num_features=10, ax=ax_imp, title='Feature Importance')
plt.show()
