In [1]:
import pandas as pd
import numpy as np
import datetime

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df = pd.read_pickle('/content/drive/MyDrive/Pump_degradation/pump_degradation.plk')
# df = df.iloc[:1000]

In [4]:
df.head()

Unnamed: 0,w,Q,Tt,Tr,To,A,rThrust,rRadial,wA,wThrust,...,duration,ImpellerWearFailure,ThrustBearingOverheat,RadialBearingOverheat,PumpOilOverheat,ImpellerWearFailure_after,ThrustBearingOverheat_after,RadialBearingOverheat_after,PumpOilOverheat_after,breaks_after_action
0,376.991118,0.0,290.0,290.0,290.0,12.7084,1e-06,2e-06,0.01,0.0,...,2520,1.0,1.0,1.0,1.0,0.906316,0.684101,0.709982,0.652803,0
1,174.411584,-0.055861,315.271959,313.201469,310.831825,12.407825,1e-06,2e-06,0.01,6.5753e-11,...,2520,0.906316,0.684101,0.709982,0.652803,0.986292,0.992862,0.989383,0.984322,0
2,174.411584,-0.055962,315.662622,313.804471,311.445886,12.367965,1e-06,2e-06,0.01,1.022807e-10,...,12060,0.986292,0.992862,0.989383,0.984322,0.891135,0.771285,0.76986,0.671001,0
3,174.411586,-0.056743,328.090408,326.737291,324.130133,12.055743,1e-06,2e-06,0.01,7.104586e-11,...,5040,0.891135,0.771285,0.76986,0.671001,0.850887,1.08898,1.057373,1.112297,0
4,174.41159,-0.057678,324.361281,324.255199,321.225016,11.674649,1e-06,2e-06,0.01,1.24375e-10,...,6300,0.850887,1.08898,1.057373,1.112297,0.896115,0.797745,0.807321,0.708454,0


In [5]:
X = df.drop(['ImpellerWearFailure_after', 'ThrustBearingOverheat_after',	'RadialBearingOverheat_after',	'PumpOilOverheat_after', 'breaks_after_action'], axis=1)
y = df['breaks_after_action'].astype('int64')

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# scaling necessary for distance based classifier
X_scaled = scaler.fit_transform(X)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Define the parameter grid to search over
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11,15,21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Create a KNN classifier object
knn = KNeighborsClassifier()

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}

# Create a GridSearchCV object and fit it to the data
grid_search = GridSearchCV(knn, param_grid=param_grid, cv=5, scoring=scoring, n_jobs=-1, refit='f1_score')
grid_search.fit(X_scaled, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_knn.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best F1 score:  0.6603874982789932
Standard deviation for F1 score:  0.017782325186709094
Accuracy score:  0.9557255447380623
Standard deviation for accuracy:  0.004159021393716568


In [None]:
from sklearn.svm import SVC

# Define the parameter grid to search over
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto']
}

# Create an SVM classifier object
svm = SVC()

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}

# Create a GridSearchCV object and fit it to the data
grid_search = GridSearchCV(svm, param_grid=param_grid, cv=5, scoring=scoring, n_jobs=-1, refit='f1_score')
grid_search.fit(X_scaled, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_svc.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best F1 score:  0.7891935348827197
Standard deviation for F1 score:  0.08312697921072984
Accuracy score:  0.9643141929929134
Standard deviation for accuracy:  0.005558684421042706


In [None]:
from sklearn.naive_bayes import GaussianNB

# Define the parameter grid to search over
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

# Create a Naive Bayes classifier object
bayes = GaussianNB()

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}

# Create a GridSearchCV object and fit it to the data
grid_search = GridSearchCV(bayes, param_grid=param_grid, cv=5, scoring=scoring, n_jobs=-1, refit='f1_score')
grid_search.fit(X, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_GNB.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'var_smoothing': 1e-09}
Best F1 score:  0.5513186237164476
Standard deviation for F1 score:  0.15234032954350638
Accuracy score:  0.6584383071726604
Standard deviation for accuracy:  0.22041624785813227


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid to search over
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20,30],
    'min_samples_split': [2, 5, 10,15],
    'min_samples_leaf': [1, 2, 4, 6, 10],
    'max_features': ['sqrt', 'log2', None]
}

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}


# Create a decision tree classifier object
dtc = DecisionTreeClassifier()

# Create a GridSearchCV object and fit it to the data
grid_search = GridSearchCV(dtc, param_grid=param_grid, cv=5, scoring=scoring, n_jobs=-1, refit='f1_score')
grid_search.fit(X, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_tree.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 6, 'min_samples_split': 2}
Best F1 score:  0.7876796734177663
Standard deviation for F1 score:  0.024072174730487977
Accuracy score:  0.9629213027352806
Standard deviation for accuracy:  0.0030487716812774516


In [None]:
from xgboost import XGBClassifier

# Define the parameter grid for the XGBoost classifier
param_grid = {
    'n_estimators': [25, 50, 100, 150, 1500],
    'max_depth': [1,  3,  5, 10, 20],
    'learning_rate': [0.1, 0.01, 0.001, 0.0001]
}

# Create the XGBoost classifier
xgb = XGBClassifier()

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}

# Perform grid search using cross-validation
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1, scoring=scoring, refit=False)
# grid_search.fit(X, y)

grid_search = GridSearchCV(xgb, param_grid=param_grid, cv=5, n_jobs=-1, scoring=scoring, refit='f1_score')
grid_search.fit(X, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_xgb.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 25}
Best F1 score:  0.795321654382881
Standard deviation for F1 score:  0.015709491244198643
Accuracy score:  0.9676721637194515
Standard deviation for accuracy:  0.0019820601423860724


In [None]:
from xgboost import XGBClassifier

# Define the parameter grid for the XGBoost classifier
param_grid = {
    'n_estimators': [2500, 5000],
    'max_depth': [1,  5, 20],
    'learning_rate': [0.1, 0.01, 0.001],#, 0.0001]
}

# Create the XGBoost classifier
xgb = XGBClassifier()

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}

# Perform grid search using cross-validation
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1, scoring=scoring, refit=False)
# grid_search.fit(X, y)

grid_search = GridSearchCV(xgb, param_grid=param_grid, cv=3, n_jobs=-1, scoring=scoring, refit='f1_score')
grid_search.fit(X, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_xgb.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

In [None]:
# Define the parameter grid to search over
param_grid = {
    # 'criterion': ['gini', 'entropy'],
    'n_estimators': [5,10,25,50],
    'max_depth': [1,2, 3, 4, 5],
    # 'min_samples_split': [2, 3, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'max_features': ['sqrt', 'log2', None]
}

# Create a random forest classifier object
rf = RandomForestClassifier(random_state=42)

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}


# Create a GridSearchCV object and fit it to the data
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring=scoring, refit='f1_score')
grid_search.fit(X, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_rf.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best F1 score:  0.7927078016646268
Standard deviation for F1 score:  0.004571307149731765
Accuracy score:  0.968868577492905
Standard deviation for accuracy:  0.0002124561887287072


In [9]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [4,5,6],
    'max_depth': [3,4],
}

# Create a random forest classifier object
rf = RandomForestClassifier(random_state=42)

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}


# Create a GridSearchCV object and fit it to the data
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=15, n_jobs=-1, scoring=scoring, refit='f1_score')
grid_search.fit(X, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_rf.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'max_depth': 4, 'n_estimators': 6}
Best F1 score:  0.7937090949290219
Standard deviation for F1 score:  0.002076190424913041
Accuracy score:  0.9687064267008189
Standard deviation for accuracy:  0.00010451360066249062


In [None]:
# Define the parameter grid to search over
param_grid = {
    # 'criterion': ['gini', 'entropy'],
    'n_estimators': [2500, 5000],
    'max_depth': [3, 4, 5,10],
    # 'min_samples_split': [2, 3, 5, 10],
    # 'min_samples_leaf': [1, 2, 4],
    # 'max_features': ['sqrt', 'log2', None]
}

# Create a random forest classifier object
rf = RandomForestClassifier(random_state=42)

# Define evaluation metrics
scoring = {
    'f1_score': make_scorer(f1_score),
    'accuracy': make_scorer(accuracy_score)
}


# Create a GridSearchCV object and fit it to the data
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring=scoring, refit='f1_score')
grid_search.fit(X, y)

# Create a dataframe from the cv_results_ attribute
results_df = pd.DataFrame(grid_search.cv_results_)

# Save the dataframe to a CSV file
# results_df.to_csv('results/grid_search_results_all_features_all_labels_rf.csv', index=False)

# Print the best hyperparameters and scores
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)
print("Standard deviation for F1 score: ", np.std(grid_search.cv_results_['mean_test_f1_score']))
print("Accuracy score: ", grid_search.cv_results_['mean_test_accuracy'].mean())
print("Standard deviation for accuracy: ", np.std(grid_search.cv_results_['mean_test_accuracy']))

Best parameters:  {'max_depth': 5, 'n_estimators': 5000}
Best F1 score:  0.7880632045231792
Standard deviation for F1 score:  0.0034863665832573516
Accuracy score:  0.9691734551957084
Standard deviation for accuracy:  0.0002569544437984789
