## Anomaly Detection in a Wastewater Treatment Plant

### Loading the Dataset


In [2]:
from ucimlrepo import fetch_ucirepo, list_available_datasets
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
import matplotlib.pyplot as plt
import io

In [4]:

# Load the raw data file. It has no header row and marks missing readings
# with "?"; declaring "?" as an NA marker at parse time lets pandas infer
# numeric dtypes for columns that otherwise contain only numbers, instead of
# leaving them as strings.
df = pd.read_csv(
    'data/water-treatment.data',
    delimiter=',',
    header=None,
    na_values='?',
)
# Keep only complete rows and renumber them so positional and label indices agree.
df = df.dropna().reset_index(drop=True)
df

FileNotFoundError: [Errno 2] No such file or directory: 'data/water-treatment.data'

### a. Perform DBSCAN on the 38-feature data set

In [None]:
# Build the feature matrix from every column except the first and standardise
# it (StandardScaler: zero mean, unit variance per column). The explicit float
# cast makes the matrix numeric even if the columns were loaded as strings.
X = df.iloc[:, 1:].values.astype(float)
scaler = StandardScaler()
X_scl = scaler.fit_transform(X)

# Preview the first two standardised features. No clustering has been run at
# this point, so the points are not coloured by cluster label and there is no
# colorbar; the labelled plot is drawn once DBSCAN has been fitted.
plt.scatter(X_scl[:, 0], X_scl[:, 1])
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Standardized Data')
plt.show()

In [None]:
# DBSCAN hyper-parameters: minimum neighbourhood size and neighbourhood
# radius, measured in the standardised feature space.
minPts = 3
eps = 7
dbscan = DBSCAN(eps=eps, min_samples=minPts)
dbscan.fit(X_scl)
labels = dbscan.labels_

# DBSCAN assigns the label -1 to noise points; those are treated as outliers.
outliers_ind = np.where(labels == -1)[0]
outliers = df.iloc[outliers_ind]

print("Num of outliers: ", len(outliers))
print("Outliers:")
print(outliers)

# Keep a handle on the cluster-coloured scatter so the colorbar can be bound
# to it explicitly. Without a mappable, plt.colorbar() would attach to the
# most recent scatter (the solid-red outlier overlay), which carries no
# cluster-label data.
cluster_scatter = plt.scatter(X_scl[:, 0], X_scl[:, 1], c=labels, cmap='viridis')
plt.scatter(X_scl[outliers_ind, 0], X_scl[outliers_ind, 1], c='red', label='Outliers')

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('DBSCAN Clustering')
plt.colorbar(cluster_scatter, label='Cluster Label')
plt.legend()
plt.show()

### b. Perform anomaly detection methods after dimensionality reduction

In [None]:
# PCA is not among the notebook's top-level imports, so import it here.
from sklearn.decomposition import PCA

# Project the standardised features onto their first two principal components
# so the detectors below can be fitted and visualised in 2-D.
pca = PCA(n_components=2)
X_scl_pca = pca.fit_transform(X_scl)

In [None]:
from sklearn.neighbors import KernelDensity

# Evaluation grid for the density surface: np.linspace defaults to 50 samples
# per axis, covering [-20.5, 20.5] horizontally and [-10, 30] vertically.
grid_x = np.linspace(-20.5, 20.5)
grid_y = np.linspace(-10, 30)
Xp, Yp = np.meshgrid(grid_x, grid_y)
XY = np.column_stack((Xp.ravel(), Yp.ravel()))

# Fit a Gaussian kernel density estimate on the 2-D PCA projection and
# evaluate it on the grid (score_samples returns log-density, hence the exp).
kde = KernelDensity(kernel='gaussian', bandwidth=0.4)
kde.fit(X_scl_pca)
Zp = np.exp(kde.score_samples(XY)).reshape(Xp.shape)

# Points whose log-density is at or below the 5% quantile are flagged.
scores = kde.score_samples(X_scl_pca)
threshold = np.quantile(scores, 0.05)
print(f"Threshold (KDE) = {np.exp(threshold)}")

is_normal = scores > threshold
is_anomaly = scores <= threshold
normals = X_scl_pca[is_normal]
anomals = X_scl_pca[is_anomaly]

# Density surface with the two groups of observations drawn on top.
cntr = plt.contourf(Xp, Yp, Zp, cmap='viridis')
for points, colour, tag in (
    (normals, 'k', 'Normal Observations'),
    (anomals, 'r', 'Anomalies (95% Confidence)'),
):
    plt.scatter(points[:, 0], points[:, 1], s=5, color=colour, label=tag)
plt.title('Anomaly Detection using KDE')
plt.colorbar(cntr)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Fit a One-Class SVM on the 2-D PCA projection and score the plotting grid
# (XY / Xp come from the meshgrid built earlier in the notebook).
ocsvm = OneClassSVM(nu=0.05, gamma=1)
ocsvm.fit(X_scl_pca)
Zp = ocsvm.score_samples(XY).reshape(Xp.shape)

# Split the observations by predicted class: +1 is kept as normal,
# -1 is flagged as anomalous.
y_pred = ocsvm.predict(X_scl_pca)
normals = X_scl_pca[y_pred == 1]
anomals = X_scl_pca[y_pred == -1]

# Score surface with the two groups of observations drawn on top.
cntr = plt.contourf(Xp, Yp, Zp, levels=50, cmap='viridis')
for points, colour, tag in (
    (normals, 'k', 'Normal Observations'),
    (anomals, 'r', 'Anomalies (nu=0.05, gamma=1)'),
):
    plt.scatter(points[:, 0], points[:, 1], s=5, color=colour, label=tag)
plt.title('Anomaly Detection using One-Class SVM')
plt.colorbar(cntr)
plt.legend()
plt.grid()
plt.show()