In [None]:
# IMPORTS
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Silence only library deprecation/usage chatter. Numeric RuntimeWarnings
# (e.g. log of a non-positive value, division by zero) are deliberately left
# visible because they indicate bad values entering the feature matrix.
for _noisy_category in (FutureWarning, DeprecationWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)


In [None]:
# Load the pipe-delimited summary table. The regex separator also strips the
# whitespace padding around each '|'; regex separators need the python engine.
data = pd.read_csv('../../../datasets/summary_general-2.txt', sep=r'\s*\|\s*', engine='python')

# Display the first few rows of the dataset
print(data.head())

# Remove rows with NaN values (copy so the column assignments below act on an
# independent frame)
data = data.dropna().copy()

# Create additional features.
# T50 / T90 is undefined for T90 == 0 and log(T50) for T50 <= 0. Masking those
# inputs yields NaN instead of +/-inf, so the affected rows can be dropped
# cleanly rather than breaking the scaler / clustering steps later on.
t50_over_t90 = data['T50'] / data['T90'].where(data['T90'] != 0)
data['Normalized_T50'] = t50_over_t90
data['Log_T50'] = np.log(data['T50'].where(data['T50'] > 0))
# Same definition as Normalized_T50; kept because later cells reference both names.
data['T50_to_T90'] = t50_over_t90

# Drop rows whose derived features are undefined, and reset the index so that
# row labels match the positional order of the arrays built from this frame.
rows_before = len(data)
data = data.dropna(subset=['Normalized_T50', 'Log_T50', 'T50_to_T90']).reset_index(drop=True)
if len(data) < rows_before:
    print(f'Dropped {rows_before - len(data)} rows with non-positive T50 or zero T90')

In [None]:
# Prepare the feature matrix for clustering.
# 'T50_to_T90' is computed with the same expression as 'Normalized_T50'
# (T50 / T90), so including both would feed an identical column twice and
# double-weight that ratio in K-means distances and PCA. Only one copy is used.
features = ['T50', 'Normalized_T50', 'Log_T50']
X = data[features]

In [None]:
# Learn per-column scaling statistics from X, then apply them to X itself
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)



Option 1: K-means clustering

In [None]:
# Partition the standardized features into two groups with K-means.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, and with warnings silenced the number of restarts
# (and therefore the result) would otherwise depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

# Fit the model and obtain the label of every training row in one step
clusters = kmeans.fit_predict(X_scaled)

# Add the cluster labels to the dataset
data['Cluster'] = clusters

# Evaluate the clustering with the mean silhouette coefficient
silhouette_avg = silhouette_score(X_scaled, clusters)
print(f'Silhouette Score: {silhouette_avg:.2f}')

Option 2: Isolation Forest model

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Fit an Isolation Forest on the standardized features and label every row.
# fit_predict yields -1 for anomalies and 1 for inliers.
iso_forest = IsolationForest(contamination=0.1, random_state=42)  # Adjust contamination as needed
clusters_iso = iso_forest.fit_predict(X_scaled)

# Re-encode for readability: 1 = anomaly, 0 = inlier
is_outlier = clusters_iso == -1
data['Cluster_IsolationForest'] = is_outlier.astype(int)

# Optional evaluation, only possible when ground-truth labels are available:
# y_true = data['Ground_Truth_Label']  # Replace with actual ground truth labels
# y_pred = data['Cluster_IsolationForest']
# accuracy = accuracy_score(y_true, y_pred)
# print(f'Accuracy: {accuracy:.2f}')
# print(classification_report(y_true, y_pred))


Plotting

In [None]:
# Scatter of the first two standardized feature columns, coloured by the
# Isolation Forest flag (0 = normal, 1 = anomaly).
fig, ax = plt.subplots(figsize=(10, 6))

iso_flag = data['Cluster_IsolationForest']
groups = [(0, 'blue', 'Normal'), (1, 'red', 'Anomaly')]  # normal points drawn first

for flag_value, colour, legend_name in groups:
    row_mask = (iso_flag == flag_value).to_numpy()
    ax.scatter(X_scaled[row_mask, 0], X_scaled[row_mask, 1], c=colour, label=legend_name)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Isolation Forest Anomaly Detection')
ax.legend()
plt.show()


In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(10, 6))

# Draw normal points first, then anomalies on top, one scatter call per group
iso_flag = data['Cluster_IsolationForest']
for flag_value, colour, legend_name in ((0, 'blue', 'Normal'), (1, 'red', 'Anomaly')):
    row_mask = (iso_flag == flag_value).to_numpy()
    component_1 = X_pca[row_mask, 0]
    component_2 = X_pca[row_mask, 1]
    ax.scatter(component_1, component_2, c=colour, label=legend_name)

ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('2D Projection of Isolation Forest Results')
ax.legend()
plt.show()


In [None]:
# Add a human-readable anomaly label to the dataset (used as the plot hue)
data['Anomaly'] = np.where(data['Cluster_IsolationForest'] == 1, 'Anomaly', 'Normal')

# Create a pairplot of the engineered features, coloured by anomaly label
pair_grid = sns.pairplot(
    data,
    hue='Anomaly',
    vars=['T50', 'Normalized_T50', 'Log_T50', 'T50_to_T90'],
    palette={'Anomaly': 'red', 'Normal': 'blue'},
)

# pairplot builds a grid of subplots; plt.title() would only title whichever
# subplot is current, so the title is set on the figure instead. y > 1 lifts
# it above the top row of panels.
pair_grid.fig.suptitle('Pairplot with Anomalies Highlighted', y=1.02)
plt.show()


In [None]:
# 3D scatter of the first three standardized feature columns, coloured by the
# Isolation Forest flag (0 = normal, 1 = anomaly).
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

iso_flag = data['Cluster_IsolationForest']
# (flag value, colour, legend label, opacity); normal points are drawn first
groups = [
    (0, 'blue', 'Normal', 0.6),
    (1, 'red', 'Anomaly', 0.8),
]

for flag_value, colour, legend_name, opacity in groups:
    row_mask = (iso_flag == flag_value).to_numpy()
    ax.scatter(
        X_scaled[row_mask, 0],
        X_scaled[row_mask, 1],
        X_scaled[row_mask, 2],
        c=colour, label=legend_name, alpha=opacity,
    )

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Feature 3')
ax.set_title('3D Plot of Isolation Forest Results')
ax.legend()
plt.show()


In [None]:
# Compute anomaly scores. For IsolationForest.decision_function, lower values
# are more abnormal and negative values correspond to predicted outliers.
anomaly_scores = iso_forest.decision_function(X_scaled)

normal_mask = (data['Cluster_IsolationForest'] == 0).to_numpy()
anomaly_mask = (data['Cluster_IsolationForest'] == 1).to_numpy()

# Use one set of bin edges for both groups. Passing bins=30 to each hist call
# separately would bin every group over its own range, so bar widths and
# positions of the two overlaid histograms would not be comparable.
shared_bins = np.histogram_bin_edges(anomaly_scores, bins=30)

plt.figure(figsize=(10, 6))
plt.hist(anomaly_scores[normal_mask], bins=shared_bins, alpha=0.6, color='blue', label='Normal')
plt.hist(anomaly_scores[anomaly_mask], bins=shared_bins, alpha=0.6, color='red', label='Anomaly')
plt.xlabel('Anomaly Score')
plt.ylabel('Frequency')
plt.title('Histogram of Anomaly Scores')
plt.legend()
plt.show()
