### Model Implementation

### Fuzzy C-Means

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import skew
import skfuzzy as fuzz
import matplotlib.pyplot as plt




# Load the dataset into a pandas DataFrame
df = pd.read_csv('Final Dataset.csv')


# Identifier / categorical columns that are excluded from the clustering input
non_numeric_columns = [
    'collection_logical_name', 'collection_name', 'user',
    'collections_events_type', 'machine_id', 'instance_index',
    'alloc_collection_id', 'collection_type', 'collection_id',
    'instance_events_type', 'Unnamed: 0.1',
]
numerical_data = df.drop(columns=non_numeric_columns)


# Replace missing values with the column mean before clustering.
# Fuzzy c-means computes centres as weighted sums over all samples, so a
# NaN in any row would propagate into the centres and the membership matrix.
imputer = SimpleImputer(strategy='mean')
numerical_array = imputer.fit_transform(numerical_data)


# Fuzzy clustering
n_clusters = 5  # You can adjust this as needed
# The data is passed transposed (features x samples); m=2 is the fuzzifier.
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    numerical_array.T, c=n_clusters, m=2, error=0.005, maxiter=1000
)

# Hard assignment: each data point goes to the cluster with the highest
# membership degree (u holds one row per cluster, one column per sample)
cluster_membership = np.argmax(u, axis=0)


# Scatter plot of the clusters over the first two feature columns
plt.figure(figsize=(8, 6))
plt.scatter(
    numerical_array[:, 0],
    numerical_array[:, 1],
    c=cluster_membership,
    cmap='viridis',
    s=50,
    alpha=0.5,
)
plt.title('Fuzzy C-Means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()



![Fuzzy C-Means clustering scatter plot](https://github.com/user-attachments/assets/64615040-166a-42c4-8451-a385fc1361e7)

### MiniBatchKMeans

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from joblib import Parallel, delayed




# Read the pre-processed dataset from disk
df = pd.read_csv('Final Dataset.csv')

# Keep only the numerical features by discarding the identifier/categorical columns
non_numeric_columns = [
    'collection_logical_name',
    'collection_name',
    'user',
    'collections_events_type',
    'machine_id',
    'instance_index',
    'alloc_collection_id',
    'collection_type',
    'collection_id',
    'instance_events_type',
    'Unnamed: 0.1',
]
numerical_data = df.drop(non_numeric_columns, axis=1)

# Fill every missing entry with the mean of its column
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(numerical_data)

# Fit a single MiniBatchKMeans model. This function sets no n_jobs option
# itself; any multi-core execution has to be provided by the caller.
def fit_kmeans(X, n_clusters=3, batch_size=100, n_init=3, random_state=None):
    """Fit a MiniBatchKMeans model on ``X`` and return the fitted estimator.

    Args:
        X: Feature matrix handed to ``MiniBatchKMeans.fit``.
        n_clusters: Number of clusters to form (default 3).
        batch_size: Mini-batch size used during fitting (default 100).
        n_init: Number of random initialisations tried (default 3).
        random_state: Optional seed or RandomState for reproducible fits;
            ``None`` keeps the library's default randomness.

    Returns:
        The fitted ``MiniBatchKMeans`` instance.
    """
    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=batch_size,
        n_init=n_init,
        init='k-means++',
        random_state=random_state,
    )
    kmeans.fit(X)
    return kmeans

# Fit ten independently initialised MiniBatchKMeans models across all cores
kmeans = Parallel(n_jobs=-1)(delayed(fit_kmeans)(X_imputed) for _ in range(10))

# Keep the run with the lowest inertia (within-cluster sum of squares) so the
# extra fits are actually used instead of always plotting the first one
best_kmeans = min(kmeans, key=lambda model: model.inertia_)

# Use PCA to reduce dimensionality for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_imputed)



# Visualize the clustering results of the best run, without centroid markers
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=best_kmeans.labels_, cmap='viridis')
plt.title('MiniBatchKMeans Clustering with Multi-Core Processing')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


![MiniBatchKMeans clustering scatter plot](https://github.com/user-attachments/assets/072c561b-8efe-4701-909d-84d92948914e)

### OpenMP - numba

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from numba import njit, prange
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer


# Number of clusters requested from the k-means routine
n_clusters = 4

# Read the pre-processed dataset from disk
df = pd.read_csv('Final Dataset.csv')

# Discard identifier/categorical columns so only numerical features remain
non_numeric_columns = [
    'collection_logical_name',
    'collection_name',
    'user',
    'collections_events_type',
    'machine_id',
    'instance_index',
    'alloc_collection_id',
    'collection_type',
    'collection_id',
    'instance_events_type',
    'Unnamed: 0.1',
]
numerical_data = df.drop(non_numeric_columns, axis=1)

# Mean-impute missing values, then standardize the imputed features
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(numerical_data)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)



# Deliberately compiled without parallel=True: the work per call is tiny, and
# the function is meant to be callable from inside a parallel loop, where
# launching a nested parallel region is unsupported on some Numba threading
# layers.
@njit
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two equally shaped arrays.

    Args:
        a: First point, a NumPy array.
        b: Second point, a NumPy array of the same shape as ``a``.

    Returns:
        The square root of the summed squared element-wise differences.
    """
    diff = a - b
    return np.sqrt(np.sum(diff * diff))

@njit(parallel=True)
def kmeans_parallel(X, n_clusters, max_iter=300):
    """Cluster the rows of ``X`` with Lloyd's k-means and return their labels.

    Only the assignment step (nearest centre per sample) runs in parallel.
    The outer iteration loop is sequential because every iteration depends on
    the centres produced by the previous one, and the centre accumulation is
    sequential because several threads adding into the same centre row would
    race.

    Args:
        X: 2-D float array of shape (n_samples, n_features).
        n_clusters: Number of clusters; initial centres are ``n_clusters``
            distinct rows of ``X`` drawn at random.
        max_iter: Upper bound on the number of assignment/update rounds.

    Returns:
        1-D int64 array of length n_samples with the cluster index of each row.
    """
    n_samples, n_features = X.shape
    centers = X[np.random.choice(n_samples, n_clusters, replace=False)]
    labels = np.zeros(n_samples, dtype=np.int64)

    for iteration in range(max_iter):
        # Assignment step: samples are independent, so this loop is safe to
        # parallelise. n_changed is a scalar reduction across threads.
        n_changed = 0
        for i in prange(n_samples):
            best_label = 0
            best_dist = np.inf
            for j in range(n_clusters):
                # Squared distance is enough: it has the same argmin as the
                # Euclidean distance and avoids a sqrt per pair.
                dist = 0.0
                for k in range(n_features):
                    diff = centers[j, k] - X[i, k]
                    dist += diff * diff
                if dist < best_dist:
                    best_dist = dist
                    best_label = j
            if labels[i] != best_label:
                n_changed += 1
            labels[i] = best_label

        # Converged: identical labels would reproduce identical centres.
        # The first round is excluded because the initial labels are
        # placeholders, not the result of an assignment.
        if iteration > 0 and n_changed == 0:
            break

        # Update step, done sequentially to avoid concurrent writes to the
        # same accumulator row.
        sums = np.zeros((n_clusters, n_features))
        counts = np.zeros(n_clusters)
        for i in range(n_samples):
            label = labels[i]
            counts[label] += 1
            for k in range(n_features):
                sums[label, k] += X[i, k]

        for j in range(n_clusters):
            # An empty cluster keeps its previous centre
            if counts[j] > 0:
                centers[j] = sums[j] / counts[j]

    return labels


# Cluster the PCA-projected samples with the Numba k-means routine
cluster_labels = kmeans_parallel(X_pca, n_clusters)

# Store each row's cluster assignment alongside the original data
df['cluster_label'] = cluster_labels

# Draw one scatter series per cluster so every cluster gets its own legend entry
plt.figure(figsize=(10, 6))
for cluster in range(n_clusters):
    in_cluster = cluster_labels == cluster
    members = X_pca[in_cluster]
    plt.scatter(members[:, 0], members[:, 1], label=f'Cluster {cluster}')
plt.title('KMeans Clustering with PCA Visualization')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()



![Numba parallel K-Means clustering scatter plot](https://github.com/user-attachments/assets/a0d97f47-a147-4f07-b223-43928745a889)

### BIRCH

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import Birch
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# Read the pre-processed dataset from disk
df = pd.read_csv('Final Dataset.csv')

# Discard identifier/categorical columns so only numerical features remain
non_numeric_columns = [
    'collection_logical_name',
    'collection_name',
    'user',
    'collections_events_type',
    'machine_id',
    'instance_index',
    'alloc_collection_id',
    'collection_type',
    'collection_id',
    'instance_events_type',
    'Unnamed: 0.1',
]
numerical_data = df.drop(non_numeric_columns, axis=1)

# Mean-impute missing values (another strategy can be substituted if needed)
imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = imputer.fit_transform(numerical_data)

# Bring every feature to zero mean and unit variance
scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(numerical_data_imputed)

# Configure BIRCH; n_clusters=None skips the final global clustering step
birch = Birch(threshold=0.5, branching_factor=50, n_clusters=None)

# Fit on the scaled features and read back one label per data point
cluster_labels = birch.fit(numerical_data_scaled).labels_

# Store each row's cluster assignment alongside the original data
df['cluster_label'] = cluster_labels

# Project to two principal components purely for plotting
pca = PCA(n_components=2)
pca_data = pca.fit_transform(numerical_data_scaled)

# Scatter the projected points, coloured by their cluster label
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(pca_data[:, 0], pca_data[:, 1], c=cluster_labels, cmap='viridis')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('BIRCH Clustering Visualization')
fig.colorbar(points, ax=ax, label='Cluster Label')
plt.show()





![BIRCH clustering scatter plot](https://github.com/user-attachments/assets/2973345f-1f8f-4330-bb06-f61ee258a698)