# Libraries

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import skfuzzy as fuzz
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import jaccard
from sklearn.cluster import KMeans
from sklearn.metrics import (
    adjusted_rand_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    jaccard_score,
    silhouette_score,
)
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

# Reading the datasets

In [None]:

# Both CSV files are looked up in the current working directory.
current_directory = os.getcwd()
file_path_training = os.path.join(current_directory, 'training_dataset.csv')
file_path_prediction = os.path.join(current_directory, 'prediction_dataset.csv')

# Load the training rows and discard their existing 'Cluster' column.
df_training = pd.read_csv(file_path_training).drop(columns='Cluster')

# Load the rows that are to receive cluster labels.
df_prediction = pd.read_csv(file_path_prediction)

# Training rows are stacked first, so the prediction rows begin at this
# positional offset inside the combined frame.
prediction_start_index = df_training.shape[0]

# Combined frame with a fresh 0..n-1 index: training rows, then prediction rows.
df = pd.concat([df_training, df_prediction], ignore_index=True)

# Defining cluster variables and scaling dataset

In [None]:
# Columns that feed the clustering, in the order used for the feature matrix.
feature_columns = ['spacing', 'interconnectivity', 'design', 'directness', 'service_area']
X = df[feature_columns].to_numpy()

# Standardise every feature; the fitted scaler is kept so the same
# transformation can be reapplied later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Clustering and deriving performance metrics

In [None]:
# Optimum parameters:
#   n_clusters - number of clusters requested from the initializer and from FCM
#   m          - forwarded to FCM as `m` (presumably the fuzziness exponent;
#                confirm against the FCM implementation in use)
n_clusters, m = 5, 1.3



def kmeans_plusplus_initializer(X, n_clusters, random_state=None):
    """Pick ``n_clusters`` initial centroids from the rows of ``X`` (k-means++ seeding).

    The first centroid is a uniformly random row. Every further centroid is a
    row drawn with probability proportional to its squared Euclidean distance
    to the nearest centroid chosen so far.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points to choose the centroids from.
    n_clusters : int
        Number of centroids to return (at least 1).
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState``. When ``None`` (default)
        the global NumPy RNG is used, so ``np.random.seed`` keeps controlling
        the draws exactly as before.

    Returns
    -------
    numpy.ndarray of shape (n_clusters, n_features)
        The selected centroids; each one is a copy of a row of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional, has no rows, or ``n_clusters < 1``.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples, n_features = X.shape
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")

    # The np.random module and RandomState expose the same randint/choice API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    centroids = np.zeros((n_clusters, n_features))
    # Randomly choose the first centroid from the data points
    centroids[0] = X[rng.randint(n_samples)]

    # Distance from every point to its nearest already-chosen centroid
    distances = np.linalg.norm(X - centroids[0], axis=1)

    for i in range(1, n_clusters):
        weights = distances ** 2
        total = weights.sum()
        if total == 0:
            # Every point coincides with a chosen centroid (duplicate rows or
            # more clusters than distinct points): normalising would give
            # 0/0 = NaN and make the weighted draw fail, so draw uniformly.
            centroid_index = rng.randint(n_samples)
        else:
            # Choose next centroid with probability proportional to the
            # square of the distance
            centroid_index = rng.choice(n_samples, p=weights / total)
        centroids[i] = X[centroid_index]

        # Update distances after adding the new centroid
        distances = np.minimum(distances, np.linalg.norm(X - centroids[i], axis=1))

    return centroids

# Initialize centroids using k-means++.
# The global NumPy RNG is seeded *before* the draw so the initial centroids are
# reproducible too (previously the seed was only set after this call).
np.random.seed(42)
initial_centroids = kmeans_plusplus_initializer(X_scaled, n_clusters)

# NOTE(review): `initial_centroids` is not passed to FCM below, so the fit does
# not start from them. Wire them in if the FCM implementation supports custom
# initial centers, otherwise drop the initializer call.
# NOTE(review): `FCM` is not imported anywhere in the visible file; presumably
# `from fcmeans import FCM` (fuzzy-c-means package) - confirm and add the import.

# Re-seed so the global RNG state seen by the FCM fit is the same as it was
# before the seed was moved above the initializer call.
np.random.seed(42)
fcm = FCM(n_clusters=n_clusters, m=m)
fcm.fit(X_scaled)

# Get the resulting labels and centers.
# Each row is assigned to the column of `fcm.u` with the largest value
# (presumably the membership matrix - confirm against the FCM docs).
labels = fcm.u.argmax(axis=1)
centers = fcm.centers

# Evaluation metrics
silhouette = silhouette_score(X_scaled, labels)
db_index = davies_bouldin_score(X_scaled, labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, labels)

# Calculate Average Shortest Euclidean Distance between centers:
# pairwise center distances, self-distances masked with inf, then the mean of
# each center's distance to its nearest neighbour. (Taking np.min over the
# whole matrix would report only the single closest pair, not an average.)
distances = np.linalg.norm(centers[:, np.newaxis] - centers, axis=2)
np.fill_diagonal(distances, np.inf)
avg_shortest_distance = np.mean(np.min(distances, axis=1))

# Stability using Adjusted Rand Index
# NOTE(review): this refits the same estimator on the same data; if FCM uses a
# fixed internal seed the two runs are identical and the ARI is trivially 1.0.
# Compare against a run with a different seed or a resampled dataset for a
# meaningful stability figure.
previous_labels = np.copy(labels)
fcm.fit(X_scaled)  # Another run (simulating for stability check)
new_labels = fcm.u.argmax(axis=1)
stability_ari = adjusted_rand_score(previous_labels, new_labels)

# Output the results
print("Evaluation Metrics:")
print(f"Silhouette Score: {silhouette}")
print(f"Davies-Bouldin Score: {db_index}")
print(f"Calinski-Harabasz Score: {calinski_harabasz}")
print(f"Average Shortest Euclidean Distance: {avg_shortest_distance}")
print(f"Stability ARI: {stability_ari}")
print("Cluster Centers:\n", centers)




# Reporting the clusters

In [None]:
# Add cluster labels only to the rows from the prediction dataset.
# `labels` follows the row order of `df` (training rows first), so the
# prediction rows start at `prediction_start_index`.
df_prediction['Cluster'] = labels[prediction_start_index:]


print(df_prediction)

# Calculate summary statistics for each cluster - optional
# `df` was concatenated before any labels existed, so it carries no fresh
# 'Cluster' column and `df.groupby('Cluster')` would fail (or use stale
# values). Group a labelled copy instead; `df` itself is left untouched.
summary_features = ['spacing', 'interconnectivity', 'design', 'directness', 'service_area']
clustered = df.assign(Cluster=labels).groupby('Cluster')
cluster_summary = clustered[summary_features].agg(['mean', 'std', 'min', 'max'])
# Number of rows per cluster, stored under the same ('Cluster', 'count')
# column the original aggregation spec asked for.
cluster_summary[('Cluster', 'count')] = clustered.size()
