In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
#scaling, normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#kmeans, dbscan, hierarchical (sklearn)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
#evaluation
from sklearn.metrics import silhouette_score
#import dataset
from sklearn.datasets import load_iris

#distance matrix (dbscan elbow, hierarchical)
from scipy.spatial.distance import pdist, squareform
# hierarchical (scipy)
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

In [None]:
# Load the prepared dataset and derive the feature matrices used by the
# clustering cells further down.
df = pd.read_csv("../../our_analyses/dataset_prepared.csv")

# `X` and `X_minmax` are read by later cells but were never assigned anywhere
# in the notebook, so a fresh "Restart & Run All" stopped with a NameError.
# NOTE(review): assumes every numeric column of the prepared dataset is a
# clustering feature and that it contains no missing values - confirm, and
# drop identifier/target columns here if there are any.
X = df.select_dtypes(include="number").to_numpy()

# Copy of the features rescaled column-wise to [0, 1] (MinMaxScaler default
# range), so that no single feature dominates the Euclidean distances.
X_minmax = MinMaxScaler().fit_transform(X)

In [None]:
def get_linkage_matrix(model):
    """Build a SciPy-style linkage matrix from a fitted clustering model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``children_`` (one row of two child node
        ids per merge), ``distances_`` (one value per merge) and ``labels_``
        (one entry per sample).

    Returns
    -------
    numpy.ndarray
        Float array with one row per merge and four columns:
        the two child ids, the merge distance, and the number of samples
        contained in the newly formed node.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # subtree_sizes[k] = number of original samples under the node created by merge k.
    subtree_sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        # Ids below n_leaves are original samples (size 1); larger ids refer
        # to the node created by merge number (id - n_leaves).
        subtree_sizes[step] = sum(
            1 if node < n_leaves else subtree_sizes[node - n_leaves]
            for node in pair
        )

    columns = [merges, model.distances_, subtree_sizes]
    return np.column_stack(columns).astype(float)

def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted clustering model.

    The model is converted with ``get_linkage_matrix`` and the result is
    handed to ``dendrogram``; every keyword argument is forwarded to
    ``dendrogram`` unchanged. Nothing is returned.
    """
    dendrogram(get_linkage_matrix(model), **kwargs)

In [None]:

# Disabled experiment: grid search over distance threshold, linkage and metric,
# keeping the combination with the highest silhouette score.
# NOTE(review) before re-enabling:
#   - the loop variable `linkage` would shadow scipy's `linkage` function
#     imported at the top of the notebook;
#   - `X` must be defined before this cell runs;
#   - the compatibility check only excludes manhattan+ward; per the
#     scikit-learn docs ward accepts only the euclidean metric, so cosine+ward
#     presumably has to be excluded too - confirm.
# # Parameters to explore
# distance_thresholds = [0.01, 0.05, 0.1, 0.2]
# linkage_types = ['ward', 'complete', 'average', 'single']
# metrics = ['euclidean', 'manhattan', 'cosine']

# best_silhouette_score = -1
# best_params = {}

# for threshold in distance_thresholds:
#     for linkage in linkage_types:
#         for metric in metrics:
#             # Check that metric and linkage are compatible
#             valid_combination = (metric != 'manhattan' or linkage != 'ward')
#             if valid_combination:
#                 model = AgglomerativeClustering(distance_threshold=threshold, n_clusters=None, metric=metric, linkage=linkage)
#                 clusters = model.fit_predict(X)

#                 # Check the number of clusters produced
#                 num_clusters = len(np.unique(clusters))
#                 if 2 <= num_clusters <= len(X) - 1:
#                     # Compute the Silhouette score
#                     silhouette_avg = silhouette_score(X, clusters)

#                     # Update the best parameters if needed
#                     if silhouette_avg > best_silhouette_score:
#                         best_silhouette_score = silhouette_avg
#                         best_params = {
#                             'distance_threshold': threshold,
#                             'linkage': linkage,
#                             'metric': metric,
#                             'num_clusters': num_clusters
#                         }

# print("Migliori parametri:")
# print(best_params)

In [None]:
# Complete-linkage agglomerative clustering with Euclidean distances on
# `X_minmax` (presumably the min-max scaled feature matrix - defined outside
# this cell).
# n_clusters=None with distance_threshold=0 requests the whole merge tree
# rather than a fixed number of clusters; using a distance threshold is also
# what makes scikit-learn populate `distances_`, which get_linkage_matrix reads.
model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0,
    linkage='complete',
    metric='euclidean',
).fit(X_minmax)

In [None]:
# EUCLIDEAN METHOD: dendrograms of `X`, one panel per linkage method.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

# Disabled normalisation step, kept for reference (assumes a `df_prepared` DataFrame):
# scaler = MinMaxScaler()
# df_normalized = pd.DataFrame(scaler.fit_transform(df_prepared), columns=df_prepared.columns)

linkage_types = ['ward', 'complete']

# SciPy linkage matrix for each linkage method; `linkage` is called without a
# `metric` argument, so SciPy's default (Euclidean) distance is used.
linkage_matrices = {
    linkage_type: linkage(X, method=linkage_type)
    for linkage_type in linkage_types
}

# One row with one panel per linkage method. squeeze=False keeps `axes`
# two-dimensional even if `linkage_types` is reduced to a single entry.
fig, axes = plt.subplots(1, len(linkage_types), figsize=(18, 5), squeeze=False)

for ax, linkage_type in zip(axes.ravel(), linkage_types):
    # No AgglomerativeClustering is fitted in this loop: the dendrogram is
    # drawn from the SciPy linkage matrix alone, and fitting one here would
    # overwrite the notebook-level `model` / `labels` that the cells after
    # this one rely on.
    ax.set_title(f"Hierarchical Clustering Dendrogram ({linkage_type.capitalize()} Linkage)")
    dendrogram(
        linkage_matrices[linkage_type],
        truncate_mode="lastp",  # show only the last p merged clusters
        p=30,
        leaf_rotation=90.,
        leaf_font_size=8.,
        show_contracted=True,
        ax=ax,
    )
    ax.set_xlabel("Number of points in node (or index of point if no parenthesis).")
    ax.set_ylabel("Distance")

fig.tight_layout()
plt.show()


In [None]:
# Turn the fitted hierarchy into flat cluster labels by cutting the tree at
# distance 1.4 (criterion='distance').
# NOTE(review): `model` is whichever estimator was assigned last in the kernel;
# confirm it is the one fitted on `X_minmax`, since the silhouette score in the
# next cell is computed on `X_minmax`.
Z = get_linkage_matrix(model)
labels = fcluster(Z, criterion='distance', t=1.4)

In [None]:
# Mean silhouette coefficient of the flat clustering in `labels`, evaluated on
# `X_minmax` with silhouette_score's default metric.
print('Silhouette', silhouette_score(X=X_minmax, labels=labels))