# Profiling and Analysis Questions


Clustering
https://web.ist.utl.pt/rmch/dash/guides/Clustering%20in%20Python.html


- what are the top aisle purchase clusters?
- what are the most common order-time clusters (order_dow, order_hour_of_day, days_since_prior_order and weeks_since_prior_order)?

PCA Resources
- https://www.kaggle.com/code/asindico/customer-segments-with-pca
- https://www.datacamp.com/tutorial/principal-component-analysis-in-python
- https://www.youtube.com/watch?v=8klqIM9UvAc
- https://www.youtube.com/watch?v=FD4DeN81ODY
- https://www.youtube.com/watch?v=HMOI_lkzW08


# NOTAS PROF
- DETETAR E EXCLUIR OUTLIERS DOS CLUSTERS PEQUENOS 
- ADICIONAR DENDROGRAMA DOS CLUSTERS
- EXPERIMENTAR DBSCAN
- ADICIONAR E VALIDAR CENTROIDES

# File and libraries

In [5]:
import pandas as pd
import numpy as np


# NOTE(review): absolute, machine-specific path — as written the notebook only runs where this
# exact directory exists; consider a path relative to the notebook or a configurable data dir.
filepath=r'/Users/cozmaeug/Private/IST PG - DS/DaSH ENG/ist_dash_2024_rec/non_supervised_analysis/notebooks/dataset_2/df_bakery_encoded.csv'

# Prefix used in the titles of the plots further down.
file_tag = "Bakery Pattern Mining"

# Read the CSV at `filepath` into a DataFrame.
data = pd.read_csv(filepath)


In [6]:
%run "scripts/dslabs_functions.py"

In [7]:
%run "scripts/data_functions.py"

data_functions lodaded


# Dropping MV

In [8]:
# Work on a NaN-free copy so the raw `data` frame stays untouched:
# axis=0 with how="any" removes every row that holds at least one missing value.
data_copy = data.copy().dropna(axis=0, how="any")

# Scaling and normalizing data
K-Means is sensitive to scale. If your dataset has different units (e.g., money, time, percentages), consider standardizing it before clustering.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit the scaler on the cleaned frame, then apply it to that same frame.
scaler = StandardScaler()
data_scaled = scaler.fit(data_copy).transform(data_copy)

# [prof] Agglomerative Clustering

## Manhattan distance
- It sums absolute differences across features.
- Each feature has the same weight (1), meaning all are treated equally

In [78]:
from sklearn import cluster
from sklearn.metrics import pairwise_distances

def mydistance(x1, x2):
    """Return the Manhattan (L1) distance between two equal-length vectors.

    Every coordinate contributes with the same fixed weight of 1.
    """
    unit_weight = 1
    return sum(unit_weight * abs(x1[idx] - x2[idx]) for idx in range(len(x1)))

def affinity(X):
    """Return the pairwise distance matrix of X, measured with `mydistance`."""
    distance_matrix = pairwise_distances(X, metric=mydistance)
    return distance_matrix

In [90]:
# Hierarchical clustering into 3 groups with Euclidean distance and average linkage.
# NOTE(review): this section is titled "Manhattan distance" and defines `affinity` just above,
# but the model below is built with metric='euclidean' — confirm which distance is intended.
hier_algo = cluster.AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='average')
hier_model = hier_algo.fit(data_copy)
# Last expression of the cell: displays the cluster label assigned to each row.
hier_model.labels_


array([0, 0, 0, ..., 0, 0, 0])

## [prof] Customized distance
Weighted Manhattan distance: still based on the Manhattan distance, but some features now contribute more than others.
Instead of treating all features equally, each feature is assigned its own weight:

- Feature 1 → weight = 1
- Feature 2 → weight = 2
- Feature 3 → weight = 3
- Feature 4 → weight = 1

In [89]:
from sklearn.metrics import pairwise_distances

def mydistance(x1, x2, weights=(1, 2, 3, 1), offset=0.0001):
    """Return a weighted Manhattan distance over the leading features.

    Only the first ``len(weights)`` coordinates take part in the distance;
    coordinate ``j`` contributes ``weights[j] * abs(x1[j] - x2[j])``. Any
    further coordinates of ``x1`` / ``x2`` are ignored.

    Note: this definition replaces the unweighted ``mydistance`` defined
    earlier in the notebook, so helpers that look the name up at call time
    (``affinity`` included) use this version once the cell has run.

    Parameters
    ----------
    x1, x2 : indexable sequences of numbers
        The two points; each must have at least ``len(weights)`` entries.
    weights : sequence of numbers, default (1, 2, 3, 1)
        Per-feature weights, applied positionally.
    offset : float, default 0.0001
        Constant added to every distance so that identical points do not get
        a distance of exactly zero. As a consequence d(x, x) == offset.

    Returns
    -------
    float
        ``offset`` plus the weighted sum of absolute differences.
    """
    res = offset
    for j, weight in enumerate(weights):
        res += weight * abs(x1[j] - x2[j])
    return res

def sim_affinity(X):
    """Return the all-pairs distance matrix of X computed with `mydistance`."""
    pairwise = pairwise_distances(X, metric=mydistance)
    return pairwise


In [80]:
# Complete-linkage hierarchical clustering (5 clusters) driven by the custom weighted distance.
# `metric` is used instead of the `affinity` keyword (deprecated in scikit-learn 1.2 and removed
# in 1.4); a callable metric receives the data and returns the pairwise distance matrix.
# `sim_affinity` is the helper defined for this section, so it is passed here rather than the
# `affinity` helper from the Manhattan section.
hier_algo = cluster.AgglomerativeClustering(n_clusters=5, metric=sim_affinity, linkage='complete')
hier_model = hier_algo.fit(data_copy)
# Last expression of the cell: displays the cluster label assigned to each row.
hier_model.labels_

array([1, 1, 2, ..., 1, 1, 1])

In [81]:
from sklearn import metrics
# Mean silhouette of the hierarchical solution over all rows of data_copy.
# NOTE(review): no `metric` argument is passed, so the score presumably uses the library's
# default distance rather than the custom one used to build the clusters — confirm this is intended.
print("Silhouette:",metrics.silhouette_score(data_copy, hier_model.labels_))

Silhouette: 0.74831119558551


## [prof] Evaluating the clustering solution using internal indices



In [91]:
from sklearn import metrics
# `kmeans_model` was not created anywhere before this cell, so on a fresh kernel the cell
# failed with a NameError. Fit it here so the evaluation is reproducible.
# NOTE(review): n_clusters=3 is an assumption based on the three-centroid output shown later
# in the notebook — confirm the intended number of clusters.
kmeans_model = KMeans(n_clusters=3, random_state=42, n_init='auto').fit(data_copy)
y_pred = kmeans_model.labels_
# Internal validation indices: mean silhouette, first five per-row silhouettes, and inertia.
print("Silhouette:",metrics.silhouette_score(data_copy, y_pred))
print("Silhouette per instance:\n",metrics.silhouette_samples(data_copy, y_pred)[:5],"...")
print("Sum of squared distances:",kmeans_model.inertia_)


Silhouette: 0.7006085605777561
Silhouette per instance:
 [0.48101275 0.84371167 0.39126744 0.82414381 0.84097135] ...
Sum of squared distances: 62018680949.56879


In [82]:
# Attach the hierarchical cluster label of each row as a new `cluster` column.
# NOTE(review): this mutates `data_copy` in place, so every later cell that fits on or reads the
# columns of `data_copy` also sees the label column — consider writing to a separate frame.
data_copy['cluster']=hier_model.labels_


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One box per cluster showing how the `total` column is distributed within it.
sns.boxplot(data=data_copy, x="cluster", y='total')
plt.show()

# K MEANS clustering

## Elbow Method Study

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from kneed import KneeLocator

# Candidate numbers of clusters to evaluate
range_n_clusters = list(range(1, 11))

# Sum of squared distances (inertia) for each candidate k.
# The study is run on the standardized matrix `data_scaled` — the same input the final
# K-Means model is fitted on — so the elbow is not driven by large-unit columns or by any
# label column added to `data_copy` earlier in the notebook.
sse = []
for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    kmeans.fit(data_scaled)
    sse.append(kmeans.inertia_)

# Locate the elbow of the (convex, decreasing) inertia curve
kneedle = KneeLocator(range_n_clusters, sse, curve='convex', direction='decreasing')
elbow_point = kneedle.elbow  # None when no knee is detected

# Plot the elbow curve
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, sse, marker='o')
plt.title(f'{file_tag} | Elbow Method for Optimal KMeans Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Errors')

# Mark the elbow with a vertical line, but only when one was actually found
if elbow_point is not None:
    plt.axvline(x=elbow_point, color='r', linestyle='--')

plt.show()

# Print the Optimal K After Elbow Method
print(f"Optimal number of clusters (Elbow Method): {elbow_point}")

## Kmeans model

In [None]:
# Clustering
from sklearn import cluster, mixture

# n_init='auto' was introduced in scikit-learn 1.2 and became the default in 1.4;
# passing it explicitly keeps the behaviour the same across those versions.

# Final K-Means model: 5 clusters, fixed seed, fitted on the standardized matrix.
bakery_kmeans = KMeans(n_clusters=5, random_state=42, n_init='auto')
# Cluster index assigned to each row of data_scaled.
bakery_y_pred_kmeans = bakery_kmeans.fit_predict(data_scaled)

# Last expression of the cell: displays the centroid coordinates (in the scaled feature space).
bakery_kmeans.cluster_centers_

array([[-0.27019827, -0.05911589, -0.00563971, -0.05246349, -0.67139829,
        -0.37110241,  0.00174243],
       [ 0.46008072,  0.10065971,  0.00960303,  0.08933232,  1.14322497,
         0.63189549, -0.00296693]])

## Describe centroids


In [87]:

# Centroid coordinates of the fitted model (expressed in the standardized feature space).
centroids = bakery_kmeans.cluster_centers_

# Name the columns after the features the scaler was fitted on. `data_copy.columns` cannot be
# used here: by this point `data_copy` has gained a `cluster` column, so its column list no
# longer matches the width of the centroid matrix.
feature_names = list(scaler.feature_names_in_)

centroid_df = pd.DataFrame(data=centroids, columns=feature_names)
print("Cluster Centroids:")
print(centroid_df)

          total  angbutter  plain bread       jam  americano  croissant  \
0  1.825116e+04   1.138031     0.391892  0.095077   0.188707   0.361486   
1  1.293000e+06   6.000000     5.000000  0.000000   0.000000   5.000000   
2  3.495130e+04   2.492795     0.608069  0.149856   0.351585   0.850144   

   caffe latte  tiramisu croissant  cacao deep  pain au chocolat  ...  \
0     0.082529            0.335907    0.143340          0.258687  ...   
1     0.000000            0.000000    0.000000          5.000000  ...   
2     0.123919            0.717579    0.193084          0.533141  ...   

   hour_cos   min_sin   min_cos  day_of_month_sin  day_of_month_cos  \
0 -0.861576  0.075113  0.043259         -0.012735          0.003295   
1 -0.991000 -0.638000 -0.770000         -0.849000          0.529000   
2 -0.875398  0.111781  0.058625         -0.042285          0.021715   

   day_of_week_nr_sin  day_of_week_nr_cos  week_of_month_sin  \
0           -0.157986            0.168436           0.088

In [88]:
# Show the cluster index that the fitted K-Means model assigned to each instance
cluster_labels = bakery_kmeans.labels_
print("Cluster Labels:")
print(cluster_labels)

array([0, 0, 2, ..., 0, 0, 0], dtype=int32)

## Silhouette study for K-Means

In [None]:
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

# Candidate numbers of clusters (the silhouette needs at least 2 clusters)
range_n_clusters = list(range(2, 11))

# Mean silhouette score for each candidate k
silhouette_scores = []

# The study is run on the standardized matrix `data_scaled` — the same input the final
# K-Means model is fitted on. A loop-local name is used for the labels so the
# `cluster_labels` of the final model are not overwritten.
for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    labels_k = kmeans.fit_predict(data_scaled)
    silhouette_avg = silhouette_score(data_scaled, labels_k)
    silhouette_scores.append(silhouette_avg)

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, silhouette_scores, marker='o')
plt.title(f'{file_tag} | Silhouette Scores for KMeans Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Report the k whose mean silhouette is highest (first one wins on ties)
best_position = max(range(len(silhouette_scores)), key=silhouette_scores.__getitem__)
best_k = range_n_clusters[best_position]
print(f"Optimal number of clusters (Silhouette Score): {best_k}")

# [prof] Plotting clustering solutions



In [None]:
# Grid of scatter plots: one row per dataset, one column per clustering algorithm.
# NOTE(review): `datasets`, `all_predictions`, `all_efficiency`, `algorithms`, `islice` and
# `cycle` are not defined or imported in the visible part of this notebook — this cell
# presumably relies on setup from the original course notebook; confirm before running.
# Set up cluster parameters
plt.figure(figsize=(7, 6))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01)
color_array = ['#377eb8','#ff7f00','#4daf4a','#f781bf','#a65628','#984ea3','#999999','#e41a1c','#dede00']
plot_num = 1
for k in range(len(datasets)):
    # Per-dataset results, each indexed by algorithm name.
    predictions = all_predictions[k]
    efficiency = all_efficiency[k]
    X, y = datasets[k][0]
    # Standardize the features before plotting the first two columns.
    X = StandardScaler().fit_transform(X)
    
    for name in predictions:
        y_pred = predictions[name]
        plt.subplot(len(datasets), len(algorithms), plot_num)
        # Algorithm names are shown as column titles on the first row only.
        if k == 0: plt.title(name, size=10)
        # One colour per predicted label, cycling through the palette if needed.
        colors = np.array(list(islice(cycle(color_array),int(max(y_pred) + 1))))
        colors = np.append(colors, ["#000000"]) #black color for outliers (if any)
        
        # A label of -1 indexes the last entry of `colors`, i.e. the appended black.
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        # Print the stored efficiency value (formatted as seconds) in the lower-right corner.
        plt.text(.99, .01, ('%.2fs' % efficiency[name]).lstrip('0'),
                 transform=plt.gca().transAxes,size=15,horizontalalignment='right')
        plot_num += 1

plt.show()


# Others: spectral, agglomerative, dbscan, model-based


# Cluster Visualization

## 1. Cluster Heatmap (For Feature Importance)
Another way to analyze clustering is by visualizing the centroid values for each feature.

Heatmap of Cluster Centroids: Helps interpret which features are important for each cluster.



In [None]:
import seaborn as sns
import pandas as pd

# Create a dataframe of centroids.
# Columns are named after the features the scaler was fitted on; `data_copy.columns` cannot be
# used because `data_copy` has gained a `cluster` column and no longer matches the width of
# the centroid matrix.
centroids = pd.DataFrame(bakery_kmeans.cluster_centers_, columns=list(scaler.feature_names_in_))

# Plot a heatmap of feature values for each cluster (features as rows, clusters as columns)
plt.figure(figsize=(12, 6))
sns.heatmap(centroids.T, cmap="coolwarm", annot=True, fmt=".2f", linewidths=0.5)
plt.title("K-Means Cluster Centroids")
plt.xlabel("Cluster")
plt.ylabel("Feature")
plt.show()


## 2. PCA visualization

In [None]:
from sklearn.decomposition import PCA

# Visualization: project the data onto its first 6 principal components.
# PCA is fitted on the standardized matrix `data_scaled`, which is the feature space
# `bakery_kmeans` was trained in, so its cluster centers can be projected with this same
# `pca` object and land in the right place relative to the points.
pca = PCA(n_components=6)
bakery_pca = pca.fit_transform(data_scaled)

# One column per component, named PC1..PC6.
pca_bakery_df = pd.DataFrame(bakery_pca, columns=[f'PC{i+1}' for i in range(bakery_pca.shape[1])])
pca_bakery_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
0,2.599017,-0.185011,-0.908678,1.286388,0.514383,0.836274
1,-1.738243,0.26758,-1.241109,-1.036334,-0.167371,-1.384709
2,-0.547021,1.050453,0.877761,-0.585861,0.651541,0.365785
3,-0.657266,0.439276,-0.52511,-0.277721,-1.629132,0.361978
4,-0.94373,0.943847,1.177913,0.45452,1.209342,-1.794241


### PCA  explained variance ratio

In [None]:
# `pca` was already fitted when the component scores were computed, so it is reused here
# instead of being fitted a second time. This keeps the reported ratios tied to the exact
# model that produced `pca_bakery_df`.

# Explained variance ratio of each retained component
explained_variance_reorder = pca.explained_variance_ratio_
print(f'Explained variance ratio: {explained_variance_reorder}')


Explained variance ratio: [0.2266113  0.16750973 0.15027046 0.14209286 0.13990889 0.10285501]


### 2D PCA Plot

In [None]:
plt.figure(figsize=(12, 6))

# KMeans Clustering: points coloured by their K-Means label, centroids overlaid
plt.subplot(121)
plt.scatter(pca_bakery_df['PC1'], pca_bakery_df['PC2'], c=bakery_y_pred_kmeans, cmap='viridis', alpha=0.5)
# NOTE(review): projecting the centers is only meaningful if `pca` was fitted in the same
# feature space `bakery_kmeans` was trained in — verify.
kmeans_centroids = pca.transform(bakery_kmeans.cluster_centers_)
plt.scatter(kmeans_centroids[:, 0], kmeans_centroids[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('KMeans Clustering')
plt.legend()

# Agglomerative Clustering: points coloured by the hierarchical labels.
# (This panel previously reused the K-Means labels, so both panels showed the same colouring.)
plt.subplot(122)
plt.scatter(pca_bakery_df['PC1'], pca_bakery_df['PC2'], c=hier_model.labels_, cmap='viridis', alpha=0.5)
# Agglomerative clustering does not have centroids, so we skip this part
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Agglomerative Clustering')

plt.suptitle(f"{file_tag} | PCA Cluster Visualization", fontsize=16)

plt.show()

### 3D PCA Plot

In [None]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(12, 6))

# KMeans Clustering: points coloured by their K-Means label, centroids overlaid
ax = fig.add_subplot(121, projection='3d')
ax.scatter(pca_bakery_df['PC1'], pca_bakery_df['PC2'], pca_bakery_df['PC3'], c=bakery_y_pred_kmeans, cmap='viridis', alpha=0.5)
# NOTE(review): projecting the centers is only meaningful if `pca` was fitted in the same
# feature space `bakery_kmeans` was trained in — verify.
kmeans_centroids_3d = pca.transform(bakery_kmeans.cluster_centers_)
ax.scatter(kmeans_centroids_3d[:, 0], kmeans_centroids_3d[:, 1], kmeans_centroids_3d[:, 2], c='red', marker='X', s=200, label='Centroids')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('KMeans Clustering')
ax.legend()

# Agglomerative Clustering: points coloured by the hierarchical labels.
# (This panel previously reused the K-Means labels, so both panels showed the same colouring.)
ax = fig.add_subplot(122, projection='3d')
ax.scatter(pca_bakery_df['PC1'], pca_bakery_df['PC2'], pca_bakery_df['PC3'], c=hier_model.labels_, cmap='viridis', alpha=0.5)
# Agglomerative clustering does not have centroids, so we skip this part
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('Agglomerative Clustering')

fig.suptitle(f"{file_tag} | PCA Cluster Visualization", fontsize=16)

plt.show()