In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

# Read the ISO-8859-1 encoded CSV, discard every row containing a missing
# value, and parse 'InvoiceDate' into datetimes -- all without mutating an
# intermediate frame in place.
data = (
    pd.read_csv("data.csv", encoding="ISO-8859-1")
    .dropna()
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))
)

# The two columns used as clustering features
feature_columns = ['Quantity', 'UnitPrice']
X = data[feature_columns]

# Standardise the features: fit the scaler on X, then transform that same X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [2]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Define a function to calculate silhouette score for k-means clustering
def kmeans_silhouette_score(X, n_clusters, random_state=42):
    """Cluster ``X`` with k-means and return the silhouette score of the result.

    Parameters
    ----------
    X : array-like
        Feature matrix; passed unchanged to ``KMeans.fit_predict`` and to
        ``silhouette_score``.
    n_clusters : int
        Number of clusters requested from ``KMeans``.
    random_state : int or None, default=42
        Seed forwarded to ``KMeans``. The default reproduces the value that
        used to be hard-coded, so existing two-argument calls are unaffected.

    Returns
    -------
    Whatever ``silhouette_score(X, labels)`` returns for the fitted labels.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    cluster_labels = kmeans.fit_predict(X)
    return silhouette_score(X, cluster_labels)

# Define a function to calculate silhouette score for hierarchical clustering
def hierarchical_silhouette_score(X, n_clusters):
    """Cluster ``X`` agglomeratively and return the silhouette score.

    ``X`` is handed to ``AgglomerativeClustering(n_clusters=n_clusters)`` to
    obtain labels, and the same ``X`` plus those labels are then scored with
    ``silhouette_score``.
    """
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(X)
    return silhouette_score(X, labels)




In [3]:
# Sweep k = 2..10 with k-means, keeping one (k, silhouette score) pair per k
kmeans_scores = [
    (k, kmeans_silhouette_score(X_scaled, k))
    for k in range(2, 11)
]

print("Silhouette scores for k-means clustering:")


Silhouette scores for k-means clustering:


In [4]:
# Report the k-means sweep, one line per cluster count
for entry in kmeans_scores:
    n_clusters, score = entry
    print(f"Number of clusters = {n_clusters}: Silhouette score = {score}")

# Same sweep (k = 2..10) for hierarchical clustering
hierarchical_scores = [
    (k, hierarchical_silhouette_score(X_scaled, k))
    for k in range(2, 11)
]

print("\nSilhouette scores for hierarchical clustering:")


Number of clusters = 2: Silhouette score = 0.9704551084620618
Number of clusters = 3: Silhouette score = 0.9352138557358147
Number of clusters = 4: Silhouette score = 0.8094997818495548
Number of clusters = 5: Silhouette score = 0.6278668553357658
Number of clusters = 6: Silhouette score = 0.5433515387099137
Number of clusters = 7: Silhouette score = 0.5715165449471322
Number of clusters = 8: Silhouette score = 0.5328216899762243
Number of clusters = 9: Silhouette score = 0.5116131488255126
Number of clusters = 10: Silhouette score = 0.45688361625709045

Silhouette scores for hierarchical clustering:


In [5]:
# Report the hierarchical sweep, one line per cluster count
for entry in hierarchical_scores:
    n_clusters, score = entry
    print(f"Number of clusters = {n_clusters}: Silhouette score = {score}")

Number of clusters = 2: Silhouette score = 0.9704551084620618
Number of clusters = 3: Silhouette score = 0.9397724938249052
Number of clusters = 4: Silhouette score = 0.8044102379581943
Number of clusters = 5: Silhouette score = 0.6284416351226025
Number of clusters = 6: Silhouette score = 0.6062968538273046
Number of clusters = 7: Silhouette score = 0.6073086018490736
Number of clusters = 8: Silhouette score = 0.5695716894773587
Number of clusters = 9: Silhouette score = 0.42675624225753567
Number of clusters = 10: Silhouette score = 0.4384146139814552


In [6]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Define a function to calculate silhouette score for k-means clustering
def kmeans_silhouette_score(X, n_clusters, random_state=42):
    """Cluster ``X`` with k-means and return the silhouette score of the result.

    NOTE(review): this notebook already defines a function with this name in
    cell ``In [2]``; this later definition is the one in effect once the
    cell has run.

    Parameters
    ----------
    X : array-like
        Feature matrix; passed unchanged to ``KMeans.fit_predict`` and to
        ``silhouette_score``.
    n_clusters : int
        Number of clusters requested from ``KMeans``.
    random_state : int or None, default=42
        Seed forwarded to ``KMeans``. The default reproduces the value that
        used to be hard-coded, so existing two-argument calls are unaffected.

    Returns
    -------
    Whatever ``silhouette_score(X, labels)`` returns for the fitted labels.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    cluster_labels = kmeans.fit_predict(X)
    return silhouette_score(X, cluster_labels)

# Define a function to calculate silhouette score for hierarchical clustering
def hierarchical_silhouette_score(X, n_clusters):
    """Cluster ``X`` agglomeratively and return the silhouette score.

    ``X`` is handed to ``AgglomerativeClustering(n_clusters=n_clusters)`` to
    obtain labels, and the same ``X`` plus those labels are then scored with
    ``silhouette_score``.

    NOTE(review): a function with this name is also defined in cell
    ``In [2]`` of this notebook.
    """
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(X)
    return silhouette_score(X, labels)

# Cluster counts evaluated for both algorithms (k = 2..10)
CLUSTER_RANGE = range(2, 11)


def _silhouette_sweep(score_fn, X, cluster_counts):
    """Return ``[(k, score_fn(X, k)), ...]`` for every k in ``cluster_counts``."""
    return [(k, score_fn(X, k)) for k in cluster_counts]


def _print_scores(header, scores):
    """Print ``header`` followed by one line per ``(n_clusters, score)`` pair."""
    print(header)
    for n_clusters, score in scores:
        print(f"Number of clusters = {n_clusters}: Silhouette score = {score}")


# Evaluate silhouette score for each cluster count with k-means clustering
kmeans_scores = _silhouette_sweep(kmeans_silhouette_score, X_scaled, CLUSTER_RANGE)
_print_scores("Silhouette scores for k-means clustering:", kmeans_scores)

# Evaluate silhouette score for each cluster count with hierarchical clustering
hierarchical_scores = _silhouette_sweep(
    hierarchical_silhouette_score, X_scaled, CLUSTER_RANGE
)
_print_scores("\nSilhouette scores for hierarchical clustering:", hierarchical_scores)


Silhouette scores for k-means clustering:
Number of clusters = 2: Silhouette score = 0.9704551084620618
Number of clusters = 3: Silhouette score = 0.9352138557358147
Number of clusters = 4: Silhouette score = 0.8094997818495548
Number of clusters = 5: Silhouette score = 0.6278668553357658
Number of clusters = 6: Silhouette score = 0.5433515387099137
Number of clusters = 7: Silhouette score = 0.5715165449471322
Number of clusters = 8: Silhouette score = 0.5328216899762243
Number of clusters = 9: Silhouette score = 0.5116131488255126
Number of clusters = 10: Silhouette score = 0.45688361625709045

Silhouette scores for hierarchical clustering:
Number of clusters = 2: Silhouette score = 0.9704551084620618
Number of clusters = 3: Silhouette score = 0.9397724938249052
Number of clusters = 4: Silhouette score = 0.8044102379581943
Number of clusters = 5: Silhouette score = 0.6284416351226025
Number of clusters = 6: Silhouette score = 0.6062968538273046
Number of clusters = 7: Silhouette score