# Basket Clustering Analysis

This notebook performs basket clustering, compares K-Means with Agglomerative/DBSCAN/HDBSCAN, evaluates using silhouette/DBI/CH metrics, and analyzes actionability of clusters. It also compares basket-level vs product-level clustering for marketing insights.

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import time
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## Load Transaction Data

In [3]:
# Load cleaned transaction data
df = pd.read_csv('../data/processed/cleaned_uk_data.csv')
print(f"Data shape: {df.shape}")
print(df.head())
print(f"Unique customers: {df['CustomerID'].nunique()}")
print(f"Unique products: {df['StockCode'].nunique()}")
print(f"Unique invoices: {df['InvoiceNo'].nunique()}")

Data shape: (485123, 11)
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

           InvoiceDate  UnitPrice CustomerID         Country  TotalPrice  \
0  2010-12-01 08:26:00       2.55     017850  United Kingdom       15.30   
1  2010-12-01 08:26:00       3.39     017850  United Kingdom       20.34   
2  2010-12-01 08:26:00       2.75     017850  United Kingdom       22.00   
3  2010-12-01 08:26:00       3.39     017850  United Kingdom       20.34   
4  2010-12-01 08:26:00       3.39     017850  United Kingdom       20.34   

   DayOfWeek  HourOfDay  
0          2          8  
1          2          8  
2      

## Preprocessing and Basket Matrix Creation

In [4]:
# Filter top products to reduce dimensionality
top_products = df['StockCode'].value_counts().head(100).index
df_filtered = df[df['StockCode'].isin(top_products)]

# Create basket matrix (binary: customer x product)
basket_matrix = df_filtered.pivot_table(
    index='CustomerID', 
    columns='StockCode', 
    values='Quantity', 
    aggfunc='sum', 
    fill_value=0
).astype(bool).astype(int)  # Binary matrix

print(f"Basket matrix shape: {basket_matrix.shape}")
print(basket_matrix.head())

Basket matrix shape: (3549, 100)
StockCode   20712  20719  20723  20724  20725  20726  20727  20728  20914  \
CustomerID                                                                  
000nan          1      1      1      1      1      1      1      1      1   
012747          0      0      0      0      0      0      0      0      0   
012748          1      1      1      1      1      1      1      1      1   
012749          0      0      0      0      0      0      0      0      1   
012820          1      0      0      0      0      0      0      0      0   

StockCode   21034  ...  84879  84946  84978  84991  85099B  85099C  85099F  \
CustomerID         ...                                                       
000nan          1  ...      1      1      1      1       1       1       1   
012747          0  ...      1      0      0      0       1       1       1   
012748          1  ...      1      1      1      1       1       1       1   
012749          0  ...      1      0 

## Clustering Algorithms

In [6]:
def run_clustering(X, algorithm, params):
    start_time = time.time()
    if algorithm == 'KMeans':
        model = KMeans(n_clusters=params['n_clusters'], random_state=RANDOM_STATE)
    elif algorithm == 'Agglomerative':
        model = AgglomerativeClustering(n_clusters=params['n_clusters'], linkage=params.get('linkage', 'ward'))
    elif algorithm == 'DBSCAN':
        model = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    
    labels = model.fit_predict(X)
    runtime = time.time() - start_time
    return labels, runtime

# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(basket_matrix)

# Run algorithms
algorithms = [
    ('KMeans', {'n_clusters': 3}),
    ('Agglomerative', {'n_clusters': 3, 'linkage': 'ward'}),
    ('DBSCAN', {'eps': 1.5, 'min_samples': 5})
]

results = {}
for algo, params in algorithms:
    labels, runtime = run_clustering(X_scaled, algo, params)
    results[algo] = {'labels': labels, 'runtime': runtime}
    print(f"{algo}: {len(set(labels))} clusters, runtime: {runtime:.2f}s")

KMeans: 3 clusters, runtime: 0.02s
Agglomerative: 3 clusters, runtime: 1.02s


NameError: name 'DBSCAN' is not defined

## Evaluation Metrics

In [None]:
def evaluate_clustering(X, labels):
    if len(set(labels)) <= 1:
        return {'Silhouette': np.nan, 'DBI': np.nan, 'CH': np.nan}
    
    sil = silhouette_score(X, labels)
    dbi = davies_bouldin_score(X, labels)
    ch = calinski_harabasz_score(X, labels)
    return {'Silhouette': sil, 'DBI': dbi, 'CH': ch}

# Evaluate each algorithm
evaluation_results = {}
for algo, res in results.items():
    metrics = evaluate_clustering(X_scaled, res['labels'])
    evaluation_results[algo] = {**metrics, 'Runtime': res['runtime']}

eval_df = pd.DataFrame(evaluation_results).T
print(eval_df)

## Actionability Analysis

In [None]:
def analyze_actionability(basket_matrix, labels, algo_name):
    basket_with_labels = basket_matrix.copy()
    basket_with_labels['cluster'] = labels
    
    # Cluster sizes
    cluster_sizes = basket_with_labels['cluster'].value_counts().sort_index()
    
    # Top products per cluster
    top_products = {}
    for cluster in cluster_sizes.index:
        cluster_data = basket_with_labels[basket_with_labels['cluster'] == cluster]
        product_sums = cluster_data.drop('cluster', axis=1).sum().sort_values(ascending=False)
        top_products[cluster] = product_sums.head(5).index.tolist()
    
    return {
        'cluster_sizes': cluster_sizes,
        'top_products': top_products
    }

# Analyze actionability for each algorithm
actionability = {}
for algo, res in results.items():
    actionability[algo] = analyze_actionability(basket_matrix, res['labels'], algo)

# Print results
for algo, analysis in actionability.items():
    print(f"\n{algo} Actionability:")
    print(f"Cluster sizes: {analysis['cluster_sizes'].to_dict()}")
    for cluster, products in analysis['top_products'].items():
        print(f"Cluster {cluster} top products: {products}")

## Comparison and Visualization

In [None]:
# PCA for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot comparison
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for i, (algo, res) in enumerate(results.items()):
    ax = axes[i]
    scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=res['labels'], cmap='viridis', alpha=0.6)
    ax.set_title(f'{algo} Clustering')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    plt.colorbar(scatter, ax=ax)

plt.tight_layout()
plt.show()

# Summary table
summary = eval_df.copy()
summary['Actionability'] = [f"{len(actionability[algo]['cluster_sizes'])} clusters" for algo in summary.index]
print("\nSummary Comparison:")
print(summary)

## Product Clustering for Comparison

In [None]:
# Create product co-occurrence matrix
from sklearn.metrics.pairwise import cosine_similarity

# Transpose basket matrix to get product x customer
product_matrix = basket_matrix.T

# Calculate co-occurrence (dot product of binary matrix)
co_occurrence = product_matrix.dot(product_matrix.T)

# Convert to similarity matrix
product_similarity = cosine_similarity(co_occurrence)

# Cluster products using K-Means
product_kmeans = KMeans(n_clusters=5, random_state=RANDOM_STATE)
product_labels = product_kmeans.fit_predict(product_similarity)

print(f"Product clustering: {len(set(product_labels))} clusters")
print(f"Products per cluster: {pd.Series(product_labels).value_counts().sort_index()}")

# Show example products per cluster
product_clusters = pd.DataFrame({
    'StockCode': product_matrix.index,
    'cluster': product_labels
})

for cluster in sorted(set(product_labels)):
    products_in_cluster = product_clusters[product_clusters['cluster'] == cluster]['StockCode'].head(5).tolist()
    print(f"Cluster {cluster} sample products: {products_in_cluster}")

# Evaluate product clustering
if len(set(product_labels)) > 1:
    prod_sil = silhouette_score(product_similarity, product_labels)
    prod_dbi = davies_bouldin_score(product_similarity, product_labels)
    prod_ch = calinski_harabasz_score(product_similarity, product_labels)
    print(f"Product clustering metrics - Silhouette: {prod_sil:.3f}, DBI: {prod_dbi:.3f}, CH: {prod_ch:.3f}")

## Basket vs Product Clustering Comparison

In [None]:
# Compare basket vs product clustering insights
comparison_data = {
    'Approach': ['Basket Clustering (Customer-level)', 'Product Clustering (Product-level)'],
    'Input': ['Customer purchase patterns (3549 customers x 100 products)', 'Product co-occurrence (100 products x 100 products)'],
    'Clusters': [f"{len(set(results['KMeans']['labels']))} customer clusters", f"{len(set(product_labels))} product clusters"],
    'Silhouette': [f"{eval_df.loc['KMeans', 'Silhouette']:.3f}", f"{prod_sil:.3f}"],
    'Marketing Use Case': ['Customer segmentation, personalized campaigns', 'Product recommendations, bundle creation'],
    'Actionability': ['Direct customer targeting, retention strategies', 'Cross-sell suggestions, shelf optimization'],
    'Advantage': ['Customer-centric insights, easier to action', 'Product discovery, automated recommendations']
}

comparison_df = pd.DataFrame(comparison_data)
print("Basket vs Product Clustering Comparison:")
print(comparison_df.to_string(index=False))

print("\nConclusion:")
print("- Basket clustering provides more actionable customer insights for direct marketing campaigns")
print("- Product clustering helps with product recommendations and merchandising")
print("- For this project, basket clustering aligns better with customer segmentation goals")