# K-Means Clustering Tutorial - Code Companion

This notebook contains all the code used to generate figures and demonstrations for the clustering tutorial.

**GitHub Repository**: [Insert your repository link here]

## References

1. Arthur, D., & Vassilvitskii, S. (2007). k-means++: The advantages of careful seeding. In Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms (pp. 1027-1035).
2. Pedregosa, F., et al. (2011). Scikit-learn: Machine learning in Python. Journal of machine learning research, 12(Oct), 2825-2830.
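3. MacQueen, J. (1967). Some methods for classification and analysis of multivariate observations. In Proceedings of the fifth Berkeley symposium on mathematical statistics and probability (Vol. 1, pp. 281-297).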
4. Lloyd, S. (1982). Least squares quantization in PCM. IEEE transactions on information theory, 28(2), 129-137.

In [None]:
# Setup and imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress every warning category so library notices stay out of the cell output
warnings.filterwarnings(action='ignore')

# Global plot appearance, plus a fixed seed for NumPy's global random state
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})
np.random.seed(seed=42)

In [None]:
# Generate dataset with overlapping clusters
# The blob settings are gathered in one mapping so the geometry is easy to tweak.
blob_params = {
    'n_samples': 300,
    'centers': 4,
    'n_features': 2,
    'cluster_std': 1.5,
    'random_state': 42,
}
X, y_true = make_blobs(**blob_params)

print(f"Dataset shape: {X.shape}")

In [None]:
# Figure 1: Random initialization variability
# One panel per seed. Every model gets a single random start (n_init=1), so the
# panels differ only through the seed used for initialization.
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.ravel()
colors = ['#0173B2', '#DE8F05', '#029E73', '#CC78BC']

for idx, ax in enumerate(axes):
    kmeans = KMeans(n_clusters=4, init='random', n_init=1, random_state=idx)
    labels = kmeans.fit_predict(X)

    # Points, one colour per cluster label
    for cluster_id, color in enumerate(colors):
        members = X[labels == cluster_id]
        ax.scatter(members[:, 0], members[:, 1], c=color,
                   s=30, alpha=0.6, edgecolors='black', linewidth=0.3)

    # Final centroids drawn on top of the points
    centers = kmeans.cluster_centers_
    ax.scatter(centers[:, 0], centers[:, 1],
               c='red', s=200, marker='X', edgecolors='black',
               linewidth=2, label='Centroids')

    ax.set_title(f"Run {idx+1} - Inertia: {kmeans.inertia_:.1f}", fontsize=11)
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

fig.suptitle('Random Initialization: 6 Different Runs Show Inconsistent Results',
             fontsize=16, fontweight='bold')
fig.tight_layout()
plt.savefig('figure1_random_init.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 2: K-Means++ initialization comparison
def _draw_clustering(ax, labels, centers, title):
    """Scatter X coloured by cluster label on `ax`, overlay `centers`, then title and label the panel."""
    for cluster, color in enumerate(colors):
        members = X[labels == cluster]
        ax.scatter(members[:, 0], members[:, 1], c=color,
                   s=40, alpha=0.6, edgecolors='black', linewidth=0.5)
    ax.scatter(centers[:, 0], centers[:, 1],
               c='red', s=250, marker='X', edgecolors='black',
               linewidth=2.5, label='Centroids')
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.legend()


fig, axes = plt.subplots(1, 2, figsize=(14, 6))
left_ax, right_ax = axes

# Worst random result: fit 20 single-start models (seeds 0-19) and keep the
# (inertia, centers, labels) triple with the highest inertia.
random_results = []
for i in range(20):
    km = KMeans(n_clusters=4, init='random', n_init=1, random_state=i).fit(X)
    random_results.append((km.inertia_, km.cluster_centers_, km.labels_))
worst_random = max(random_results, key=lambda result: result[0])

_draw_clustering(
    left_ax,
    labels=worst_random[2],
    centers=worst_random[1],
    title=f'Random Init (worst case)\nInertia: {worst_random[0]:.1f}',
)

# K-means++ result (best of 10 initializations)
kmeans_pp = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
labels_pp = kmeans_pp.fit_predict(X)

_draw_clustering(
    right_ax,
    labels=labels_pp,
    centers=kmeans_pp.cluster_centers_,
    title=f'K-Means++ Init\nInertia: {kmeans_pp.inertia_:.1f}',
)

fig.tight_layout()
plt.savefig('figure2_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Relative inertia reduction of the k-means++ fit versus the worst random run
print(f"Improvement: {(worst_random[0] - kmeans_pp.inertia_) / worst_random[0] * 100:.1f}%")

In [None]:
# Figure 3: Impact of n_init parameter
n_init_values = [1, 5, 10, 20, 50]
n_trials = 30


def _inertia_samples(init, n_init):
    """Fit KMeans on X once per seed in range(n_trials) and return the final inertias.

    Each fit is seeded through `random_state`, so the returned list does not
    depend on the order in which the different configurations are evaluated.
    """
    return [
        KMeans(n_clusters=4, init=init, n_init=n_init, random_state=trial).fit(X).inertia_
        for trial in range(n_trials)
    ]


def _inertia_boxplot(ax, samples, facecolor, title):
    """Draw one box per entry of `n_init_values` on `ax` and style the panel.

    Returns the dictionary of artists produced by `ax.boxplot`.
    """
    artists = ax.boxplot(samples, patch_artist=True)
    # Tick labels are set on the axis instead of through boxplot's `labels`
    # keyword: Matplotlib 3.9 deprecated that keyword in favour of
    # `tick_labels`, and this notebook silences all warnings, so the
    # deprecation would otherwise go unnoticed until the keyword is removed.
    ax.set_xticklabels([str(n) for n in n_init_values])
    for patch in artists['boxes']:
        patch.set_facecolor(facecolor)
        patch.set_alpha(0.6)
    ax.set_xlabel('n_init parameter', fontsize=12, fontweight='bold')
    ax.set_ylabel('Inertia', fontsize=12, fontweight='bold')
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    return artists


# Inertia distributions keyed by n_init, one list of n_trials values each
random_results_ninit = {n: _inertia_samples('random', n) for n in n_init_values}
kmpp_results_ninit = {n: _inertia_samples('k-means++', n) for n in n_init_values}

# Create box plot
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

random_data = [random_results_ninit[n] for n in n_init_values]
kmpp_data = [kmpp_results_ninit[n] for n in n_init_values]

bp1 = _inertia_boxplot(axes[0], random_data, '#0173B2', 'Random Initialization')
bp2 = _inertia_boxplot(axes[1], kmpp_data, '#DE8F05', 'K-Means++ Initialization')

plt.suptitle('Impact of n_init Parameter on Result Consistency',
             fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('figure3_ninit.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Figure 4: Customer segmentation example
np.random.seed(42)

# Create realistic customer segments.
# Each row: (spend mean, spend std, frequency mean, frequency std, customers).
# Spend is drawn before frequency within each segment, and segments are
# processed in order, so the sequence of random draws is fixed by the seed.
segment_specs = [
    (500, 100, 2, 0.5, 100),
    (100, 30, 15, 3, 150),
    (250, 50, 8, 2, 200),
    (1000, 200, 6, 1.5, 50),
]

spend_parts = []
freq_parts = []
for spend_mean, spend_std, freq_mean, freq_std, n_customers in segment_specs:
    spend_parts.append(np.random.normal(spend_mean, spend_std, n_customers))
    freq_parts.append(np.random.normal(freq_mean, freq_std, n_customers))

customer_spend = np.concatenate(spend_parts)
customer_freq = np.concatenate(freq_parts)
X_customers = np.column_stack([customer_spend, customer_freq])

# Scale and cluster
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_customers)

kmeans_customer = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
labels_customer = kmeans_customer.fit_predict(X_scaled)

# Visualize in the original (unscaled) units
fig, ax = plt.subplots(figsize=(10, 7))

for cluster, color in enumerate(colors):
    members = X_customers[labels_customer == cluster]
    ax.scatter(members[:, 0], members[:, 1],
               c=color, s=40, alpha=0.6,
               edgecolors='black', linewidth=0.3,
               label=f'Segment {cluster+1}')

# Cluster centres were fitted on scaled data; map them back for plotting/reporting
centroids = scaler.inverse_transform(kmeans_customer.cluster_centers_)
ax.scatter(centroids[:, 0], centroids[:, 1],
           c='red', s=300, marker='X', edgecolors='black',
           linewidth=2.5, label='Centroids', zorder=5)

ax.set_xlabel('Monthly Spend ($)', fontsize=13, fontweight='bold')
ax.set_ylabel('Purchase Frequency (visits/month)', fontsize=13, fontweight='bold')
ax.set_title('Customer Segmentation Using K-Means++', fontsize=15, fontweight='bold')
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.savefig('figure4_customer_segmentation.png', dpi=300, bbox_inches='tight')
plt.show()

# Print segment characteristics
print("\nSegment Characteristics:")
for i in range(len(centroids)):
    mask = labels_customer == i
    print(f"\nSegment {i+1}:")
    print(f"  Size: {np.sum(mask)} customers")
    print(f"  Avg Spend: ${centroids[i, 0]:.2f}")
    print(f"  Avg Frequency: {centroids[i, 1]:.2f} visits/month")

In [None]:
# Best practices function
def apply_kmeans_clustering(X, n_clusters=4, n_init=10, max_iter=300, random_state=42):
    """
    Apply K-Means clustering with best practices.

    The data is passed through a StandardScaler first, and the model is
    fitted on the scaled data. New samples must therefore be transformed with
    the returned scaler before being passed to the returned model.

    Parameters:
    -----------
    X : array-like, shape (n_samples, n_features)
        Input data
    n_clusters : int
        Number of clusters to form
    n_init : int
        Number of k-means++ initializations to try (default 10)
    max_iter : int
        Maximum iterations per run (default 300)
    random_state : int or None
        Seed forwarded to KMeans for reproducibility (default 42)

    Returns:
    --------
    dict : Dictionary with the keys
        'model'      - fitted KMeans estimator (trained on scaled data)
        'labels'     - cluster label for each sample
        'scaler'     - fitted StandardScaler
        'inertia'    - inertia of the fitted model
        'silhouette' - silhouette score on the scaled data, or NaN when the
                       fit produced fewer than 2 distinct labels or one label
                       per sample (the score is undefined in those cases)
    """
    # Step 1: Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 2: Apply K-Means with best practices
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',           # Use intelligent initialization
        n_init=n_init,              # Number of different initializations
        max_iter=max_iter,          # Maximum iterations per run
        random_state=random_state   # For reproducibility
    )

    labels = kmeans.fit_predict(X_scaled)

    # Step 3: Calculate quality metrics
    # silhouette_score raises ValueError unless the number of distinct labels
    # lies in [2, n_samples - 1]; report NaN instead of failing the whole call.
    n_distinct_labels = np.unique(labels).size
    if 2 <= n_distinct_labels < labels.shape[0]:
        silhouette = silhouette_score(X_scaled, labels)
    else:
        silhouette = float('nan')

    return {
        'model': kmeans,
        'labels': labels,
        'scaler': scaler,
        'inertia': kmeans.inertia_,
        'silhouette': silhouette
    }

# Example usage: cluster the synthetic customer data and report both metrics
results = apply_kmeans_clustering(X=X_customers, n_clusters=4)
for report_line in (
    f"Silhouette Score: {results['silhouette']:.3f}",
    f"Inertia: {results['inertia']:.2f}",
):
    print(report_line)

## Summary

This notebook demonstrates:
1. The variability of random initialization
2. The consistency improvement with K-Means++
3. The impact of the n_init parameter
4. A practical customer segmentation example
5. Best practices for applying K-Means clustering

All figures are saved as PNG files for use in the tutorial document.