# Customer Segmentation for E-commerce Personalization

This notebook implements a complete customer segmentation workflow to support personalized marketing. It covers data loading, EDA, preprocessing, feature engineering (including RFM-like behavioral proxies), clustering experiments (KMeans, Gaussian Mixture, Agglomerative, DBSCAN), evaluation, visualization, and export of segment definitions and artifacts.

Objectives:
- Build and evaluate clustering models to discover meaningful customer segments.
- Describe segments and generate targeted marketing recommendations.
- Produce reproducible artifacts (pipeline + model + labeled dataset).

In [None]:
# Section 1 - Imports and environment check
import sys
import os
import platform
from datetime import datetime

# Data science libs (optional if not installed)
import importlib

REQUIRED = [
    'pandas', 'numpy', 'sklearn', 'matplotlib', 'seaborn', 'joblib'
]

# Import name -> pip distribution name, for packages where the two differ.
PIP_NAMES = {'sklearn': 'scikit-learn'}


def describe_package(pkg):
    """Return a one-line status string for the importable package *pkg*.

    The string is ``'<pkg> version <version>'`` when the import succeeds
    (``'unknown'`` if the module has no ``__version__``), a "NOT INSTALLED"
    hint when the import raises ``ImportError``, and a "FAILED TO IMPORT"
    line carrying the real error for any other exception, so a broken
    installation is not misreported as a missing one.
    """
    try:
        module = importlib.import_module(pkg)
    except ImportError:
        pip_name = PIP_NAMES.get(pkg, pkg)
        return f'{pkg} NOT INSTALLED - you can install with pip (pip install {pip_name})'
    except Exception as exc:  # best-effort environment report: never abort the cell
        return f'{pkg} FAILED TO IMPORT - {type(exc).__name__}: {exc}'
    return f"{pkg} version {getattr(module, '__version__', 'unknown')}"


print('Python', sys.version)
print('Platform', platform.platform())

for pkg in REQUIRED:
    print(describe_package(pkg))

# reproducibility
RANDOM_STATE = 42

# plotting defaults
import matplotlib.pyplot as plt

# The bundled seaborn style sheets were renamed to 'seaborn-v0_8-*' in
# Matplotlib 3.6 and the old names were removed in 3.8, so try the new name
# first and fall back to the legacy one on older installations.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print('No seaborn whitegrid style sheet found; keeping the default Matplotlib style')

import seaborn as sns
sns.set_context('notebook')


In [None]:
# Section 2 - Load dataset (data.csv)
import pandas as pd

# Notebooks have no __file__, so data.csv is resolved against the current
# working directory. (The previous abspath('__file__') expression used a
# quoted literal and therefore always resolved to this same directory.)
DATA_PATH = os.path.join(os.getcwd(), 'data.csv')
print('Looking for data at', DATA_PATH)

# Read with pandas, try to handle whitespace header quirks
raw = pd.read_csv(DATA_PATH, encoding='utf-8', engine='python')
# Strip stray leading/trailing whitespace from the header names
raw.columns = [c.strip() for c in raw.columns]

print('Shape:', raw.shape)
raw.head()


In [None]:
# Section 3 - Quick data audit and EDA

# Schema overview: dtype of every column, then missing-value counts
print('Columns and dtypes:')
print(raw.dtypes)

print('\nMissing values per column:')
print(raw.isnull().sum())

# Summary statistics for the numeric columns, including tail percentiles
num_cols = raw.select_dtypes(include=['number']).columns.tolist()
print('\nNumeric columns:', num_cols)

if num_cols:
    summary_percentiles = [.01, .05, .25, .5, .75, .95, .99]
    numeric_summary = raw[num_cols].describe(percentiles=summary_percentiles)
    display(numeric_summary.T)

# One histogram with a KDE overlay per numeric column
import matplotlib.pyplot as plt
for column_name in num_cols:
    observed = raw[column_name].dropna()
    plt.figure(figsize=(6,3))
    sns.histplot(observed, kde=True)
    plt.title(column_name)
    plt.show()


In [None]:
# Section 4 - Data cleaning (missing values, types)

# Cleaning steps applied here (customize as you inspect data):
# - Trim whitespace in string columns, leaving missing values untouched
# - Replace empty strings (including whitespace-only cells) with NaN
# - Drop columns that are not useful for segmentation (Address)

str_cols = raw.select_dtypes(include=['object']).columns.tolist()
for c in str_cols:
    # Strip only real strings. Casting the whole column with astype(str) would
    # turn NaN into the literal text 'nan', hiding missing values from
    # isnull() and from the imputation step later in the notebook.
    raw[c] = raw[c].map(lambda v: v.strip() if isinstance(v, str) else v)
    # Cells that are empty after stripping carry no information: mark as missing.
    raw[c] = raw[c].mask(raw[c] == '')

# Example: drop Address column (often not relevant for segmentation)
if 'Address' in raw.columns:
    print('Dropping Address column to focus on behavioral features')
    raw = raw.drop(columns=['Address'])

# Quick check after cleaning
raw.head()


In [None]:
# Section 5 - Outlier detection and handling
from scipy import stats

# Flag rows lying outside the 1.5 * IQR fences of any of the given columns
def detect_outliers_iqr(df, cols):
    """Return the sorted index labels of rows that are IQR outliers.

    A row is flagged when, for at least one column in ``cols``, its value is
    below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR`` of that column.
    Missing values satisfy neither comparison and are never flagged.
    """
    flagged = set()
    for name in cols:
        series = df[name]
        q1, q3 = series.quantile(0.25), series.quantile(0.75)
        fence = 1.5 * (q3 - q1)
        too_low = series < q1 - fence
        too_high = series > q3 + fence
        flagged.update(series[too_low | too_high].index.tolist())
    return sorted(flagged)

numeric_cols = raw.select_dtypes(include=['number']).columns.tolist()
outliers = detect_outliers_iqr(raw, numeric_cols)
print('Detected', len(outliers), 'outlier rows (IQR method)')

# Nothing is clipped or removed automatically. Options to apply by hand:
#   1. Clip values to percentile bounds, e.g.
#      raw[numeric_cols].clip(lower=raw[numeric_cols].quantile(0.01),
#                             upper=raw[numeric_cols].quantile(0.99), axis=1)
#   2. Winsorize using scipy or manual clipping
#   3. Remove outlier rows before clustering (careful - loses data)

# Boxplot of the first numeric column as a quick visual check
if numeric_cols:
    first_numeric = numeric_cols[0]
    boxplot_title = f'Boxplot - {first_numeric}'
    plt.figure(figsize=(8,3))
    sns.boxplot(x=raw[first_numeric])
    plt.title(boxplot_title)
    plt.show()


In [None]:
# Section 6 - Feature engineering (RFM-like & behavioral proxies)
# Note: original dataset lacks transaction-level dates; we will use available proxies:
# - Length of Membership (proxy for tenure)
# - Time on App, Time on Website (engagement)
# - Yearly Amount Spent (monetary)

feat_df = raw.copy()

# Coerce the behavioral columns to numeric; unparseable entries become NaN
base_columns = ['Time on App', 'Time on Website', 'Length of Membership', 'Yearly Amount Spent']
for col in base_columns:
    if col not in feat_df.columns:
        continue
    feat_df[col] = pd.to_numeric(feat_df[col], errors='coerce')

# Derived feature: app time relative to website time (epsilon avoids a zero denominator)
available = set(feat_df.columns)
if {'Time on App', 'Time on Website'} <= available:
    web_time = feat_df['Time on Website'] + 1e-6
    feat_df['App_vs_Web_ratio'] = feat_df['Time on App'] / web_time

# Derived feature: yearly spend divided by membership length
if {'Yearly Amount Spent', 'Length of Membership'} <= available:
    tenure = feat_df['Length of Membership'] + 1e-6
    feat_df['Spend_per_membership_year'] = feat_df['Yearly Amount Spent'] / tenure

# Clustering inputs: whichever of the wanted columns exist, in DataFrame column order
wanted_features = [
    'Time on App', 'Time on Website', 'Length of Membership',
    'Yearly Amount Spent', 'App_vs_Web_ratio', 'Spend_per_membership_year',
]
candidate_features = [col for col in feat_df.columns if col in wanted_features]
print('Candidate features:', candidate_features)

X = feat_df[candidate_features].copy()
X.head()


In [None]:
# Section 7 - Encoding categorical features
# Avatar may be a color string; low-cardinality frequency encoding is often useful
if 'Avatar' in feat_df.columns:
    # Frequency encoding: each Avatar value is replaced by its share of rows
    freq = feat_df['Avatar'].value_counts(normalize=True)
    feat_df['Avatar_freq'] = feat_df['Avatar'].map(freq)
    # Guard the append so that re-running this cell does not list the feature
    # twice, which would put a duplicated column into X.
    if 'Avatar_freq' not in candidate_features:
        candidate_features.append('Avatar_freq')

# If there are other categorical columns, consider one-hot or target encoding
print('Updated candidate features:', candidate_features)
X = feat_df[candidate_features].copy()
X.head()


In [None]:
# Section 8 - Feature scaling and pipeline construction
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler

# Numeric preprocessing: fill gaps with the column median, then apply RobustScaler
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
]
numeric_pipeline = Pipeline(preprocessing_steps)

# Fit on the candidate feature matrix and keep the transformed values for clustering
X_prepared = numeric_pipeline.fit_transform(X)
print('Prepared feature matrix shape:', X_prepared.shape)


In [None]:
# Section 9 - Dimensionality reduction (PCA + UMAP)
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 90% of the variance
pca = PCA(n_components=0.9, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_prepared)
print('PCA components retained:', pca.n_components_)

# Separate 2-component PCA used only for plotting
import numpy as np
pca_2d = PCA(n_components=2, random_state=RANDOM_STATE)
X_pca_2d = pca_2d.fit_transform(X_prepared)

plt.figure(figsize=(6,5))
pc1, pc2 = X_pca_2d[:,0], X_pca_2d[:,1]
plt.scatter(pc1, pc2, s=20, alpha=0.6)
plt.title('2D PCA projection of customers (unclustered)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

# UMAP (optional) - requires umap-learn. Any failure in this section, including
# a missing package, is reported and the notebook carries on.
try:
    import umap
    reducer = umap.UMAP(random_state=RANDOM_STATE)
    X_umap = reducer.fit_transform(X_prepared)
    plt.figure(figsize=(6,5))
    umap_x, umap_y = X_umap[:,0], X_umap[:,1]
    plt.scatter(umap_x, umap_y, s=20, alpha=0.6)
    plt.title('UMAP 2D embedding (unclustered)')
    plt.show()
except Exception as e:
    print('UMAP not available:', e)


In [None]:
# Section 10 - Baseline clustering with KMeans
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Sweep k and record three internal validity metrics for each fit
results = []
K_RANGE = range(2,11)
for k in K_RANGE:
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels = km.fit_predict(X_prepared)
    # All three scores need at least two distinct labels; check once per k.
    if len(set(labels)) > 1:
        sil = silhouette_score(X_prepared, labels)
        db = davies_bouldin_score(X_prepared, labels)
        ch = calinski_harabasz_score(X_prepared, labels)
    else:
        sil = db = ch = float('nan')
    results.append({'k':k, 'silhouette':sil, 'davies_bouldin':db, 'calinski_harabasz':ch})

results_df = pd.DataFrame(results)
# A bare expression in the middle of a cell renders nothing, so show the
# table explicitly (display is the IPython helper already used in Section 3).
display(results_df)

# Plot silhouette vs k
plt.figure(figsize=(6,3))
plt.plot(results_df['k'], results_df['silhouette'], marker='o')
plt.xlabel('k (clusters)')
plt.ylabel('Silhouette score')
plt.title('KMeans silhouette vs k')
plt.show()

# Choose best k by silhouette
if results_df['silhouette'].isna().all():
    raise ValueError('No k in K_RANGE produced more than one cluster; cannot choose best_k')
best_k = int(results_df.loc[results_df['silhouette'].idxmax(), 'k'])
print('Best k by silhouette:', best_k)

# Refit with the chosen k and attach the labels to the feature table
best_km = KMeans(n_clusters=best_k, random_state=RANDOM_STATE, n_init=10).fit(X_prepared)
labels_best = best_km.labels_
feat_df['cluster_kmeans'] = labels_best


In [None]:
# Section 11 - Alternative clustering: GMM, Agglomerative, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering, DBSCAN

# Gaussian Mixture with the same number of components as the chosen KMeans k
gmm = GaussianMixture(n_components=best_k, random_state=RANDOM_STATE)
gmm_labels = gmm.fit_predict(X_prepared)
gmm_silhouette = silhouette_score(X_prepared, gmm_labels)
print('GMM silhouette', gmm_silhouette)

# Agglomerative clustering, again with best_k clusters
agg = AgglomerativeClustering(n_clusters=best_k)
agg_labels = agg.fit_predict(X_prepared)
agg_silhouette = silhouette_score(X_prepared, agg_labels)
print('Agg silhouette', agg_silhouette)

# DBSCAN (baseline parameters); the label -1 is excluded from the cluster count as noise
dbscan = DBSCAN(eps=0.8, min_samples=5)
db_labels = dbscan.fit_predict(X_prepared)
unique_db = len({lbl for lbl in db_labels if lbl != -1})
print('DBSCAN clusters (excluding noise):', unique_db)


In [None]:
# Section 12 - Cluster selection & evaluation
# Consolidate metrics for chosen algorithms
from collections import OrderedDict

# Label assignments to compare, in the order they should be reported
label_sets = [
    ('kmeans', feat_df['cluster_kmeans']),
    ('gmm', gmm_labels),
    ('agglomerative', agg_labels),
]

# For each algorithm keep its labels together with the silhouette on X_prepared
cluster_eval = OrderedDict(
    (algo, {'labels': algo_labels, 'silhouette': silhouette_score(X_prepared, algo_labels)})
    for algo, algo_labels in label_sets
)

for name, vals in cluster_eval.items():
    print(name, 'silhouette:', vals['silhouette'])

# If ground-truth labels exist (rare), compute ARI/NMI here
# from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score


In [None]:
# Section 13 - Cluster profiling and top-3 characteristics per segment

def cluster_profile(df, features, label_col='cluster_kmeans'):
    """Summarize each cluster by size, feature means and most distinctive features.

    Args:
        df: DataFrame holding the feature columns and the label column.
        features: Names of the columns to profile.
        label_col: Name of the column holding the cluster labels.

    Returns:
        A dict keyed by cluster label. Each value is a dict with:
        ``'size'`` (number of rows in the cluster), ``'means'`` (feature name
        -> cluster mean) and ``'top3_features'`` (up to three feature names
        with the largest absolute relative deviation from the global mean).
        NumPy scalar labels are converted to native Python values so the
        result can be passed to ``json.dump`` directly; NumPy integers are not
        accepted as JSON object keys.
    """
    profiles = {}
    global_means = df[features].mean()
    # A zero global mean would make the relative deviation undefined; divide
    # by 1 for those features instead.
    denominators = global_means.replace(0, 1)
    for label in sorted(df[label_col].unique()):
        member = df[df[label_col]==label]
        means = member[features].mean()
        diff = (means - global_means) / denominators
        top3 = diff.abs().sort_values(ascending=False).head(3).index.tolist()
        # unique() yields NumPy scalars for numeric columns; unwrap them.
        key = label.item() if hasattr(label, 'item') else label
        profiles[key] = {
            'size': len(member),
            'means': means.to_dict(),
            'top3_features': top3
        }
    return profiles

# Profile the KMeans segments over the clustering feature list
features = candidate_features
profiles = cluster_profile(df=feat_df, features=features, label_col='cluster_kmeans')

import json
profiles_json = json.dumps(profiles, indent=2)
# Print only the first 1000 characters as a preview
print(profiles_json[:1000])


In [None]:
# Section 14 - Segment visualization (scatter, heatmap, radar)
# 2D scatter using PCA/UMAP colored by cluster
# PCA scatter with one colour per KMeans cluster and a matching legend
if 'cluster_kmeans' in feat_df.columns:
    lbls = feat_df['cluster_kmeans']
    plt.figure(figsize=(7,5))
    pc1, pc2 = X_pca_2d[:,0], X_pca_2d[:,1]
    scatter = plt.scatter(pc1, pc2, c=lbls, cmap='tab10', s=20)
    legend_handles, legend_labels = scatter.legend_elements()
    plt.legend(legend_handles, legend_labels, title='cluster')
    plt.title('PCA 2D colored by KMeans cluster')
    plt.show()

# Heatmap of per-cluster feature means, z-scored across clusters for each feature
centroids = feat_df.groupby('cluster_kmeans')[features].mean()
centroid_z = (centroids - centroids.mean()) / centroids.std()
plt.figure(figsize=(8,4))
sns.heatmap(centroid_z, annot=True, cmap='coolwarm')
plt.title('Cluster centroids (standardized)')
plt.show()


In [None]:
# Section 15 - Personalized marketing rules and recommendations per segment
# Example rule generation based on top features
# One rule per segment, driven by whether spend is among its top-3 features
marketing_rules = {}
for label, p in profiles.items():
    # Key by a native int: cluster labels may be NumPy integers, which
    # json.dumps rejects as object keys. An int key still matches later
    # lookups made with the NumPy label, because the two hash and compare equal.
    segment_id = int(label)
    top = p['top3_features']
    if 'Yearly Amount Spent' in top:
        action = 'Use targeted email campaigns focusing on product bundles for high spenders'
    else:
        # Catch-all branch: applied to every segment where spend is not a top feature
        action = 'Promote app features for high app usage users'
    marketing_rules[segment_id] = {
        'segment': segment_id,
        'size': int(p['size']),
        'top_features': top,
        'recommended_action': action
    }

import json
print('Example marketing rules for segments:')
print(json.dumps(marketing_rules, indent=2))


In [None]:
# Section 16 - Save results, export labeled dataset, and reproducibility
import joblib

# All artifacts are written next to the input data file
from datetime import timezone
output_dir = os.path.dirname(DATA_PATH)

# Save labeled dataset
out_path = os.path.join(output_dir, 'customer_segments.csv')
feat_df.to_csv(out_path, index=False)
print('Saved labeled customers to', out_path)

# Save preprocessing pipeline and clustering model
joblib.dump(numeric_pipeline, os.path.join(output_dir, 'preprocessing_pipeline.joblib'))
joblib.dump(best_km, os.path.join(output_dir, 'kmeans_model.joblib'))
print('Saved pipeline and model artifacts')

# Save small manifest
manifest = {
    # datetime.utcnow() is deprecated (Python 3.12) and returns a naive value;
    # an aware timestamp records the UTC offset explicitly in the ISO string.
    'created_at': datetime.now(timezone.utc).isoformat(),
    'rows': int(feat_df.shape[0]),
    # Snapshot the list so later edits to candidate_features do not alter the manifest dict
    'features': list(candidate_features),
    'random_state': RANDOM_STATE
}
import json
with open(os.path.join(output_dir, 'manifest.json'), 'w', encoding='utf-8') as f:
    json.dump(manifest, f, indent=2)
print('Wrote manifest.json')


In [None]:
# Section 17 - Automated checks and lightweight tests
# Minimal checks (not full pytest) to ensure preprocessing output shape and no NaNs
import numpy as np

# Report the shape (None if the object has no shape attribute), then check the matrix
prepared_shape = getattr(X_prepared, 'shape', None)
print('Prepared shape:', prepared_shape)
assert X_prepared is not None, 'X_prepared missing'

# Imputation should have removed every missing value before scaling
contains_nan = np.isnan(X_prepared).any()
assert not contains_nan, 'NaNs present after preprocessing - review imputation'

print('Smoke checks passed. If you want, I can add pytest-based tests in tests/test_preprocessing.py')


In [None]:
# Section 15 (expanded) - Personalized marketing rules, uplift simulation, and export
# We already created simple marketing_rules above. We'll expand with a light simulation and save recommendations.

# Example: assume baseline conversion rate per cluster (these should come from real analytics; placeholders here)
# If you have a real `conversion_rate` column, replace this with actual figures.

# (minimum mean 'Yearly Amount Spent', placeholder baseline rate), highest tier first.
SPEND_CONVERSION_TIERS = (
    (600, 0.06),
    (450, 0.035),
    (300, 0.02),
)
# Rate used when the mean spend is below every tier (or is missing)
DEFAULT_BASELINE_CONVERSION = 0.01

baseline_conversion = {}
for label, p in profiles.items():
    # heuristic based on mean Yearly Amount Spent if available
    mean_spend = p['means'].get('Yearly Amount Spent', 0)
    # First tier whose floor is reached wins; fall back to the default rate
    baseline_conversion[label] = next(
        (rate for floor, rate in SPEND_CONVERSION_TIERS if mean_spend >= floor),
        DEFAULT_BASELINE_CONVERSION,
    )

# Define recommended action per segment more explicitly

# Expected uplift (conservative estimate). Placeholder for planning A/B tests;
# the same 15% target is applied to every segment, so it is set once here.
uplift_pct = 0.15

marketing_recommendations = {}
for label, p in profiles.items():
    top = p['top3_features']
    mean_spend = p['means'].get('Yearly Amount Spent', 0)
    rec = {}

    # Channel and message tone: the first matching top feature decides
    if 'Time on App' in top or 'App_vs_Web_ratio' in top:
        rec['channel'] = 'Push notifications + in-app banners'
        rec['message'] = 'Highlight app-exclusive deals and frictionless checkout'
    elif 'Time on Website' in top:
        rec['channel'] = 'Email + website personalization'
        rec['message'] = 'Show tailored product carousels and limited-time discounts'
    elif 'Yearly Amount Spent' in top or 'Spend_per_membership_year' in top:
        rec['channel'] = 'Personalized email + premium offers'
        rec['message'] = 'VIP bundles, loyalty offers, early access'
    else:
        rec['channel'] = 'Email + social ads'
        rec['message'] = 'Promotional discounts and product discovery content'

    # Offer type, tiered by the segment's mean yearly spend
    if mean_spend >= 600:
        rec['offer'] = 'Premium bundles, cross-sell high-margin items, loyalty program invite'
    elif mean_spend >= 400:
        rec['offer'] = 'Sitewide % discount or free shipping threshold'
    else:
        rec['offer'] = 'Intro discount (10-15%) and product recommendations'

    # Expected conversion after a targeted campaign: baseline * (1 + uplift_pct)
    rec['baseline_conversion'] = baseline_conversion[label]
    rec['expected_conversion'] = rec['baseline_conversion'] * (1 + uplift_pct)
    rec['expected_absolute_lift'] = rec['expected_conversion'] - rec['baseline_conversion']
    rec['size'] = p['size']
    rec['top_features'] = top
    rec['recommended_action'] = marketing_rules.get(label, {}).get('recommended_action', '')

    # Key by a native int: cluster labels may be NumPy integers, which json.dump
    # rejects as object keys. Lookups made with the NumPy label still find the
    # entry because the two hash and compare equal.
    marketing_recommendations[int(label)] = rec

# Save recommendations as JSON next to the input data
import json
rec_path = os.path.join(os.path.dirname(DATA_PATH), 'marketing_recommendations.json')
with open(rec_path, 'w', encoding='utf-8') as f:
    json.dump(marketing_recommendations, f, indent=2)
print('Wrote marketing recommendations to', rec_path)

# Create a short markdown report summarizing segments
from datetime import timezone

# datetime.utcnow() is deprecated (Python 3.12). Take the current UTC time and
# drop the tzinfo so the ISO string keeps its previous form; the ' UTC' suffix
# in the report line names the zone.
generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

report_lines = []
report_lines.append('# Customer Segmentation Report')
report_lines.append(f'Generated: {generated_at} UTC')
report_lines.append('\n## Segment summaries')
for label, p in profiles.items():
    rpt = marketing_recommendations[label]
    # The heading separator is an em dash; it was previously stored as
    # mis-decoded bytes and showed up garbled in the report.
    report_lines.append(f"\n### Segment {label} — size: {p['size']}")
    report_lines.append(f"Top-3 features: {', '.join(p['top3_features'])}")
    report_lines.append("Mean metrics: ")
    for feat, val in p['means'].items():
        report_lines.append(f"- {feat}: {val:.2f}")
    report_lines.append(f"Recommended channel: {rpt['channel']}")
    report_lines.append(f"Suggested message: {rpt['message']}")
    report_lines.append(f"Suggested offer: {rpt['offer']}")
    report_lines.append(f"Baseline conv: {rpt['baseline_conversion']:.3f}, expected conv: {rpt['expected_conversion']:.3f} (abs lift {rpt['expected_absolute_lift']:.3f})")

report_path = os.path.join(os.path.dirname(DATA_PATH), 'segment_report.md')
with open(report_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(report_lines))
print('Wrote human-readable report to', report_path)


In [None]:
# Section 18 - Save cluster profiles (JSON) and final sanity checks
# Write the per-cluster profile dictionary next to the other artifacts
artifact_dir = os.path.dirname(DATA_PATH)
profiles_path = os.path.join(artifact_dir, 'cluster_profiles.json')
with open(profiles_path, 'w', encoding='utf-8') as f:
    json.dump(profiles, f, indent=2)
print('Saved cluster profile summary to', profiles_path)

# Final sanity: report whether each expected artifact file is on disk
manifest_path = os.path.join(artifact_dir, 'manifest.json')
expected_artifacts = [out_path, rec_path, report_path, profiles_path, manifest_path]
for artifact_path in expected_artifacts:
    print(artifact_path, 'exists?', os.path.exists(artifact_path))

print('\nAll notebook steps complete. Next: review the generated files, run targeted A/B tests for top segments, and iterate on features.')
