# Customer Segmentation & Classification with K-Means and Decision Tree

This notebook performs customer segmentation using K-Means and then trains a Decision Tree classifier to learn the segmentation. The tree is visualized and new customer records can be classified into clusters.

In [None]:
# Imports
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Read the dataset from Lakehouse
try:
    df = pd.read_parquet('/lakehouse/default/Tables/sample_customer_data')
    print('Data loaded with shape:', df.shape)
except Exception as e:
    # Best-effort load: the cells below are guarded by `df is not None`, so a
    # failed read reports the error and disables them instead of raising here.
    print('Unable to load data:', e)
    df = None

if df is not None:
    # One-hot encode the categorical columns. drop_first=True removes one
    # level per column, so that level is represented by all-zero dummies.
    df_encoded = pd.get_dummies(df, columns=['Gender','Region'], drop_first=True)
    # Feature selection: the three numeric columns plus every generated dummy column
    numeric_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
    dummy_features = [c for c in df_encoded.columns if c.startswith(('Gender_', 'Region_'))]
    features = numeric_features + dummy_features
    print('Features:', features)
    # Standardize the features before K-Means so they are on a comparable scale
    scaler = StandardScaler()
    X = df_encoded[features]
    X_scaled = scaler.fit_transform(X)
    # K-Means with 4 clusters. n_init is pinned explicitly because its default
    # differs between scikit-learn releases; leaving it implicit makes the
    # resulting clusters depend on the installed version (and emits a
    # FutureWarning on some releases).
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
    df_encoded['Cluster'] = kmeans.fit_predict(X_scaled)
    print('Cluster distribution:', df_encoded['Cluster'].value_counts().sort_index())
else:
    print('Data not available for clustering.')


In [None]:
if df is not None:
    # Bucket ages into labelled, left-closed intervals: [0,20), [20,30), ..., [70,inf).
    # The last edge is open-ended so the '70+' label covers every age from 70
    # upward; with a finite last edge of 100 and right=False, Age >= 100 fell
    # outside all bins and became NaN.
    bins = [0,20,30,40,50,60,70,float('inf')]
    labels = ['<20','20-30','30-40','40-50','50-60','60-70','70+']
    df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
    print('AgeGroup added.')


In [None]:
if df is not None:
    # Count how each demographic attribute is distributed over the clusters.
    # The attribute comes from `df`, the cluster label from `df_encoded`.
    gender_ct = pd.crosstab(df['Gender'], df_encoded['Cluster'])
    region_ct = pd.crosstab(df['Region'], df_encoded['Cluster'])
    age_ct    = pd.crosstab(df['AgeGroup'], df_encoded['Cluster'])
    # The newline must be written as the escape sequence \n: a literal line
    # break inside a single-quoted string is a SyntaxError.
    print('Gender CrosTab:\n', gender_ct)
    print('Region CrosTab:\n', region_ct)
    print('AgeGroup CrosTab:\n', age_ct)
    # Plots: one grouped bar chart per attribute, all formatted the same way
    for attribute, table in (('Gender', gender_ct), ('Region', region_ct), ('AgeGroup', age_ct)):
        ax = table.plot(kind='bar', figsize=(8,5))
        ax.set_title(f'{attribute} Distribution Across Clusters')
        ax.set_xlabel(attribute)
        ax.set_ylabel('Count')
        plt.tight_layout()
        plt.show()


In [None]:
if df is not None:
    from sklearn.tree import DecisionTreeClassifier, plot_tree

    # Fit a depth-limited tree on the encoded, unscaled features so that it
    # learns to reproduce the K-Means cluster labels.
    y_tree = df_encoded['Cluster']
    X_tree = df_encoded[features]
    dtc = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_tree, y_tree)
    print('Decision Tree trained.')

    # Draw the fitted tree; class names are the sorted cluster ids as strings.
    cluster_names = list(map(str, sorted(y_tree.unique())))
    plt.figure(figsize=(20,10))
    plot_tree(dtc, feature_names=features, class_names=cluster_names, filled=True)
    plt.show()


In [None]:
# Define a new customer record and classify it
if df is not None:
    new_customer = pd.DataFrame([{
        'Age': 28,
        'Annual Income (k$)': 55,
        'Spending Score (1-100)': 60,
        'Gender': 'Male',
        'Region': 'North'
    }])
    # Encode the new customer WITHOUT drop_first. On a single-row frame each
    # categorical column has exactly one level, so drop_first=True would drop
    # it and produce no dummy columns at all; every customer would then be
    # encoded as all zeros regardless of their actual Gender/Region.
    new_enc = pd.get_dummies(new_customer, columns=['Gender','Region'])
    # Reindex to match the training features: dummy columns that are not in
    # `features` are discarded, and training columns missing from this record
    # are filled with 0.
    # NOTE(review): a category value never seen in training is also discarded
    # here and therefore treated like the dropped baseline level - confirm
    # that this is acceptable.
    new_enc = new_enc.reindex(columns=features, fill_value=0)
    # Predict the cluster with the trained decision tree
    pred = dtc.predict(new_enc)
    print('New customer belongs to cluster:', pred[0])


In [None]:
if df is not None:
    # Write the encoded customer table to the Lakehouse Files area as CSV,
    # leaving out the DataFrame index.
    output_path = '/lakehouse/default/Files/segmented_customers.csv'
    df_encoded.to_csv(output_path, index=False)
    print('Segmented customers saved.')
    # Optionally, save the plots to files here as well
