# Exploring Customer Segmentation

In this activity, you are tasked with profiling customer groups for a large telecommunications company.  The data provided contains information on customers' purchasing and usage behavior with the telecom products.  Your goal is to use PCA and clustering to segment these customers into meaningful groups, and report back your findings.  

Because these results need to be interpretable, it is important to keep the number of clusters reasonable.  Think about how you might represent some of the non-numeric features so that they can be included in your segmentation models.  You are to report back your approach and findings to the class.  Be specific about what features were used and how you interpret the resulting clusters.

## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import sklearn.cluster as cluster
from sklearn.decomposition import PCA

In [None]:
# Show every column when a DataFrame is rendered (no truncation with "...").
pd.options.display.max_columns = None

## Data Load and Initial Display

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df_in = pd.read_csv("./data/telco_churn_data.csv")

In [None]:
# Preview the first rows of the raw data.
df_in.head()

In [None]:
# Print column dtypes and non-null counts (used to spot sparse columns).
df_in.info()

In [None]:
# Summary statistics for the columns pandas describes by default.
df_in.describe()

## Cleanup

### Identify Columns with Many Nulls

If more than 10% of a column's values are null, drop it.  
The 10% threshold is arbitrary, but the point is that I don't want to later drop rows using a null criterion and have it erase huge swaths of data because of a few mostly null columns.

In [None]:
# Percentage of null entries in each column of the raw data.
column_null_pct = df_in.isnull().sum() / len(df_in) * 100.0

# Names of the columns whose null share is strictly above 10%.
many_nulls_columns = column_null_pct[column_null_pct > 10.0].index.to_list()

In [None]:
# Show which columns exceeded the null threshold and will be dropped.
display(many_nulls_columns)

### Perform the Cleanup

In [None]:
def yes_no_to_bool(series: pd.Series) -> pd.Series:
    """
    Convert a pure "Yes"/"No" column to bool.

    If every value in ``series`` is either "Yes" or "No", return a boolean
    Series where "Yes" -> True and "No" -> False.  Any other column is
    returned unchanged (the same object that was passed in).
    """
    only_yes_no = series.isin(["No", "Yes"]).all()
    if not only_yes_no:
        return series
    return series == "Yes"


# Columns dropped by hand in addition to the sparse ones.
redundant_columns = ["Under 30", "Senior Citizen", "Dependents", "Zip Code"]

# Drop sparse + redundant columns, then turn pure Yes/No columns into booleans.
df = df_in.drop(columns=[*many_nulls_columns, *redundant_columns]).apply(
    yes_no_to_bool
)

# Sanity check: nothing null should be left after dropping the sparse columns.
assert not df.isnull().any().any(), "Some Nulls Remain"
df.head()

## PCA

### Select Numeric Columns

Those columns whose data type is not `object` (i.e., the numeric columns plus the Yes/No columns converted to boolean above)

In [None]:
# Keep only numeric and boolean columns for PCA.  Selecting by what we want
# (rather than "anything that is not object") keeps text columns out even when
# pandas stores them with a dedicated string dtype instead of `object`.
df_numeric = df.select_dtypes(include=["number", "bool"])

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly.
display(df_numeric.head())
df_numeric.describe()

### Scale

In [None]:
# Standardize every column: subtract its mean and divide by its (sample)
# standard deviation so all features contribute on the same scale to PCA.
df_scaled = df_numeric.sub(df_numeric.mean()).div(df_numeric.std())
df_scaled.head()

### Cumulative EV vs. Num Components

#### Cumulative EV

In [None]:
# Fit PCA with as many components as there are scaled columns and accumulate
# the per-component explained variance, expressed in percent.
cum_ev = np.cumsum(
    PCA(n_components=df_scaled.shape[1]).fit(df_scaled).explained_variance_ratio_
    * 100.0
)

#### Plot

In [None]:
fig, ax = plt.subplots()

# Cumulative explained variance against the 1-based number of components.
ax.plot(
    np.arange(1, len(cum_ev) + 1),
    cum_ev,
    color="black",
    marker="o",
    linestyle="solid",
)
ax.set(
    xlabel="Number of Components",
    ylabel="Cumulative Variance Explained (%)",
)
ax.grid()


def crosshairs_at(
    target_cev: float = 0.0, ncomp: int = None, color: str = "", linestyle: str = "--"
):
    """
    Draw a horizontal and a vertical guide line on the notebook-level ``ax``
    marking one point of the cumulative explained-variance curve ``cum_ev``.

    Parameters
    ----------
    target_cev
        Target cumulative explained variance in percent.  Only used when
        ``ncomp`` is None: the smallest number of components whose cumulative
        variance exceeds this target is marked (capped at the total number of
        components).
    ncomp
        Number of components to mark (1-based).  Takes precedence over
        ``target_cev`` when given.
    color
        Matplotlib color for both lines; an empty string falls back to
        matplotlib's default line color.
    linestyle
        Matplotlib line style for both lines.

    Raises
    ------
    ValueError
        If ``ncomp`` is outside ``1..len(cum_ev)``.
    """
    total_components = len(cum_ev)

    if ncomp is None:
        # Read the component count straight off the plotted curve instead of
        # refitting PCA.  side="right" reproduces the rule PCA applies to a
        # fractional n_components (explained variance strictly greater than the
        # target), and the cap keeps targets at or above the curve's maximum
        # valid.
        ncomp = int(np.searchsorted(cum_ev, target_cev, side="right")) + 1
        ncomp = min(ncomp, total_components)

    # Without this check ncomp=0 would silently index the last element.
    if not 1 <= ncomp <= total_components:
        raise ValueError(
            "ncomp must be between 1 and %d, got %r" % (total_components, ncomp)
        )

    reached_cev = cum_ev[ncomp - 1]
    # "" is not a valid matplotlib color; None lets matplotlib use its default.
    line_color = color or None

    label = "%2d Components -> %.2f%% Variance" % (ncomp, reached_cev)
    ax.axhline(reached_cev, color=line_color, linestyle=linestyle)
    ax.axvline(ncomp, label=label, color=line_color, linestyle=linestyle)


# Mark a few fixed component counts on the curve ...
for n_marked, line_color in ((2, "red"), (3, "cyan"), (4, "magenta")):
    crosshairs_at(ncomp=n_marked, color=line_color)

# ... and the component count needed to reach 95% cumulative variance.
crosshairs_at(target_cev=95.0, color="blue")

# Monospace legend text keeps the padded component counts aligned.
legend = plt.legend(loc="center right", fancybox=True)
plt.setp(legend.texts, family="monospace")

### Fit

In [None]:
# Project the standardized features onto the first 3 principal components.
pca = PCA(n_components=3)
X = pca.fit_transform(df_scaled)
# Shape of the projected data: (number of rows, number of components).
X.shape

### DataFrame of Fit

In [None]:
# Wrap the projected data in a DataFrame with columns Component1..ComponentN.
df_pca = pd.DataFrame(
    data=X,
    columns=[f"Component{index}" for index in range(1, pca.n_components_ + 1)],
)
df_pca.head()

In [None]:
# Independent (deep) copy that will collect the cluster labels, leaving
# df_pca itself untouched.
df_pca_with_labels = df_pca.copy()

## Clustering with KMeans

### Cluster

In [291]:
# Partition the PCA-projected data into 5 clusters; the fixed seed makes the
# k-means++ initialization reproducible and verbose=True logs each iteration.
kmeans = cluster.KMeans(
    n_clusters=5, init="k-means++", random_state=123, verbose=True
)
kmeans.fit(X)

# Cluster ids and the number of samples assigned to each.
unique_labels, counts = np.unique(kmeans.labels_, return_counts=True)
display([unique_labels, counts])

Initialization complete
Iteration 0, inertia 32912.18794994782.
Iteration 1, inertia 27728.826558480818.
Iteration 2, inertia 26986.026562635227.
Iteration 3, inertia 26778.855979383556.
Iteration 4, inertia 26712.906339016154.
Iteration 5, inertia 26649.32367774002.
Iteration 6, inertia 26581.333189333636.
Iteration 7, inertia 26516.181538993143.
Iteration 8, inertia 26459.282415634894.
Iteration 9, inertia 26417.47382302636.
Iteration 10, inertia 26378.81255553385.
Iteration 11, inertia 26342.90559399892.
Iteration 12, inertia 26300.904046242846.
Iteration 13, inertia 26265.38434033031.
Iteration 14, inertia 26231.41812621745.
Iteration 15, inertia 26207.29210820941.
Iteration 16, inertia 26191.507518575745.
Iteration 17, inertia 26179.911516173837.
Iteration 18, inertia 26169.59217112884.
Iteration 19, inertia 26160.82246699139.
Iteration 20, inertia 26150.785650657337.
Iteration 21, inertia 26139.0893888953.
Iteration 22, inertia 26129.016027885016.
Iteration 23, inertia 26121.7998

[array([0, 1, 2, 3, 4], dtype=int32), array([1311, 1030, 1588, 1488, 1626])]

### Add Labels to DataFrame

In [292]:
# Attach each row's KMeans cluster id to the PCA coordinates for plotting.
df_pca_with_labels["KMeans Label"] = kmeans.labels_
df_pca_with_labels.head()

Unnamed: 0,Component1,Component2,Component3,KMeans Label
0,-2.015967,-2.99077,0.510142,4
1,-0.538588,-0.825186,-2.79156,0
2,0.779218,-2.669573,-0.420855,0
3,1.401822,-1.000172,-1.044029,0
4,-0.653874,-0.143936,-1.216247,4


### Scatter Plot

In [293]:
# Cluster ids are categories, not quantities.  Plotly Express maps a numeric
# color column to a continuous color scale, so cast the ids to strings to get
# one discrete color (and one legend entry) per cluster.
kmeans_label_order = [
    str(label) for label in sorted(df_pca_with_labels["KMeans Label"].unique())
]
fig = px.scatter_3d(
    data_frame=df_pca_with_labels.astype({"KMeans Label": str}),
    x="Component1",
    y="Component2",
    z="Component3",
    color="KMeans Label",
    # Keep the legend in numeric cluster order.
    category_orders={"KMeans Label": kmeans_label_order},
)

In [294]:
# Fix the figure size (autosize off), then render it.
fig.update_layout(autosize=False, width=1200, height=800).show()

## Clustering with DBSCAN

### Parameter Search

DBSCAN is very sensitive to eps and min samples, so I want to search over a range of possible values to find a pair of (eps, min samples) where there is an attractive clustering solution.  

Attractive properties are
- Not too many nulls (points that DBSCAN marks as noise, i.e. label -1)
- Not too many trivial clusters (those with tiny number of samples)
- Not too many clusters

#### Define Search Space

In [334]:
# Candidate neighborhood radii: 20 evenly spaced values from 0.1 to 2.0 inclusive.
eps_list = np.linspace(0.1, 2.0, num=20)
# Candidate core-point thresholds: 10, 20, ..., 100.
min_samples_list = np.arange(10, 110, 10)
display([eps_list, min_samples_list])

[array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
        1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. ]),
 array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100])]

#### Perform the Search

In [335]:
# Sweep every (eps, min_samples) pair and report the ones that give 3-5
# clusters with fewer than 10% of the points labelled as noise (-1).
#
# The fitted model and its label counts use search-local names so the sweep
# does not overwrite the notebook-level `dbscan`, `unique_labels` and `counts`
# that the later cells read.
n_samples = len(X)
for eps in eps_list:
    for min_samples in min_samples_list:
        candidate = cluster.DBSCAN(eps=eps, min_samples=min_samples).fit(X)
        candidate_labels, candidate_counts = np.unique(
            candidate.labels_, return_counts=True
        )

        # DBSCAN assigns the label -1 to noise points.
        is_noise = candidate_labels == -1
        null_pct = candidate_counts[is_noise].sum() / n_samples * 100.0
        non_null_pct = candidate_counts[~is_noise] / n_samples * 100.0
        num_labels = int(np.sum(~is_noise))

        if num_labels in (3, 4, 5) and null_pct < 10.0:
            msg = (
                "eps = %.2f, min samples = %d, nulls = %.2f%%, num labels = %d, label distr = %s"
                % (
                    eps,
                    min_samples,
                    null_pct,
                    num_labels,
                    str(np.round(non_null_pct, 1)),
                )
            )
            display(msg)

'eps = 0.60, min samples = 10, nulls = 6.50%, num labels = 4, label distr = [92.7  0.2  0.4  0.2]'

'eps = 0.70, min samples = 10, nulls = 2.61%, num labels = 3, label distr = [97.2  0.1  0.1]'

### Cluster

In [300]:
# Fit DBSCAN on the PCA-projected data with the chosen parameters.
# NOTE(review): in the recorded run about 48% of the points (3414 of 7043)
# were labelled -1 (noise), well above the 10% limit used in the parameter
# search - confirm this (eps, min_samples) choice is intentional.
dbscan = cluster.DBSCAN(eps=0.5, min_samples=25)
dbscan.fit(X)

# Cluster ids (-1 = noise) and the number of samples in each.
unique_labels, counts = np.unique(dbscan.labels_, return_counts=True)
display([unique_labels, counts])

[array([-1,  0,  1,  2,  3,  4]), array([3414, 2008, 1322,   80,  203,   16])]

### Add Labels to DataFrame

In [None]:
# Attach each row's DBSCAN cluster id (-1 = noise) to the PCA coordinates.
df_pca_with_labels["DBSCAN Label"] = dbscan.labels_
df_pca_with_labels.head()

### Scatter Plot

In [None]:
# Cluster ids are categories, not quantities.  Plotly Express maps a numeric
# color column to a continuous color scale, so cast the ids to strings to get
# one discrete color (and one legend entry) per cluster; "-1" is DBSCAN noise.
dbscan_label_order = [
    str(label) for label in sorted(df_pca_with_labels["DBSCAN Label"].unique())
]
fig = px.scatter_3d(
    data_frame=df_pca_with_labels.astype({"DBSCAN Label": str}),
    x="Component1",
    y="Component2",
    z="Component3",
    color="DBSCAN Label",
    # Keep the legend in numeric cluster order, noise first.
    category_orders={"DBSCAN Label": dbscan_label_order},
)

In [None]:
# Fix the figure size (autosize off), then render it.
fig.update_layout(autosize=False, width=1200, height=800).show()