In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

## Customer Segmentation

Let's read in a new dataset that gives information about spending habits at a mall. We want to learn about the relationships between these customers to see if we can identify common groupings for some future marketing efforts. To be efficient in our efforts we want to better align our communication and offers with those most likely to spend money at the mall.

In [None]:
# Mall customer spending data hosted in the course repository.
MALL_CUSTOMERS_URL = "https://raw.githubusercontent.com/kgoebber/data_151/main/notebooks/Mall_Customers.csv"
df = pd.read_csv(MALL_CUSTOMERS_URL)

# Leave the preview as the cell's last expression so Jupyter renders it as a
# formatted table instead of the plain-text dump that print() produces.
df.head()

Let's begin by looking at the correlation of our variables, including a binary gender category.

In [None]:
# Encode the Gender column numerically so it can take part in the correlation matrix.
gender_encoder = LabelBinarizer()
df['Gender_Binary'] = gender_encoder.fit_transform(df['Gender'])

# Pairwise correlations between the numeric variables and the encoded gender.
correlation_columns = [
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
    'Gender_Binary',
]
df[correlation_columns].corr()

It appears that our two most strongly related variables are Age and Spending Score, so let's take a look at the joint distribution of these two variables to see what it looks like.

In [None]:
# Scatter of the two variables before any clustering model is applied.
fig, ax = plt.subplots(figsize=(8, 8))

ax.scatter(df["Age"], df["Spending Score (1-100)"])

# No model has been fit at this point, so the title describes the raw data
# rather than cluster assignments.
ax.set_title("Spending Score vs. Age")
ax.set_ylabel("Spending Score (1-100)")
ax.set_xlabel("Age")

plt.show()

## k-means clustering

Let's use a simple clustering scheme to see if we can identify some common groupings. In order to use k-means clustering we need to know how many clusters we want to find. Looking at our image above, it is not clearly evident how many we should choose, and we have no other information that would allow us to analytically choose the number of clusters. So let's fit a series of models with different numbers of clusters and use the inertia measure (inertia is the sum of squared distances of samples to their closest cluster center, weighted by the sample weights if provided) as a way to identify an optimal number of clusters.

In [None]:
X = df[['Age', 'Spending Score (1-100)']]

# Within-cluster sum of squares (the fitted model's inertia_) for k = 1..10.
wcss = []
for i in range(1, 11):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so relying on the default gives version-dependent
    # inertia values (and a FutureWarning on the releases in between).
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: inertia (WCSS) against the number of clusters it was computed for.
fig, ax = plt.subplots(figsize=(10, 10))

# Derive the x values from wcss itself so the curve stays aligned with the
# data if the range of cluster counts is changed where wcss is built.
ax.plot(range(1, len(wcss) + 1), wcss)
ax.set_title('Selecting the Number of Clusters using the Elbow Method')
ax.set_xlabel('Clusters')
ax.set_ylabel('WCSS')
plt.show()

From the above graphic we can identify where there are sharp "elbows" indicating a change in the slope of the line. To choose the number of clusters, we look for the last elbow as the number of clusters increases. For our data, it appears that 4 would be a good number to choose for our segmentation efforts.

Now let's do a k-means cluster with four clusters and plot our data to identify the different clusters by color.

In [None]:
# Number of clusters chosen from the elbow plot; defined once so the model and
# the plotting loop cannot drift apart.
n_clusters = 4
features = ['Age', 'Spending Score (1-100)']

# Explicit copy: the cluster label column is added to X only, never to df.
X = df[features].copy()

# n_init is pinned explicitly because the scikit-learn default changed from 10
# to 'auto' in version 1.4.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmeans.fit(X)

# Label each observation with the cluster it was assigned to.
cluster_labels = kmeans.predict(X)
X['cluster'] = cluster_labels

# One scatter call per cluster so each gets its own color and legend entry.
fig, ax = plt.subplots(figsize=(8, 8))
for k in range(n_clusters):
    data = X[X["cluster"] == k]
    ax.scatter(data["Age"], data["Spending Score (1-100)"], label=k)

ax.legend()
ax.set_title("Clusters Identified by kMeans Model")
ax.set_ylabel("Spending Score (1-100)")
ax.set_xlabel("Age")
plt.show()

How do we interpret these clusters?

## Gaussian Mixture

Let's now do the same thing with a different clustering method. We'll also describe a few characteristics of each cluster to help us interpret our clustering results.

In [None]:
n_clusters = 4
features = ['Age', 'Spending Score (1-100)']

# Explicit copy: the cluster label column is added to X only, never to df.
X = df[features].copy()

gmm_model = GaussianMixture(n_components=n_clusters, random_state=100)
gmm_model.fit(X)

# Label each observation with the mixture component predicted for it.
cluster_labels = gmm_model.predict(X)
X['cluster'] = cluster_labels

# One scatter call per cluster so each gets its own color and legend entry.
fig, ax = plt.subplots(figsize=(8, 8))
for k in range(n_clusters):
    data = X[X["cluster"] == k]
    ax.scatter(data["Age"], data["Spending Score (1-100)"], label=k)

ax.legend()
ax.set_title("Clusters Identified by Gaussian Mixture Model")
ax.set_ylabel("Spending Score (1-100)")
ax.set_xlabel("Age")
plt.show()

# Summarize each cluster by its mean age and mean spending score.
for group in np.sort(X.cluster.unique()):
    members = X[X.cluster == group]
    print(f'Group {group}')
    print(f' Avg. Age: {members.Age.mean()}')
    print(f" Avg. Spend Score: {members['Spending Score (1-100)'].mean()}")

## Spectral Clustering
Let's compare our previous clustering results with those from spectral clustering.

In [None]:
n_clusters = 4
features = ['Age', 'Spending Score (1-100)']

# Explicit copy: the cluster label column is added to X only, never to df.
X = df[features].copy()

# Spectral clustering with a nearest-neighbors affinity (8 neighbors).
spectral_cluster_model = SpectralClustering(
    n_clusters=n_clusters,
    random_state=25,
    n_neighbors=8,
    affinity='nearest_neighbors'
)

# Fit on the two feature columns only and store the resulting labels.
X['cluster'] = spectral_cluster_model.fit_predict(X[features])

# One scatter call per cluster so each gets its own color and legend entry.
fig, ax = plt.subplots(figsize=(8, 8))
for k in range(n_clusters):
    data = X[X["cluster"] == k]
    ax.scatter(data["Age"], data["Spending Score (1-100)"], label=k)

ax.legend()
# Axis labels added so this figure matches the other cluster plots.
ax.set(title='Spectral Clustering',
       xlabel="Age",
       ylabel="Spending Score (1-100)")

plt.show()

## Clustering with Classification Prediction
So clustering by itself is useful, but how about combining clustering with a supervised learning algorithm to make predictions? Let's go back to our wine data and perform some clustering to identify an unsupervised category for each observation and add that to our prediction scheme.

We can use the elbow method to identify an appropriate number of clusters or in this case we know we have a unique set of categories that could serve as the different number of clusters. First, let's see what the "elbow" analysis gives us!

In [None]:
# Red wine quality data from the UCI repository (semicolon-delimited).
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')

# Cluster on every column except the target.
X = df.drop(columns='quality')

# Within-cluster sum of squares (the fitted model's inertia_) for k = 1..10.
wcss = []
for i in range(1, 11):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so relying on the default gives version-dependent
    # inertia values.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

fig, ax = plt.subplots(figsize=(10, 10))

# x values are derived from wcss so the curve stays aligned with the loop above.
ax.plot(range(1, len(wcss) + 1), wcss)

ax.set_title('Selecting the Number of Clusters using the Elbow Method')
ax.set_xlabel('Clusters')
ax.set_ylabel('WCSS')

plt.show()

So again, it appears four might be a good number to start with - which is not too dissimilar from the number of categories we have in the dataset (six quality values: 3, 4, 5, 6, 7, and 8).

In [None]:
def model_validation(ytrue, y_pred):
    '''Print accuracy, precision, recall and F1 for a set of predictions.

    Precision, recall and F1 are computed with average="macro".

    Parameters
    ----------
    ytrue : true class labels
    y_pred : predicted class labels

    Returns
    -------
    None; the scores are printed, one per line.
    '''
    accuracy = metrics.accuracy_score(ytrue, y_pred)
    print(f'Accuracy Score: {accuracy}')

    # The remaining metrics share one call signature, so score them in a loop.
    macro_scorers = (
        ('Precision', metrics.precision_score),
        ('Recall', metrics.recall_score),
        ('F1', metrics.f1_score),
    )
    for label, scorer in macro_scorers:
        print(f'{label} Score: {scorer(ytrue, y_pred, average="macro")}')

Okay, now that we know how many clusters we'll use, let's begin by gathering our best predictor variables, performing the clustering with our feature variables, adding those new clusters to the feature matrix, then doing our prediction with the extra predictors using oversampling with bagging and boosting methods.

In [None]:
# Predictor variables and target for the wine-quality classifier.
# Explicit copy so the added cluster features never touch df.
X = df[['volatile acidity', 'citric acid', 'sulphates', 'alcohol',
        'chlorides', 'total sulfur dioxide', 'fixed acidity']].copy()
y = df['quality']

# Use one mixture component per distinct quality value.
n_clusters = len(y.unique())
gmm_model = GaussianMixture(n_components=n_clusters, random_state=100)
gmm_model.fit(X)

cluster_labels = gmm_model.predict(X)

# Option 1: simply add the cluster label as a category
# X['cluster'] = cluster_labels

# Option 2: Use one hot encoding to add a feature vector to the DataFrame
# The encoder's default sparse output is densified with .toarray(); this avoids
# the `sparse` / `sparse_output` keyword, which was renamed between
# scikit-learn releases.
ohe = OneHotEncoder()
cluster_ohe = ohe.fit_transform(cluster_labels.reshape(-1, 1)).toarray()
ohe_df = pd.DataFrame(
    cluster_ohe,
    # String names: scikit-learn rejects frames mixing str and int column names.
    columns=[f'cluster_{label}' for label in ohe.categories_[0]],
    # Same index as X so the column-wise concat lines rows up one-to-one.
    index=X.index,
)
# Join the indicator columns side by side and keep the result. The previous
# X.append(ohe_df) discarded its return value (and would have added rows, not
# columns), so the cluster features never reached the model.
X = pd.concat([X, ohe_df], axis=1)

# Split our data into training and testing datasets
xtrain, xtest, ytrain, ytrue = train_test_split(X, y, random_state=2)

# Oversample the training set only; seeded so the run is reproducible.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(xtrain, ytrain)

# Set parameter for max number of features for Random Forest
m = int(np.sqrt(X.shape[1]))

# Use a Random Forest and Boosting Method for prediction.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2.
forest = RandomForestClassifier(max_features=m, max_samples=.75, random_state=4)
adaboost = AdaBoostClassifier(forest, learning_rate=.1, n_estimators=25, random_state=1)

adaboost.fit(X_resampled, y_resampled)

y_pred = adaboost.predict(xtest)

model_validation(ytrue, y_pred)