### Member Segmentation: K-Means Clustering

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# load data

In [None]:
# Location of the input CSV (presumably a placeholder -- point it at the real file).
csv_path = 'yourfile.csv'
df = pd.read_csv(csv_path)

In [None]:
# Keep only the rows whose value in `column` lies strictly above that
# column's 90th percentile, i.e. the top 10%.
top10 = df[df.column > df.column.quantile(0.9)]

# Drop every column in which the literal value 'Unknown' appears at least once.
unknown_cols = [col for col in top10.columns if top10[col].eq('Unknown').any()]
top10 = top10.drop(columns=unknown_cols)

# Drop columns that don't provide info.
top10 = top10.drop(columns=['column_name', 'column_name'])

# Convert member id to a string with no decimals. Only a trailing ".0"
# (the float rendering of a whole number) is removed, so ids that are
# already integers or strings are not truncated.
top10['member_id'] = (
    top10['member_id'].astype(str).str.replace(r'\.0$', '', regex=True)
)

# Use member id as the index. The result must be assigned back (set_index
# returns a new frame); with member_id in the index it is no longer one of
# the columns handed to the clustering steps.
top10 = top10.set_index('member_id')

top10

In [None]:
# K-means needs numeric input: parse every column as a number; any value
# that cannot be parsed becomes NaN (errors='coerce').
top10 = top10.apply(lambda col: pd.to_numeric(col, errors='coerce'))
top10.info()

# Set up K Means

In [None]:
# Standardize the features so every column contributes on the same scale.
scaler = StandardScaler()
scaled = scaler.fit_transform(top10)

# Settings shared by every KMeans fit in the elbow search.
kmeans_kwargs = dict(init="random", n_init=10, random_state=1)

# Fit one model per candidate k and record its SSE (the model's inertia_).
k_values = range(1, 11)
sse = []
for n_clusters in k_values:
    model = KMeans(n_clusters=n_clusters, **kmeans_kwargs).fit(scaled)
    sse.append(model.inertia_)

# Elbow plot: SSE against the number of clusters.
plt.plot(k_values, sse)
plt.xticks(k_values)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Candidate cluster counts to score.
cluster_range = range(2, 7)

# Parallel lists: the k values tried and the silhouette score of each.
cluster_numbers = []
silhouette_scores = []

for k in cluster_range:
    # Fit on the standardized matrix `scaled` (the same data the elbow search
    # uses) rather than the raw frame: k-means and the silhouette score are
    # both distance-based, so unscaled columns with large ranges would
    # dominate. It also keeps any 'Cluster' column later added to top10 from
    # being fed back in as a feature if this cell is re-run.
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(scaled)

    # Mean silhouette coefficient over all samples, on the same data the
    # model was fitted on.
    silhouette_avg = silhouette_score(scaled, labels)

    cluster_numbers.append(k)
    silhouette_scores.append(silhouette_avg)

# Print
for k, score in zip(cluster_numbers, silhouette_scores):
    print(f"Number of Clusters (k): {k}, Silhouette Score: {score}")

# Plot
plt.plot(cluster_numbers, silhouette_scores, marker='o')
plt.title('Silhouette Score for Different K Values')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.show()

# Optimal Clusters 

In [None]:
# Fit the final model with the chosen number of clusters.
# NOTE(review): k=7 lies outside the 2-6 range scored by the silhouette
# search -- confirm this is the intended value.
kmeans = KMeans(n_clusters=7, random_state=42)

# Fit on the standardized matrix `scaled`, not the raw frame, so features
# with large ranges do not dominate the distances and so a 'Cluster' column
# left over from a previous run is never used as a feature.
kmeans.fit(scaled)

# labels_ already holds the assignment of every training row, so a separate
# predict() call on the same data is unnecessary.
labels = kmeans.labels_
# Cluster centres, expressed in the standardized feature space.
centroids = kmeans.cluster_centers_
cluster_assignments = labels

# Rows of `scaled` are in the same order as rows of top10, so the labels
# can be attached positionally.
top10['Cluster'] = cluster_assignments
top10.Cluster.value_counts()

# Cluster Characteristics

In [None]:
# Per-cluster summary tables: mean, sum and median of every column,
# grouped by the assigned cluster label.
by_cluster = top10.groupby('Cluster')
clusters_avg = by_cluster.mean()
clusters_total = by_cluster.sum()
clusters_med = by_cluster.median()
# Disabled step, kept for reference:
#clusters_avg = clusters_avg.drop('index', axis=1)

In [None]:
# Per-cluster averages, flipped so that clusters_avg's rows become columns
# for display.
clusters_avg = clusters_avg.transpose()
clusters_avg

In [None]:
# Per-cluster medians, flipped so that clusters_med's rows become columns
# for display.
clusters_med = clusters_med.transpose()
clusters_med

In [None]:
# Per-cluster totals, stored transposed under a new name (clusters_total
# itself is left unchanged).
clusters_tot = clusters_total.transpose()
clusters_tot