In [303]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

In [304]:
# Read the customer table; header=0 takes the column names from the first CSV row.
customers_path = "customers.csv"
df = pd.read_csv(customers_path, header=0)
df.head()

Unnamed: 0,order,promotion,new,gallery,remove,login
0,0,1,0,0,0,0
1,0,0,1,0,0,1
2,0,0,0,0,0,1
3,1,0,0,1,1,0
4,1,1,1,0,0,1


In [305]:
# (rows, columns) of the loaded frame.
df.shape

(637, 6)

In [306]:
# Three-cluster k-means on the full frame: random initial centroids, 100
# restarts, and a fixed seed so the fit is repeatable.
km = KMeans(
    n_clusters=3,
    init='random',
    n_init=100,
    random_state=22,
).fit(df)

In [307]:
# Summary of the five reported results, kept as a bare string so the cell
# echoes it back as its output.
'''
first - Number of clusters: 3
second - Proportion (in percent) of total sum of squares that is not explained: 50.6928
third - Within sum of squares in the cluster with highest percentage of customers who remove an article from the cart: 153.57425742574256
fourth - In the smallest cluster, number of customers who visit a page with details about a promotional product: 23	
fifth - Number of observations in the cluster with customers that are “frequent visitors”: 202
'''

# Specifically, pay attention to the “frequent visitors” among the customers.
# This group comprises persons who spend their free time browsing the internet,
# paying special attention to the galleries of the presented products.

'\nfirst - Number of clusters: 3\nsecond - Proportion (in percent) of total sum of squares that is not explained: 49.30717714493936\nthird - Within sum of squares in the cluster with highest percentage of customers who remove an article from the cart: 145.8403361344538\nfourth - In the smallest cluster, number of customers who visit a page with details about a promotional product:\t\nfifth - Number of observations in the cluster with customers that are “frequent visitors”: 238\n'

In [230]:
# Attach each row's cluster label as a second index level, so the frame is
# indexed by ("index", "Labels") while its columns stay the original features.
#
# The index is rebuilt from its first level instead of going through
# reset_index / insert / set_index: that sequence could only run once per
# kernel session (a second run raised "cannot insert Labels, already exists"
# and left df with extra columns). This form is idempotent and always picks up
# the current km.labels_.
row_ids = df.index.get_level_values(0)
df.index = pd.MultiIndex.from_arrays(
    [row_ids, km.labels_],
    names=["index", "Labels"],
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,order,promotion,new,gallery,remove,login
index,Labels,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,0,1,0,0,0,0
1,0,0,0,1,0,0,1
2,0,0,0,0,0,0,1
3,2,1,0,0,1,1,0
4,1,1,1,1,0,0,1


In [231]:
# Within-cluster sum of squares: for every cluster, add up the squared
# deviations of its member rows from that cluster's centroid.
for i in np.unique(km.labels_):
    members = df.iloc[km.labels_ == i, :]
    squared_dev = (members - km.cluster_centers_[i]) ** 2
    withinss = squared_dev.values.sum()
    print("withinss of cluster {}: {}".format(i, withinss))

withinss of cluster 0: 140.46700507614213
withinss of cluster 1: 145.8403361344538
withinss of cluster 2: 153.57425742574256


In [308]:
from kmeans_interp.kmeans_feature_imp import KMeansInterp

In [309]:
# Display the frame that is passed to the clustering calls.
df

Unnamed: 0,order,promotion,new,gallery,remove,login
0,0,1,0,0,0,0
1,0,0,1,0,0,1
2,0,0,0,0,0,1
3,1,0,0,1,1,0
4,1,1,1,0,0,1
...,...,...,...,...,...,...
632,1,1,0,0,0,0
633,0,0,0,1,1,0
634,1,1,0,0,0,0
635,1,0,0,0,0,0


In [310]:
# Fit the interpretable k-means wrapper on the raw values, passing the column
# names so it can refer to features by name.
# NOTE(review): no seed is passed here, unlike the KMeans fit on df — confirm
# whether KMeansInterp accepts random_state if repeatable results are needed.
kmf = KMeansInterp(
    feature_importance_method='wcss_min',
    ordered_feature_names=df.columns.tolist(),
    n_clusters=3,
).fit(df.values)

In [332]:
# Centroid coordinates: one row per cluster, one column per input feature.
print(km.cluster_centers_)

[[0.08121827 0.11675127 0.89847716 0.06598985 0.23350254 0.71573604]
 [0.81512605 0.94117647 0.09243697 0.08823529 0.09243697 0.19747899]
 [0.1980198  0.11881188 0.0990099  0.89108911 0.62871287 0.08415842]]


In [335]:
# Lay the centroids out side by side, one column per cluster, with the feature
# name of each row in the last column.
centers = km.cluster_centers_
centroiders = pd.DataFrame(
    {
        'centers0': centers[0],
        'centers1': centers[1],
        'centers2': centers[2],
        'names': km.feature_names_in_,
    }
)
centroiders

Unnamed: 0,centers0,centers1,centers2,names
0,0.081218,0.815126,0.19802,order
1,0.116751,0.941176,0.118812,promotion
2,0.898477,0.092437,0.09901,new
3,0.06599,0.088235,0.891089,gallery
4,0.233503,0.092437,0.628713,remove
5,0.715736,0.197479,0.084158,login


In [314]:
# Total sum of squares: squared deviations of every value from its column mean.
# It coincides with the within-cluster sum of squares of a one-cluster k-means
# fit, so both numbers are printed side by side as a check.
deviations = df - df.mean()
km.totss = (deviations ** 2).values.sum()

km1 = KMeans(n_clusters=1, init='random', random_state=22)
km1.fit(df)
print(km.totss, km1.inertia_)

867.7394034536892 867.7394034536895


In [315]:
# Total within-cluster sum of squares of the fitted km model.
print(km.inertia_)

439.8815986363386


In [340]:
# Percentage of the total sum of squares NOT explained by the clustering:
# 100 minus the explained share, (total_ss - within_ss) / total_ss * 100.
explained_pct = (km.totss - km.inertia_) / km.totss * 100
100 - explained_pct

50.69282285506064

In [317]:
# Number of observations in each cluster (position k holds the size of cluster k).
counts = np.bincount(km.labels_)
print(counts)
# In the recorded output the smallest cluster is cluster 0.

[197 238 202]


In [318]:
# Work on a copy so the cluster column is not added to df itself; the labels
# go in at column position 1.
df_promo = df.copy()
df_promo.insert(1, "cluster", km.labels_)

In [319]:
# Preview the copy with the cluster column added.
df_promo.head()

Unnamed: 0,order,cluster,promotion,new,gallery,remove,login
0,0,1,1,0,0,0,0
1,0,0,0,1,0,0,1
2,0,0,0,0,0,0,1
3,1,2,0,0,1,1,0
4,1,1,1,1,0,0,1


In [331]:
# Keep only customers in the smallest cluster who visited a promotional product
# page; the row count of the result is the reported figure.
# The smallest cluster is derived from the label counts rather than hard-coded
# as 0, because k-means label numbering is arbitrary and can change between fits.
smallest_cluster = np.bincount(km.labels_).argmin()
viewed_promo = df_promo['promotion'] == 1
in_smallest = df_promo['cluster'] == smallest_cluster
df_promo = df_promo.loc[viewed_promo & in_smallest]
df_promo.shape

(23, 7)