In [76]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
# Calculate silhouette_score
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [77]:
#Read files
trainfile = r'/gdrive/My Drive/ClusterDataset/Wholesale customers data.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe

print(trainData.shape)
print(trainData.dtypes)
print(trainData.head())


(440, 8)
Channel             int64
Region              int64
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64
dtype: object
   Channel  Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0        2       3  12669  9656     7561     214              2674        1338
1        2       3   7057  9810     9568    1762              3293        1776
2        2       3   6353  8808     7684    2405              3516        7844
3        1       3  13265  1196     4221    6404               507        1788
4        2       3  22615  5410     7198    3915              1777        5185


In [78]:
X_train=trainData.iloc[:,2:].copy()
print(X_train.shape)
print(X_train.dtypes)
print(X_train.head())

(440, 6)
Fresh               int64
Milk                int64
Grocery             int64
Frozen              int64
Detergents_Paper    int64
Delicassen          int64
dtype: object
   Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0  12669  9656     7561     214              2674        1338
1   7057  9810     9568    1762              3293        1776
2   6353  8808     7684    2405              3516        7844
3  13265  1196     4221    6404               507        1788
4  22615  5410     7198    3915              1777        5185


***Method1: Default K-Means Clustering***

In [79]:
#K-Means Clustering ========================================================================
#Default Mode - K=8
kmeans = KMeans()
kmeans.fit(X_train)

#kmeans.cluster_centers_
#Print Silhouette measure
print(silhouette_score(X_train, kmeans.labels_))
kmeans.labels_



0.3417007664749802


array([1, 1, 1, 0, 0, 4, 4, 1, 4, 1, 1, 4, 0, 0, 0, 4, 1, 4, 0, 4, 0, 4,
       0, 7, 0, 0, 4, 0, 2, 3, 0, 4, 0, 0, 4, 1, 0, 1, 1, 3, 0, 0, 1, 2,
       1, 2, 2, 7, 1, 2, 4, 4, 3, 1, 0, 4, 2, 1, 4, 1, 4, 5, 4, 1, 4, 2,
       4, 0, 4, 4, 0, 0, 4, 0, 1, 0, 4, 2, 4, 4, 4, 1, 1, 0, 4, 5, 7, 3,
       4, 0, 4, 4, 2, 0, 1, 4, 4, 4, 4, 4, 1, 1, 1, 3, 0, 0, 1, 1, 1, 2,
       4, 1, 0, 0, 0, 4, 4, 4, 0, 4, 0, 4, 4, 1, 3, 3, 0, 0, 4, 3, 4, 4,
       0, 4, 4, 4, 1, 1, 0, 4, 0, 0, 3, 4, 0, 2, 4, 4, 4, 0, 0, 4, 0, 4,
       4, 1, 1, 0, 1, 1, 1, 4, 0, 2, 1, 1, 1, 4, 4, 4, 1, 2, 4, 1, 4, 1,
       3, 4, 4, 4, 4, 6, 1, 7, 4, 4, 4, 1, 1, 1, 0, 4, 4, 1, 4, 0, 3, 1,
       4, 4, 2, 2, 0, 4, 4, 2, 4, 4, 4, 1, 0, 2, 4, 1, 1, 1, 2, 0, 1, 4,
       0, 1, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 0, 4, 0, 4, 4, 0, 4, 3, 0, 0,
       0, 4, 1, 1, 4, 4, 0, 4, 4, 2, 4, 0, 1, 0, 4, 4, 3, 3, 4, 4, 0, 4,
       1, 1, 1, 0, 1, 0, 4, 4, 1, 3, 4, 4, 0, 4, 4, 0, 4, 4, 3, 0, 3, 3,
       4, 0, 0, 3, 4, 4, 4, 1, 0, 4, 0, 4, 1, 4, 0,

In [80]:
#Add Cluster Number to each datapoint + save file
df_kmeans=X_train
kmeans_predict_train=kmeans.predict(X_train)

df_kmeans['Cluster Number']=kmeans_predict_train
print(df_kmeans.shape)
df_kmeans.head()
df_kmeans.dtypes

export_csv = df_kmeans.to_csv(r'/gdrive/My Drive/ClusterDataset/Results/WholesaleCustomersResultsA.csv')

(440, 7)


***Method2: K-Means using RandomizedSearch tuning of hyperparameters - best***

In [81]:
#Hyperparameter tuning done for K Means
kmeans = KMeans()
parameters={
    'n_clusters': range(1,4,1),
    'init': ['k-means++', 'random'],
    'max_iter': range(1,800,10),
    'n_init': range(1,30,1),
    'algorithm': ['auto', 'full','elkan']
    }
kmeans = RandomizedSearchCV(kmeans,parameters,n_iter=15)
kmeans.fit(X_train)
grid_parm_kmeans=kmeans.best_params_
print(grid_parm_kmeans)

{'n_init': 22, 'n_clusters': 3, 'max_iter': 671, 'init': 'random', 'algorithm': 'elkan'}


In [82]:
#K-Means Clustering after Hypertuning ========================================================================

kmeans = KMeans(**grid_parm_kmeans)
kmeans.fit(X_train)
kmeans.labels_

print(X_train.shape)

print(silhouette_score(X_train, kmeans.labels_))

(440, 7)
0.4783511375292279


In [83]:
#Add Cluster Number to each datapoint + save file
df_kmeans=X_train
kmeans_predict_train=kmeans.predict(X_train)

df_kmeans['Cluster Number']=kmeans_predict_train
print(df_kmeans.shape)
df_kmeans.head()
df_kmeans.dtypes

export_csv = df_kmeans.to_csv(r'/gdrive/My Drive/ClusterDataset/Results/WholesaleCustomersResultsB-Randomize.csv')

(440, 7)


***Method3: K-Means using manual tuning of hyperparameters - optional***

In [84]:
#K-Means Clustering ========================================================================
#K=3
kmeans = KMeans(n_clusters=3)
kmeans.fit(X_train)

#kmeans.cluster_centers_
#Print Silhouette measure
print(silhouette_score(X_train, kmeans.labels_))
kmeans.labels_



0.4783511435929511


array([1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       2, 0, 2, 1, 1, 1, 0, 2, 1, 1, 1, 2, 1, 1, 2, 1, 0, 2, 2, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 2, 1, 2, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 2,
       1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 0, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 0, 0, 2, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [85]:
#Add Cluster Number to each datapoint + save file
df_kmeans=X_train
kmeans_predict_train=kmeans.predict(X_train)

df_kmeans['Cluster Number']=kmeans_predict_train
print(df_kmeans.shape)
df_kmeans.head()
df_kmeans.dtypes

export_csv = df_kmeans.to_csv(r'/gdrive/My Drive/ClusterDataset/Results/WholesaleCustomersResultsB-Manual1.csv')

(440, 7)


In [86]:
#K-Means Clustering hyper parameter tuning ========================================================================
# K=3

kmeans = KMeans(n_clusters=3, init='k-means++', n_init =10)
kmeans.fit(X_train)
kmeans.labels_
#kmeans.cluster_centers_
print(X_train.shape)
#Print Silhouette measure
print(silhouette_score(X_train, kmeans.labels_))

(440, 7)
0.4770179147393792


In [87]:
#Add Cluster Number to each datapoint + save file
df_kmeans=X_train
kmeans_predict_train=kmeans.predict(X_train)

df_kmeans['Cluster Number']=kmeans_predict_train
print(df_kmeans.shape)
df_kmeans.head()
df_kmeans.dtypes

export_csv = df_kmeans.to_csv(r'/gdrive/My Drive/ClusterDataset/Results/WholesaleCustomersResultsB-Manual2.csv')

(440, 7)
