In [1]:
from Data_Operations import*
import numpy as np
import pandas as pd
import pickle

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering

import matplotlib.pyplot as plt
from sklearn import metrics

In [2]:
class CF_Matrix:
    """User x product purchase matrix used as input for collaborative filtering.

    Attributes (populated by Build_User_Product_Matrix):
        Rows   -- number of distinct user ids, i.e. number of matrix rows
        Cols   -- number of distinct product ids, i.e. number of matrix columns
        Matrix -- numpy array of shape (Rows, Cols); None until the matrix is built
    """

    def __init__(self):

        self.Rows = 0
        self.Cols = 0
        self.Matrix = None

    ## Get quantity of product Prod_id purchased by user Cust_id
    def Get_Quantity(self, Data, UserID_Col, ProductID_Col, Cust_id, Prod_id, Quantity_Col):
        """Return the summed Quantity_Col over the rows of Data matching both ids.

        Data is a pandas DataFrame; UserID_Col, ProductID_Col and Quantity_Col
        are column names in it. The sum of an empty selection is 0.
        """
        Is_Match = (Data[UserID_Col] == Cust_id) & (Data[ProductID_Col] == Prod_id)
        return np.sum(Data.loc[Is_Match, Quantity_Col])

    ## Binary_Flag = 1 : Cell contains 1 if product is purchased by user
    ## Binary_Flag = 0 : Cell contains quantity of products is purchased by user
    def Build_User_Product_Matrix(self, Data, UserID_Col, ProductID_Col, Quantity_Col, Binary_Flag):
        """Fill self.Matrix with one row per user and one column per product.

        Row i corresponds to Data[UserID_Col].unique()[i] and column j to
        Data[ProductID_Col].unique()[j] (both in order of first appearance).
        A cell is non-zero only when the total purchased quantity for that
        (user, product) pair is strictly positive: it holds 1 when Binary_Flag
        is truthy, otherwise the total quantity.

        The totals are computed with one groupby pass instead of filtering the
        whole frame once per cell, which was Rows * Cols full scans.
        """
        Customer_Ids = Data[UserID_Col].unique()
        Product_Ids = Data[ProductID_Col].unique()

        self.Rows = Customer_Ids.shape[0]
        self.Cols = Product_Ids.shape[0]
        self.Matrix = np.zeros((self.Rows, self.Cols))

        if self.Rows == 0 or self.Cols == 0:
            return

        ## Total quantity per (user, product) pair. Pairs with a missing id are
        ## dropped by groupby; their cells stay 0, as an equality filter never
        ## matches a missing id either.
        Totals = Data.groupby([UserID_Col, ProductID_Col], sort=False)[Quantity_Col].sum()
        Totals = Totals[Totals > 0]

        ## Map each id to its row / column position in the matrix.
        Row_Of = {Cust_Id: i for i, Cust_Id in enumerate(Customer_Ids)}
        Col_Of = {Prod_Id: j for j, Prod_Id in enumerate(Product_Ids)}

        for (Cust_Id, Prod_Id), Prod_Quantity in Totals.items():
            self.Matrix[Row_Of[Cust_Id], Col_Of[Prod_Id]] = 1 if Binary_Flag else Prod_Quantity

def Create_Purchase_Matrix():
    """Build the binary user x product purchase matrix for the TopN customers.

    Reads the sales, products and customers files named by the notebook-level
    configuration (Sales_filepath, Products_filepath, Users_filepath and their
    column lists), restricts the data to the first TopN customer ids returned
    by VisualizeSalesPerCustomer, and builds a CF_Matrix with Binary_Flag=1.

    Returns:
        (Matrix, User_Ids, Product_Ids) where User_Ids[i] labels matrix row i
        and Product_Ids[j] labels matrix column j.
    """

    Sales_data = Load_Data(Sales_filepath, Sales_Col_list, seperator=";")
    Products_data = Load_Data(Products_filepath, Products_Col_list, seperator=";")
    User_data = Load_Data(Users_filepath, Users_Col_list, seperator=";")

    ## Analysis of No. of purchase orders per customer.
    ## Instead of whole data, using only N users' data with maximum purchase orders
    ## NOTE(review): presumably the helper returns both lists sorted by order
    ## count, descending - confirm in Data_Operations.
    Num_Orders_List, Cust_Id_List = VisualizeSalesPerCustomer(Sales_data, UserID_Col)
    Num_Orders_List = Num_Orders_List[:TopN]
    Cust_Id_List = Cust_Id_List[:TopN]
    # Plot_Graph(Cust_Id_List, Num_Orders_List)

    ## Pre-Processing: Combine all data files in one consistent, sort and drop rows with missing value
    Combined_Data = Combine_Data(Sales_data, User_data, Products_data, Cust_Id_List, UserID_Col, ProductID_Col)
    Processed_Data = Data_Processing(Combined_Data)

    ## Create User_Product Matrix for CF
    User_Product_Mat = CF_Matrix()
    User_Product_Mat.Build_User_Product_Matrix(Processed_Data, UserID_Col, ProductID_Col, Quantity_Col, Binary_Flag=1)

    ## The matrix axes follow the order of first appearance of the ids in
    ## Processed_Data, so the labels must be taken from the same place.
    ## Cust_Id_List and Products_data may differ from that in order or length
    ## after combining / sorting / dropping rows, which would mislabel the
    ## rows and columns.
    Row_User_Ids = list(Processed_Data[UserID_Col].unique())
    Col_Product_Ids = list(Processed_Data[ProductID_Col].unique())

    return User_Product_Mat.Matrix, Row_User_Ids, Col_Product_Ids

## Create User_PersonalInformation Matrix based on information filled by users
def Create_User_Matrix():
    """Placeholder for the user personal-information matrix; not implemented yet.

    Always returns None.
    """
    return None

def Store_Matrix(Data_Matrix, UserID_List, DataFrame_Header, FilePath):
    """Write a labelled matrix to a CSV file.

    Each output row is the user id followed by that user's matrix row.

    Args:
        Data_Matrix: 2-D array-like (e.g. numpy array); row i belongs to UserID_List[i].
        UserID_List: one id per matrix row.
        DataFrame_Header: column names; the id column name followed by one
            name per matrix column.
        FilePath: destination CSV path.

    Raises:
        ValueError: if the number of matrix rows and user ids differ.
    """
    if len(Data_Matrix) != len(UserID_List):
        raise ValueError(
            "Data_Matrix has %d rows but UserID_List has %d entries"
            % (len(Data_Matrix), len(UserID_List))
        )

    ## Use the Data_Matrix argument; the previous body read the notebook
    ## global User_Product_Matrix and ignored what the caller passed in.
    Matrix_Rows = [[User_Id, *Matrix_Row] for User_Id, Matrix_Row in zip(UserID_List, Data_Matrix)]

    User_Product_Dataframe = pd.DataFrame(Matrix_Rows, columns=DataFrame_Header)
    User_Product_Dataframe.to_csv(FilePath)

In [3]:
## Folder holding the SalesDB CSV files (absolute, machine-specific path).
Data_Folder_Path = "/Users/pranjali/Downloads/SE_Project/Data/SalesDB/"

## Names of the key columns shared by the data files.
UserID_Col, ProductID_Col, Quantity_Col = "CustomerID", "ProductID", "Quantity"

## Sales transactions
Sales_filepath = Data_Folder_Path + "sales.csv"
Sales_Col_list = ["SalesID", UserID_Col, ProductID_Col, Quantity_Col, "SalesDate"]

## Product catalogue
Products_filepath = Data_Folder_Path + "products.csv"
Products_Col_list = [ProductID_Col, "ProductName", "CategoryID", "IsAllergic"]

## Customers
Users_filepath = Data_Folder_Path + "customers.csv"
Users_Col_list = [UserID_Col, "FirstName", "LastName", "CityID"]

## Number of customers kept for the purchase matrix.
TopN = 300

In [4]:
## Build the purchase matrix together with its row (user) and column (product) labels.
User_Product_Matrix, UserID_List, ProductID_List = Create_Purchase_Matrix()

## CSV header: the user id column followed by one column per product id.
Attribute_List = [UserID_Col] + list(ProductID_List)

Store_Matrix(User_Product_Matrix, UserID_List, Attribute_List, "./User_Product_Matrix.csv")


Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/sales.csv
number of rows, cols  (6758125, 5)
Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/products.csv
number of rows, cols  (452, 4)
Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/customers.csv
number of rows, cols  (98759, 4)


In [5]:
## Dimension Reduction using PCA
def PCA_Transform(Matrix, No_Dimensions):
    """Project Matrix onto its first No_Dimensions principal components.

    Returns the projected data wrapped in a pandas DataFrame.
    """
    Projected = PCA(n_components=No_Dimensions).fit_transform(Matrix)
    return pd.DataFrame(Projected)


## Clustering Algorithms

## 1. KMeans Clustering
def KMeans_Clustering(Matrix, No_Clusters, Random_State=0):
    """Cluster the rows of Matrix with k-means++ initialised KMeans.

    Args:
        Matrix: samples to cluster, one row per sample.
        No_Clusters: number of clusters to form.
        Random_State: seed passed to KMeans. Fixed by default so repeated runs
            of the model comparison give the same labels, matching
            Spectral_Clustering, which also pins its random state. Pass None
            for unseeded behaviour.

    Returns:
        The fitted estimator's labels_ (one cluster label per row).
    """
    KMeans_Obj = KMeans(n_clusters=No_Clusters, init='k-means++', random_state=Random_State).fit(Matrix)

    return KMeans_Obj.labels_

## 2. Hierarchical Clustering
def Hierarchical_Clustering(Matrix, No_Clusters):
    """Cluster the rows of Matrix agglomeratively with complete linkage.

    Returns the fitted estimator's labels_ (one cluster label per row).
    """
    Fitted_Model = AgglomerativeClustering(linkage="complete", n_clusters=No_Clusters).fit(Matrix)
    return Fitted_Model.labels_

## 3. Spectral Clustering
def Spectral_Clustering(Matrix, No_Clusters):
    """Cluster the rows of Matrix with spectral clustering.

    Uses the "discretize" label-assignment strategy and a fixed random_state
    of 0. Returns the fitted estimator's labels_ (one cluster label per row).
    """
    Spectral_Model = SpectralClustering(
        n_clusters=No_Clusters,
        assign_labels="discretize",
        random_state=0,
    )
    Spectral_Model.fit(Matrix)

    return Spectral_Model.labels_


## Clustering performance evaluation metrics
def Clustering_Evaluation(Matrix, Cluster_Labels, Dist_Metric='euclidean'):
    """Score a clustering and return (Silhouette_Score, CH_Index, DB_Index).

    Silhouette_Score
        Silhouette coefficient computed with the Dist_Metric distance. Bounded
        between -1 (incorrect clustering) and +1 (highly dense clustering);
        values around zero indicate overlapping clusters.
    CH_Index
        Slot reserved for the Calinski-Harabasz index (variance ratio
        criterion: ratio of between-cluster to within-cluster dispersion,
        higher is better). It is NOT computed here; the value is always the
        placeholder 0.
    DB_Index
        Davies-Bouldin index: average 'similarity' between clusters, comparing
        the distance between clusters with the size of the clusters
        themselves. Zero is the lowest possible score; values closer to zero
        indicate a better partition. Dist_Metric is not passed to this score.
    """
    Scores = (
        metrics.silhouette_score(Matrix, Cluster_Labels, metric=Dist_Metric),
        0,  # Calinski-Harabasz placeholder, see docstring
        metrics.davies_bouldin_score(Matrix, Cluster_Labels),
    )
    return Scores



def Perform_Clustering(Matrix, Clustering_Algo, No_Clusters):
    """Run one clustering algorithm on Matrix and evaluate the result.

    Args:
        Matrix: samples to cluster, one row per sample.
        Clustering_Algo: 'KMeans', 'Hierarchical' or 'Spectral'.
        No_Clusters: number of clusters to form.

    Returns:
        (Labels, Evaluation_Metrics), where Evaluation_Metrics is the tuple
        returned by Clustering_Evaluation using the euclidean metric.

    Raises:
        ValueError: if Clustering_Algo is not one of the supported names.
            Previously this case printed a message and then failed with an
            UnboundLocalError on the return statement.
    """
    if Clustering_Algo == 'KMeans':
        Cluster_Fn = KMeans_Clustering
    elif Clustering_Algo == 'Hierarchical':
        Cluster_Fn = Hierarchical_Clustering
    elif Clustering_Algo == 'Spectral':
        Cluster_Fn = Spectral_Clustering
    else:
        raise ValueError(
            "Invalid algorithm: " + repr(Clustering_Algo)
            + " (expected 'KMeans', 'Hierarchical' or 'Spectral')"
        )

    Labels = Cluster_Fn(Matrix, No_Clusters)
    Evaluation_Metrics = Clustering_Evaluation(Matrix, Labels, 'euclidean')

    return (Labels, Evaluation_Metrics)



def Clustering_Comparison(Matrix, Clustering_AlgoList=(), No_Clusters_List=(), PCA_List=()):
    """Grid-search clustering configurations and report the best one.

    Every combination of PCA dimension, algorithm and cluster count is run
    through Perform_Clustering and its scores are printed. The best
    configuration is the one with the lowest Davies-Bouldin index, with ties
    broken by the highest Silhouette coefficient.

    Args:
        Matrix: samples to cluster; must expose .shape when PCA_List contains 0.
        Clustering_AlgoList: algorithm names accepted by Perform_Clustering.
        No_Clusters_List: cluster counts to try.
        PCA_List: target dimensions to try; 0 means "no reduction".

    Returns:
        (algorithm, number_of_clusters, dimension) of the best configuration.
        For the "no reduction" case the dimension is Matrix.shape[1].

    Raises:
        ValueError: if the three lists describe an empty search space
            (previously this surfaced as an IndexError on an empty list).
    """
    Clustering_Results = []

    for dimension in PCA_List:
        if int(dimension) != 0:
            pca_dimension = dimension
            Reduced_Matrix = PCA_Transform(Matrix, pca_dimension)
        else:
            ## 0 requests the original, unreduced feature space.
            pca_dimension = Matrix.shape[1]
            Reduced_Matrix = Matrix

        for algo in Clustering_AlgoList:
            for No_Clusters in No_Clusters_List:

                Labels, Evaluation_Metrics = Perform_Clustering(Reduced_Matrix, algo, No_Clusters)

                print("---------------------------------------------------------------------------")
                print("Performace for algo: " + str(algo) + ", No. of clusters: "+ str(No_Clusters) + ", PCA with dimensions: " + str(pca_dimension))
                print("Silhouette Coefficient:  ", Evaluation_Metrics[0])
                print("Davies-Bouldin Index:    ", Evaluation_Metrics[2])

                Model_Details = (algo, No_Clusters, pca_dimension)
                Clustering_Results.append((Model_Details, Evaluation_Metrics))

    if not Clustering_Results:
        raise ValueError(
            "No clustering configuration to compare: Clustering_AlgoList, "
            "No_Clusters_List and PCA_List must all be non-empty"
        )

    ## Rank by Davies-Bouldin (lower is better), then Silhouette (higher is better).
    Sorted_Result = sorted(Clustering_Results, key=lambda x: (x[1][2], -x[1][0]))
    Best_Result = Sorted_Result[0]

    Best_Result_Eval = Best_Result[1]
    print("=========================== BEST RESULT ============================")
    print("Silhouette Coefficient:  ", Best_Result_Eval[0])
    print("Calinski-Harabasz Index: ", Best_Result_Eval[1])
    print("Davies-Bouldin Index:    ", Best_Result_Eval[2])

    Best_Model_Params = Best_Result[0]
    print("============================ BEST MODEL =============================")
    print("Algorithm:           ", Best_Model_Params[0])
    print("No. of clusters:     ", Best_Model_Params[1])
    print("Dimension Reduction: ", Best_Model_Params[2])

    return Best_Model_Params
    
## Store Clustering Algorithm Output
def Store_Clusters(UserID_List, Labels, Output_Folder_Path):
    """Group user ids by cluster label, pickle the grouping and print it.

    Builds a dict mapping each label to the list of user ids carrying that
    label (UserID_List[i] pairs with Labels[i]), writes it to
    Output_Folder_Path + "/Clusters.pkl" and prints the dict.
    """
    Clusters = {}
    for idx in range(len(Labels)):
        ## Create the bucket on first sight of a label, then add the user.
        Clusters.setdefault(Labels[idx], []).append(UserID_List[idx])

    Pickle_Path = Output_Folder_Path + "/Clusters.pkl"
    with open(Pickle_Path, 'wb') as output_file:
        pickle.dump(Clusters, output_file)

    print(Clusters)
    

In [6]:
## CSV holding the stored user x product matrix.
Matrix_FilePath = "./User_Product_Matrix.csv"

## Product columns are named "1" .. "452".
Col_List = list(map(str, range(1, 453)))

## Search space for the clustering comparison.
Clustering_AlgoList = ['KMeans', 'Hierarchical', 'Spectral']
No_Clusters_List = list(range(2, 8))
PCA_List = [0, 100, 200]  # 0 = no dimension reduction

In [7]:
## Load the stored matrix; comma is read_csv's default separator.
Matrix_Data = pd.read_csv(Matrix_FilePath)

## Keep only the product columns as clustering features.
Matrix = Matrix_Data.loc[:, Col_List]

Best_Model = Clustering_Comparison(
    Matrix,
    Clustering_AlgoList,
    No_Clusters_List,
    PCA_List,
)

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.003490233897992878
Davies-Bouldin Index:     12.011950589363426
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   0.0022218174132625774
Davies-Bouldin Index:     10.785616607818788
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 452
Silhouette Coefficient:   0.0015454596293740601
Davies-Bouldin Index:     9.910661044491473
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   0.001135727085513463
Davies-Bouldin Index:     8.999637196905686


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   0.0003195406845067943
Davies-Bouldin Index:     8.413283636190979
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   1.5344175764918158e-05
Davies-Bouldin Index:     7.95652331289516
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.0011359407185026237
Davies-Bouldin Index:     14.875403796315998
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   -5.214068070217316e-07
Davies-Bouldin Index:     12.297001628945347
-----------------

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   -8.265709706422701e-05
Davies-Bouldin Index:     9.403442099046638
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   -0.000926138954667174
Davies-Bouldin Index:     8.583086527968389
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   -0.0009221089145324087
Davies-Bouldin Index:     8.129393617948116


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   -5.4571492609614554e-05
Davies-Bouldin Index:     3.6655124812020663


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   -0.0037889577760422235
Davies-Bouldin Index:     6.197383884489942


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 452
Silhouette Coefficient:   -0.005231469596142301
Davies-Bouldin Index:     6.834806529001975


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   -0.00515201551812583
Davies-Bouldin Index:     6.725124048139337


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   -0.005172194011840974
Davies-Bouldin Index:     6.860875747785243


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   -0.005423464791243554
Davies-Bouldin Index:     6.451634164310541
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.007238258716409795
Davies-Bouldin Index:     9.524946705533457
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.006202992754376614
Davies-Bouldin Index:     8.328162107577525
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   0.006551127560321792
Davies-Bouldin Index:     7.68063455411928


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   0.005358406120609569
Davies-Bouldin Index:     7.143029951350114
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   0.0046073578909546185
Davies-Bouldin Index:     6.690617485077394
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 100
Silhouette Coefficient:   0.003609084546902775
Davies-Bouldin Index:     6.426712246964099
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.0019154729216250702
Davies-Bouldin Index:     10.799334801126133
---------------------------

  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.0034596237845109265
Davies-Bouldin Index:     9.071290902252038


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.0035281484661763883
Davies-Bouldin Index:     8.788009703196574


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   -0.0027243203764879993
Davies-Bouldin Index:     6.820950053104359


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   -0.013372309109885627
Davies-Bouldin Index:     5.770743744438261


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   -0.013882252658496574
Davies-Bouldin Index:     5.424592675832194
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 100
Silhouette Coefficient:   -0.013985724615269722
Davies-Bouldin Index:     5.526262062265785


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.004019666363877025
Davies-Bouldin Index:     11.415445855706707
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   0.002944117143649196
Davies-Bouldin Index:     10.206838318955128
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   0.0022039566265071916
Davies-Bouldin Index:     9.457347513706305
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   0.001250412846845506
Davies-Bouldin Index:     8.686813425790026
---------------------------------

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   -0.0004294777600699585
Davies-Bouldin Index:     7.747813744621693
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.0012650139182594133
Davies-Bouldin Index:     14.826161349214734
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   0.00015983037535191927
Davies-Bouldin Index:     12.355605029010135
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   0.000169701046540236
Davies-Bouldin Index:     10.509473045285636
----------

  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.0015082607460017174
Davies-Bouldin Index:     4.702340181189367


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   -0.004439084450156521
Davies-Bouldin Index:     5.202118302863046


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   -0.009968115848803312
Davies-Bouldin Index:     4.586739708831622


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   -0.009673168284807692
Davies-Bouldin Index:     5.530931743709589


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 200
Silhouette Coefficient:   -0.009927515957510817
Davies-Bouldin Index:     5.465676326855244
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   -0.014514553468441764
Davies-Bouldin Index:     4.91812124050763
Silhouette Coefficient:   -5.4571492609614554e-05
Calinski-Harabasz Index:  0
Davies-Bouldin Index:     3.6655124812020663
Algorithm:            Spectral
No. of clusters:      2
Dimension Reduction:  452


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


In [8]:
## Re-fit the winning configuration: Best_Model is (algorithm, clusters, dimension).
Best_Algo, Best_No_Clusters, Best_Dimension = Best_Model[0], Best_Model[1], Best_Model[2]

## A dimension equal to the full feature count means no PCA was applied.
Reduced_Matrix = Matrix if Best_Dimension == Matrix.shape[1] else PCA_Transform(Matrix, Best_Dimension)

Labels, Evaluation_Metrics = Perform_Clustering(Reduced_Matrix, Best_Algo, Best_No_Clusters)


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


In [9]:
## Pair each customer id from the stored matrix with its cluster label and persist the grouping.
UserIDList = list(Matrix_Data.loc[:, 'CustomerID'])
Store_Clusters(UserIDList, Labels, Output_Folder_Path="./")

{0: [94800, 30004, 58902, 7923, 5462, 62558, 575, 86705, 28232, 94022, 79076, 37898, 41785, 21584, 69923, 7943, 65983, 6154, 21920, 34788, 1244, 32991, 89193, 95963, 94296, 3740, 12963, 74771, 39927, 54199, 56694, 1158, 55680, 38069, 77632, 35021, 51357, 30496, 46682, 34990, 7908, 54231, 54926, 18854, 83365, 50314, 51201, 47287, 13987, 47162, 65657, 19099, 79057, 17654, 28171, 95972, 10786, 59921, 48186, 60724, 92979, 55558, 45398, 68662, 11576, 92147, 1423, 54500, 14200, 95048, 95157, 90697, 85260, 11615, 53765, 13648, 1477, 84937, 89580, 53802, 56688, 23788, 90263, 72459, 75335, 28974, 9037, 41877, 53849, 95047, 20655, 77904, 23711, 86408, 7235, 62491, 14546, 35646, 75204, 66950, 89192, 82768, 45652, 32804, 33436, 98675, 76784, 83409, 20695, 64743, 66385, 22546, 94660, 61388, 67759, 54956, 19988, 9202, 35771, 49549, 33759, 86479, 37154, 91730, 23083, 28402, 37307, 54402, 12165, 66722, 40880, 69272, 7319, 35501, 97486, 20012, 21188, 44050, 39427, 54588, 34485, 82549, 33566, 60862, 610