In [1]:
from Data_Operations import*
import numpy as np
import pandas as pd
import pickle

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.svm import LinearSVC
from sklearn import metrics

In [2]:
class CF_Matrix:

    def __init__(self):

        self.Rows = 0
        self.Cols = 0
        self.Matrix = None

    ## Get quantity of product Prod_id purchased by user Cust_id
    def Get_Quantity(self, Data, UserID_Col, ProductID_Col, Cust_id, Prod_id, Quantity_Col):

        Cust_Id_Filter = Data.loc[Data[UserID_Col]==Cust_id]
        Prod_Id_Filter = Cust_Id_Filter.loc[Cust_Id_Filter[ProductID_Col]==Prod_id]
        Prod_Quantity = np.sum(Prod_Id_Filter[Quantity_Col])
        return Prod_Quantity

    ## Binary_Flag = 1 : Cell contains 1 if product is purchased by user
    ## Binary_Flag = 0 : Cell contains quantity of products is purchased by user
    def Build_User_Product_Matrix(self, Data, UserID_Col, ProductID_Col, Quantity_Col, Binary_Flag):

        Customer_Ids = Data[UserID_Col].unique()
        Product_Ids = Data[ProductID_Col].unique()

        self.Rows = Customer_Ids.shape[0]
        self.Cols = Product_Ids.shape[0]
        self.Matrix = np.zeros((self.Rows, self.Cols))

        for i in range(self.Rows):
            for j in range(self.Cols):

                Cust_Id = Customer_Ids[i]
                Prod_Id = Product_Ids[j]
                Prod_Quantity = self.Get_Quantity(Data, UserID_Col, ProductID_Col, Cust_Id, Prod_Id, Quantity_Col)

                if Prod_Quantity > 0:
                    if Binary_Flag:
                        self.Matrix[i][j] = 1
                    else:
                        self.Matrix[i][j] = Prod_Quantity

def Create_Purchase_Matrix():

    Sales_data = Load_Data(Sales_filepath, Sales_Col_list, seperator=";")
    Products_data = Load_Data(Products_filepath, Products_Col_list, seperator=";")
    User_data = Load_Data(Users_filepath, Users_Col_list, seperator=";")

    ## Analysis of No. of purchase orders per customer.
    ## Instead of whole data, using only N users' data with maximum purchase orders
    Num_Orders_List, Cust_Id_List = VisualizeSalesPerCustomer(Sales_data, UserID_Col)
    Num_Orders_List = Num_Orders_List[:TopN]
    Cust_Id_List = Cust_Id_List[:TopN]
    # Plot_Graph(Cust_Id_List, Num_Orders_List)

    ## Pre-Processing: Combine all data files in one consistent, sort and drop rows with missing value
    Combined_Data = Combine_Data(Sales_data, User_data, Products_data, Cust_Id_List, UserID_Col, ProductID_Col)
    Processed_Data = Data_Processing(Combined_Data)

    ## Create User_Product Matrix for CF
    User_Product_Mat = CF_Matrix()
    User_Product_Mat.Build_User_Product_Matrix(Processed_Data, UserID_Col, ProductID_Col, Quantity_Col, Binary_Flag=1)

    return User_Product_Mat.Matrix, Cust_Id_List, list(Products_data[ProductID_Col])

## Create User_PersonalInformation Matrix based on information filled by users
def Create_User_Matrix():
    return

def Store_Matrix(Data_Matrix, UserID_List, DataFrame_Header, FilePath):

    Matrix_DataFrame = []
    for i in range(len(UserID_List)):
        DF_Row = [UserID_List[i]]
        DF_Row.extend(list(User_Product_Matrix[i]))
        Matrix_DataFrame.append(DF_Row)

    User_Product_Dataframe = pd.DataFrame(Matrix_DataFrame, columns=DataFrame_Header)
    User_Product_Dataframe.to_csv(FilePath)

In [3]:
Data_Folder_Path = "/Users/pranjali/Downloads/SE_Project/Data/SalesDB/"

Sales_filepath = Data_Folder_Path + "sales.csv"
Sales_Col_list = ["SalesID", "CustomerID", "ProductID", "Quantity", "SalesDate"]

Products_filepath = Data_Folder_Path + "products.csv"
Products_Col_list = ["ProductID", "ProductName", "CategoryID", "IsAllergic"]

Users_filepath = Data_Folder_Path + "customers.csv"
Users_Col_list = ["CustomerID", "FirstName", "LastName", "CityID"]

UserID_Col = "CustomerID"
ProductID_Col = "ProductID"
Quantity_Col = "Quantity"

TopN = 500

In [4]:
User_Product_Matrix, UserID_List, ProductID_List = Create_Purchase_Matrix()

Attribute_List = [UserID_Col]
for i in ProductID_List:
    Attribute_List.append(i)

Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/sales.csv
number of rows, cols  (6758125, 5)
Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/products.csv
number of rows, cols  (452, 4)
Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/customers.csv
number of rows, cols  (98759, 4)


In [5]:
Store_Matrix(User_Product_Matrix[:300], UserID_List[:300], Attribute_List, "./User_Product_Matrix_Train.csv")
Store_Matrix(User_Product_Matrix[300:], UserID_List[300:], Attribute_List, "./User_Product_Matrix_Test.csv")

In [6]:
## Dimension Reduction using PCA
def PCA_Transform(Matrix, No_Dimensions):
    
    pca = PCA(n_components=No_Dimensions)
    Reduced_Matrix = pca.fit_transform(Matrix)
    Reduced_Matrix = pd.DataFrame(Reduced_Matrix)
    
    return Reduced_Matrix


## Clustering Algorithms

## 1. KMeans Clustering
def KMeans_Clustering(Matrix, No_Clusters):
    
    KMeans_Obj = KMeans(n_clusters=No_Clusters, init='k-means++').fit(Matrix)
    Model_Predict = KMeans_Obj
    Cluster_Labels = KMeans_Obj.labels_
    
    return Model_Predict, Cluster_Labels

## Using LineasSCV for prediction of labels for future data in case of transductive clustering algorithms
## reference: https://github.com/scikit-learn/scikit-learn/issues/901

## 2. Hierarchical Clustering
def Hierarchical_Clustering(Matrix, No_Clusters):
    Clustering_Obj = AgglomerativeClustering(n_clusters=No_Clusters, linkage="complete").fit(Matrix)
    Cluster_Labels= Clustering_Obj.labels_
    Model_Predict = LinearSVC().fit(Matrix, Cluster_Labels)
    
    return Model_Predict, Cluster_Labels

## 3. Spectral Clustering
def Spectral_Clustering(Matrix, No_Clusters):
    Clustering_Obj = SpectralClustering(n_clusters=No_Clusters, assign_labels="discretize", random_state=0).fit(Matrix)
    Cluster_Labels = Clustering_Obj.labels_
    Model_Predict = LinearSVC().fit(Matrix, Cluster_Labels)
 
    return Model_Predict, Cluster_Labels


## Clustering performance evaluation metrics
def Clustering_Evaluation(Matrix, Cluster_Labels, Dist_Metric='euclidean'):
    
    ## 1. The Silhouette Coefficient
    ## The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. 
    ## Scores around zero indicate overlapping clusters.
    
    Silhouette_Score = metrics.silhouette_score(Matrix, Cluster_Labels, metric=Dist_Metric)
    
    ## 2. The Calinski-Harabasz index (Variance Ratio Criteria)
    ## The index is the ratio of the sum of between-clusters dispersion and of inter-cluster dispersion for all clusters.
    ## The score is higher when clusters are dense and well separated.
    
#     CH_Index = metrics.calinski_harabasz_score(Matrix, Cluster_Labels)
    CH_Index = 0
    
    ## 3. The Davies-Bouldin index
    ## This index signifies the average ‘similarity’ between clusters, where the similarity is a measure 
    ## that compares the distance between clusters with the size of the clusters themselves.
    ## Zero is the lowest possible score. Values closer to zero indicate a better partition.
    
    DB_Index = metrics.davies_bouldin_score(Matrix, Cluster_Labels)
    
    return Silhouette_Score, CH_Index, DB_Index



def Perform_Clustering(Matrix, Clustering_Algo, No_Clusters):
    
    if Clustering_Algo=='KMeans':
        Model_Predict, Labels = KMeans_Clustering(Matrix, No_Clusters)
        Evaluation_Metrics = Clustering_Evaluation(Matrix, Labels, 'euclidean')
                
    elif Clustering_Algo=='Hierarchical':
        Model_Predict, Labels = Hierarchical_Clustering(Matrix, No_Clusters)
        Evaluation_Metrics = Clustering_Evaluation(Matrix, Labels)
        
    elif Clustering_Algo=='Spectral':
        Model_Predict, Labels = Spectral_Clustering(Matrix, No_Clusters)
        Evaluation_Metrics = Clustering_Evaluation(Matrix, Labels)

    else:
        print("Invalid algotirthm: ", Clustering_Algo)
        
    return (Model_Predict, Labels, Evaluation_Metrics)



def Clustering_Comparison(Matrix, Clustering_AlgoList=[], No_Clusters_List=[], PCA_List=[]):
        
    Clustering_Results = []
    
    for dimension in PCA_List:
        if int(dimension) != 0:
            pca_dimension = dimension
            Reduced_Matrix = PCA_Transform(Matrix, pca_dimension)
        else:
            pca_dimension = Matrix.shape[1]
            Reduced_Matrix = Matrix
            
        for algo in Clustering_AlgoList:
            for No_Clusters in No_Clusters_List:
                
                Model_Predict, Labels, Evaluation_Metrics = Perform_Clustering(Reduced_Matrix, algo, No_Clusters)

                print("---------------------------------------------------------------------------")
                print("Performace for algo: " + str(algo) + ", No. of clusters: "+ str(No_Clusters) + ", PCA with dimensions: " + str(pca_dimension))
                print("Silhouette Coefficient:  ", Evaluation_Metrics[0])
#                 print("Calinski-Harabasz Index: ", Evaluation_Metrics[1])
                print("Davies-Bouldin Index:    ", Evaluation_Metrics[2])
                
                Model_Details = (algo, No_Clusters, pca_dimension)
                Clustering_Results.append((Model_Details, Evaluation_Metrics))
    
#     Sorted_Result = sorted(Clustering_Results, key=lambda x: (-x[1][0], -x[1][1], x[1][2]))
    Sorted_Result = sorted(Clustering_Results, key=lambda x: (x[1][2], -x[1][0]))
    Best_Result = Sorted_Result[0]
    
    Best_Result_Eval = Best_Result[1]
    print("=========================== BEST RESULT ============================")
    print("Silhouette Coefficient:  ", Best_Result_Eval[0])
    print("Calinski-Harabasz Index: ", Best_Result_Eval[1])
    print("Davies-Bouldin Index:    ", Best_Result_Eval[2])
    
    Best_Model_Params = Best_Result[0]
    print("============================ BEST MODEL =============================")
    print("Algorithm:           ", Best_Model_Params[0])
    print("No. of clusters:     ", Best_Model_Params[1])
    print("Dimension Reduction: ", Best_Model_Params[2])
    
    return Best_Model_Params
    
## Store Clustering Algorithm Output
def Store_Clusters(Model_Predict, pca_dimension, UserID_List, Labels, Output_Folder_Path):
    
    ## Store Model Params which will be used in prediction of labels for future data.
    ## Load this model and call predict() function on 'Model_Predict' for inference.
    Saved_Model = {'Model' : Model_Predict, 'Dimensions' : pca_dimension}
    with open(Output_Folder_Path + "/Saved_Model.pkl", 'wb') as output_file:
        pickle.dump(Saved_Model, output_file)
    print("Storing trained clustering model...")
    
    ## Store UserIDs divided into clusters which will be used to train autoencoders in next phase.
    Clusters = {}
    for i in range(len(Labels)):
        
        label = Labels[i]
        UserID = UserID_List[i]
        
        if label not in Clusters.keys():
            Clusters[label] = [UserID]
        else:
            Clusters[label].append(UserID)
            
    with open(Output_Folder_Path + "/Clusters.pkl", 'wb') as output_file:
        pickle.dump(Clusters, output_file)
    print("Storing clustered UserIDs...")
    

In [7]:
Matrix_FilePath = "./User_Product_Matrix_Train.csv"
Col_List=[str(i) for i in range(1,453)]

Clustering_AlgoList = ['KMeans', 'Hierarchical', 'Spectral']
No_Clusters_List = [2, 3, 4, 5, 6, 7]
PCA_List = [0, 100, 200]

In [8]:
Matrix_Data = pd.read_csv(Matrix_FilePath, sep=',')
Matrix = Matrix_Data[Col_List] 

Best_Model = Clustering_Comparison(Matrix, Clustering_AlgoList, No_Clusters_List, PCA_List)

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.00356426411566523
Davies-Bouldin Index:     11.979578198455354
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   0.0015625883592200215
Davies-Bouldin Index:     10.666281793769025
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 452
Silhouette Coefficient:   0.0016664835400951794
Davies-Bouldin Index:     9.773605339126073
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   0.000787819677156442
Davies-Bouldin Index:     9.02767567887338


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   0.00014453463820864761
Davies-Bouldin Index:     8.479205744764107
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   0.0003931246319560526
Davies-Bouldin Index:     7.978363412532349
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.0015329517508618632
Davies-Bouldin Index:     14.283148891575497
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   0.0005661530663231858
Davies-Bouldin Index:     11.771011457829202
-----------------

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   0.00022334660868249265
Davies-Bouldin Index:     9.48064101987933
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   0.00016941790278508396
Davies-Bouldin Index:     8.474428921394466
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   -1.0541467606793355e-05
Davies-Bouldin Index:     7.800992429929379


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.0013407860358739035
Davies-Bouldin Index:     5.997475096072163
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   -0.0022141911641490983
Davies-Bouldin Index:     5.823441559639466


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 452
Silhouette Coefficient:   -0.007901345395552291
Davies-Bouldin Index:     5.900856088939536
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   -0.0023122080717363604
Davies-Bouldin Index:     6.827838030633117


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   -0.0029513255638792353
Davies-Bouldin Index:     6.5714690522261225
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   -0.0039049990480202403
Davies-Bouldin Index:     5.7977700906512775
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.006949033624798493
Davies-Bouldin Index:     9.646477129827321


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.005292934501496011
Davies-Bouldin Index:     8.41961967581898
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   0.005245030395761392
Davies-Bouldin Index:     7.73113597602236
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   0.0050942496980848125
Davies-Bouldin Index:     7.334826477748122
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   0.004338347463753804
Davies-Bouldin Index:     6.869821623800946
-------------------------------------

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.0020482082866597274
Davies-Bouldin Index:     11.885742917900734
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.000949692103264111
Davies-Bouldin Index:     10.866544324547222
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   -0.0011186364574578714
Davies-Bouldin Index:     9.794831797837249


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   -0.002394498461171583
Davies-Bouldin Index:     8.798716972174432
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   -0.0016727532923399558
Davies-Bouldin Index:     7.912211918074793
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 7, PCA with dimensions: 100
Silhouette Coefficient:   -0.0029290385816992865
Davies-Bouldin Index:     7.55007049819139


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.0018229646935688615
Davies-Bouldin Index:     3.6476347084827396


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.0015449464898237883
Davies-Bouldin Index:     2.9283326750708993


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   -0.006205770639038852
Davies-Bouldin Index:     4.199795727811753


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   -0.01619839060194251
Davies-Bouldin Index:     3.7724581532780164
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   -0.02880226603784994
Davies-Bouldin Index:     3.908006298939442


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 100
Silhouette Coefficient:   -0.017145528327669572
Davies-Bouldin Index:     5.372916438701138
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.004351697292739571
Davies-Bouldin Index:     11.243895655347917
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   0.002767571721222968
Davies-Bouldin Index:     10.135160780077278
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   0.0021261310263066013
Davies-Bouldin Index:     9.306424293494581


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   0.0018047669042093291
Davies-Bouldin Index:     8.585587107608381
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 200
Silhouette Coefficient:   0.001153740502652073
Davies-Bouldin Index:     8.318092118442578
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   0.00035885153021513236
Davies-Bouldin Index:     7.917908569198921
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.001613573716880828
Davies-Bouldin Index:     14.233352827083063
--------------------------

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   2.168529334454375e-05
Davies-Bouldin Index:     10.10239144779477
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   -0.0003165147292816999
Davies-Bouldin Index:     9.466379950455813
---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 6, PCA with dimensions: 200
Silhouette Coefficient:   -0.0007328267303033239
Davies-Bouldin Index:     8.826930013144194


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   -0.0005843078669815307
Davies-Bouldin Index:     8.319329704441218


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   -0.00032323710448786523
Davies-Bouldin Index:     7.006234171662715


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   -0.0019788799363261718
Davies-Bouldin Index:     6.800136650701185


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   -0.003324213825566458
Davies-Bouldin Index:     6.144610710578338
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   -0.007902310315181927
Davies-Bouldin Index:     5.485759149698926
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 200
Silhouette Coefficient:   -0.008872580968320463
Davies-Bouldin Index:     5.354095904402779
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   -0.008913370811874256
Davies-Bouldin Index:     5.314229988353718
Silhouette Coefficient: 

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


In [9]:
if Best_Model[2] != Matrix.shape[1]:
    Reduced_Matrix = PCA_Transform(Matrix, Best_Model[2])
else:
    Reduced_Matrix = Matrix
    
Model_Predict, Labels, Evaluation_Metrics = Perform_Clustering(Reduced_Matrix, Best_Model[0], Best_Model[1])


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


In [10]:
print(Labels)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]


In [11]:
UserIDList = list(Matrix_Data['CustomerID'])
Store_Clusters(Model_Predict, Best_Model[2], UserIDList, Labels, "./")

Storing trained clustering model...
Storing clustered UserIDs...


In [12]:
Matrix_FilePath = "./User_Product_Matrix_Test.csv"
Col_List=[str(i) for i in range(1,453)]

Test_Data = pd.read_csv(Matrix_FilePath, sep=',')
Test_Matrix = Test_Data[Col_List]

In [13]:
Model_Predict = None
with open("./Saved_Model.pkl", 'rb') as file:
    Saved_Model = pickle.load(file)

Pca_Dimensions = Saved_Model['Dimensions']
Model_Predict = Saved_Model['Model']

if Pca_Dimensions != Test_Matrix.shape[1]:
    Reduced_Matrix = PCA_Transform(Test_Matrix, Pca_Dimensions)
else:
    Reduced_Matrix = Test_Matrix

Predictions = Model_Predict.predict(Reduced_Matrix)
print(type(Predictions))

<class 'numpy.ndarray'>


In [14]:
print(Predictions.shape)

(200,)


In [15]:
print(Predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
