In [1]:
from Data_Operations import*
import numpy as np
import pandas as pd
import pickle

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.svm import LinearSVC
from sklearn import metrics

In [2]:
class CF_Matrix:

    def __init__(self):

        self.Rows = 0
        self.Cols = 0
        self.Matrix = None

    ## Get quantity of product Prod_id purchased by user Cust_id
    def Get_Quantity(self, Data, UserID_Col, ProductID_Col, Cust_id, Prod_id, Quantity_Col):

        Cust_Id_Filter = Data.loc[Data[UserID_Col]==Cust_id]
        Prod_Id_Filter = Cust_Id_Filter.loc[Cust_Id_Filter[ProductID_Col]==Prod_id]
        Prod_Quantity = np.sum(Prod_Id_Filter[Quantity_Col])
        return Prod_Quantity

    ## Binary_Flag = 1 : Cell contains 1 if product is purchased by user
    ## Binary_Flag = 0 : Cell contains quantity of products is purchased by user
    def Build_User_Product_Matrix(self, Data, UserID_Col, ProductID_Col, Quantity_Col, Binary_Flag):

        Customer_Ids = Data[UserID_Col].unique()
        Product_Ids = Data[ProductID_Col].unique()
        
        self.Matrix = np.zeros((self.Rows, self.Cols))

        for Cust_Id in Customer_Ids:
            for Prod_Id in Product_Ids:

                Prod_Quantity = self.Get_Quantity(Data, UserID_Col, ProductID_Col, Cust_Id, Prod_Id, Quantity_Col)

                if Prod_Quantity > 0:
                    if Binary_Flag:
                        self.Matrix[Cust_Id][Prod_Id] = 1
                    else:
                        self.Matrix[Cust_Id][Prod_Id] = Prod_Quantity

def Create_Purchase_Matrix():

    Sales_data = Load_Data(Sales_filepath, Sales_Col_list, seperator=",")
    Products_data = Load_Data(Products_filepath, Products_Col_list, seperator=",")
    User_data = Load_Data(Users_filepath, Users_Col_list, seperator=";")

    ## Analysis of No. of purchase orders per customer.
    ## Instead of whole data, using only N users' data with maximum purchase orders
    Num_Orders_List, Cust_Id_List = VisualizeSalesPerCustomer(Sales_data, UserID_Col)

    ## Pre-Processing: Combine all data files in one consistent, sort and drop rows with missing value
    Combined_Data = Combine_Data(Sales_data, User_data, Products_data, Cust_Id_List, UserID_Col, ProductID_Col)
    Processed_Data = Data_Processing(Combined_Data)

    ## Create User_Product Matrix for CF
    User_Product_Mat = CF_Matrix()
    User_Product_Mat.Rows = User_data.shape[0]
    User_Product_Mat.Cols = Products_data.shape[0]
    User_Product_Mat.Build_User_Product_Matrix(Processed_Data, UserID_Col, ProductID_Col, Quantity_Col, Binary_Flag=1)

    return User_Product_Mat.Matrix, Cust_Id_List, list(Products_data[ProductID_Col])

## Create User_PersonalInformation Matrix based on information filled by users
def Create_User_Matrix():
    return

def Store_Matrix(Data_Matrix, UserID_List, DataFrame_Header, FilePath):

    Matrix_DataFrame = []
    for i in range(len(UserID_List)):
        DF_Row = [UserID_List[i]]
        DF_Row.extend(list(User_Product_Matrix[i]))
        Matrix_DataFrame.append(DF_Row)

    User_Product_Dataframe = pd.DataFrame(Matrix_DataFrame, columns=DataFrame_Header)
    User_Product_Dataframe.to_csv(FilePath)

In [3]:
Data_Folder_Path = "/Users/pranjali/Downloads/SE_Project/Data/SalesDB/"

Sales_filepath = Data_Folder_Path + "new_Reduced_sales.csv"
Sales_Col_list = ["SalesID", "CustomerID", "ProductID", "Quantity", "SalesDate"]

Products_filepath = Data_Folder_Path + "new_products.csv"
Products_Col_list = ["ProductID", "ProductName", "CategoryID", "IsAllergic"]

Users_filepath = Data_Folder_Path + "customers.csv"
Users_Col_list = ["CustomerID", "FirstName", "LastName", "CityID"]

UserID_Col = "CustomerID"
ProductID_Col = "ProductID"
Quantity_Col = "Quantity"

# TopN = 100

In [4]:
User_Product_Matrix, UserID_List, ProductID_List = Create_Purchase_Matrix()

Attribute_List = [UserID_Col]
for i in ProductID_List:
    Attribute_List.append(i)

Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/new_Reduced_sales.csv
number of rows, cols  (190814, 5)
Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/new_products.csv
number of rows, cols  (620, 4)
Loading data from  /Users/pranjali/Downloads/SE_Project/Data/SalesDB/customers.csv
number of rows, cols  (98759, 4)


In [5]:
Store_Matrix(User_Product_Matrix, UserID_List, Attribute_List, "./User_Product_Matrix.csv")
Store_Matrix(User_Product_Matrix[:1600], UserID_List[:1600], Attribute_List, "./User_Product_Matrix_Train.csv")
Store_Matrix(User_Product_Matrix[1600:], UserID_List[1600:], Attribute_List, "./User_Product_Matrix_Test.csv")

In [6]:
## Dimension Reduction using PCA
def PCA_Transform(Matrix, No_Dimensions):
    
    pca = PCA(n_components=No_Dimensions)
    Reduced_Matrix = pca.fit_transform(Matrix)
    Reduced_Matrix = pd.DataFrame(Reduced_Matrix)
    
    return Reduced_Matrix


## Clustering Algorithms

## 1. KMeans Clustering
def KMeans_Clustering(Matrix, No_Clusters):
    
    KMeans_Obj = KMeans(n_clusters=No_Clusters, init='k-means++').fit(Matrix)
    Model_Predict = KMeans_Obj
    Cluster_Labels = KMeans_Obj.labels_
    
    return Model_Predict, Cluster_Labels

## Using LineasSCV for prediction of labels for future data in case of transductive clustering algorithms
## reference: https://github.com/scikit-learn/scikit-learn/issues/901

## 2. Hierarchical Clustering
def Hierarchical_Clustering(Matrix, No_Clusters):
    Clustering_Obj = AgglomerativeClustering(n_clusters=No_Clusters, linkage="complete").fit(Matrix)
    Cluster_Labels= Clustering_Obj.labels_
    Model_Predict = LinearSVC().fit(Matrix, Cluster_Labels)
    
    return Model_Predict, Cluster_Labels

## 3. Spectral Clustering
def Spectral_Clustering(Matrix, No_Clusters):
    Clustering_Obj = SpectralClustering(n_clusters=No_Clusters, assign_labels="discretize", random_state=0).fit(Matrix)
    Cluster_Labels = Clustering_Obj.labels_
    Model_Predict = LinearSVC().fit(Matrix, Cluster_Labels)
 
    return Model_Predict, Cluster_Labels


## Clustering performance evaluation metrics
def Clustering_Evaluation(Matrix, Cluster_Labels, Dist_Metric='euclidean'):
    
    ## 1. The Silhouette Coefficient
    ## The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering. 
    ## Scores around zero indicate overlapping clusters.
    
    Silhouette_Score = metrics.silhouette_score(Matrix, Cluster_Labels, metric=Dist_Metric)
    
    ## 2. The Calinski-Harabasz index (Variance Ratio Criteria)
    ## The index is the ratio of the sum of between-clusters dispersion and of inter-cluster dispersion for all clusters.
    ## The score is higher when clusters are dense and well separated.
    
#     CH_Index = metrics.calinski_harabasz_score(Matrix, Cluster_Labels)
    CH_Index = 0
    
    ## 3. The Davies-Bouldin index
    ## This index signifies the average ‘similarity’ between clusters, where the similarity is a measure 
    ## that compares the distance between clusters with the size of the clusters themselves.
    ## Zero is the lowest possible score. Values closer to zero indicate a better partition.
    
    DB_Index = metrics.davies_bouldin_score(Matrix, Cluster_Labels)
    
    return Silhouette_Score, CH_Index, DB_Index



def Perform_Clustering(Matrix, Clustering_Algo, No_Clusters):
    
    if Clustering_Algo=='KMeans':
        Model_Predict, Labels = KMeans_Clustering(Matrix, No_Clusters)
        Evaluation_Metrics = Clustering_Evaluation(Matrix, Labels, 'euclidean')
                
    elif Clustering_Algo=='Hierarchical':
        Model_Predict, Labels = Hierarchical_Clustering(Matrix, No_Clusters)
        Evaluation_Metrics = Clustering_Evaluation(Matrix, Labels)
        
    elif Clustering_Algo=='Spectral':
        Model_Predict, Labels = Spectral_Clustering(Matrix, No_Clusters)
        Evaluation_Metrics = Clustering_Evaluation(Matrix, Labels)

    else:
        print("Invalid algotirthm: ", Clustering_Algo)
        
    return (Model_Predict, Labels, Evaluation_Metrics)



def Clustering_Comparison(Matrix, Clustering_AlgoList=[], No_Clusters_List=[], PCA_List=[]):
        
    Clustering_Results = []
    
    for dimension in PCA_List:
        if int(dimension) != 0:
            pca_dimension = dimension
            Reduced_Matrix = PCA_Transform(Matrix, pca_dimension)
        else:
            pca_dimension = Matrix.shape[1]
            Reduced_Matrix = Matrix
            
        for algo in Clustering_AlgoList:
            for No_Clusters in No_Clusters_List:
                
                Model_Predict, Labels, Evaluation_Metrics = Perform_Clustering(Reduced_Matrix, algo, No_Clusters)

                print("---------------------------------------------------------------------------")
                print("Performace for algo: " + str(algo) + ", No. of clusters: "+ str(No_Clusters) + ", PCA with dimensions: " + str(pca_dimension))
                print("Silhouette Coefficient:  ", Evaluation_Metrics[0])
#                 print("Calinski-Harabasz Index: ", Evaluation_Metrics[1])
                print("Davies-Bouldin Index:    ", Evaluation_Metrics[2])
                
                Model_Details = (algo, No_Clusters, pca_dimension)
                Clustering_Results.append((Model_Details, Evaluation_Metrics))
    
#     Sorted_Result = sorted(Clustering_Results, key=lambda x: (-x[1][0], -x[1][1], x[1][2]))
    Sorted_Result = sorted(Clustering_Results, key=lambda x: (x[1][2], -x[1][0]))
    Best_Result = Sorted_Result[0]
    
    Best_Result_Eval = Best_Result[1]
    print("=========================== BEST RESULT ============================")
    print("Silhouette Coefficient:  ", Best_Result_Eval[0])
    print("Calinski-Harabasz Index: ", Best_Result_Eval[1])
    print("Davies-Bouldin Index:    ", Best_Result_Eval[2])
    
    Best_Model_Params = Best_Result[0]
    print("============================ BEST MODEL =============================")
    print("Algorithm:           ", Best_Model_Params[0])
    print("No. of clusters:     ", Best_Model_Params[1])
    print("Dimension Reduction: ", Best_Model_Params[2])
    
    return Best_Model_Params
    
## Store Clustering Algorithm Output
def Store_Clusters(Model_Predict, pca_dimension, UserID_List, Labels, Output_Folder_Path):
    
    ## Store Model Params which will be used in prediction of labels for future data.
    ## Load this model and call predict() function on 'Model_Predict' for inference.
    Saved_Model = {'Model' : Model_Predict, 'Dimensions' : pca_dimension}
    with open(Output_Folder_Path + "/Saved_Model.pkl", 'wb') as output_file:
        pickle.dump(Saved_Model, output_file)
    print("Storing trained clustering model...")
    
    ## Store UserIDs divided into clusters which will be used to train autoencoders in next phase.
    Clusters = {}
    for i in range(len(Labels)):
        
        label = Labels[i]
        UserID = UserID_List[i]
        
        if label not in Clusters.keys():
            Clusters[label] = [UserID]
        else:
            Clusters[label].append(UserID)
            
    with open(Output_Folder_Path + "/Clusters.pkl", 'wb') as output_file:
        pickle.dump(Clusters, output_file)
    print("Storing clustered UserIDs...")
    

In [7]:
Matrix_FilePath = "./User_Product_Matrix_Train.csv"
Col_List=[str(i) for i in range(1,453)]

Clustering_AlgoList = ['KMeans', 'Hierarchical', 'Spectral']
No_Clusters_List = [2, 3, 4, 5, 6, 7]
PCA_List = [0, 100, 200]
UserID_Col = "CustomerID"

In [8]:
Matrix_Data = pd.read_csv(Matrix_FilePath, sep=',')
Matrix = Matrix_Data[Col_List] 

Best_Model = Clustering_Comparison(Matrix, Clustering_AlgoList, No_Clusters_List, PCA_List)

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.9772982018569379
Davies-Bouldin Index:     1.1268951264276887
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   0.9774686762894774
Davies-Bouldin Index:     3.368787866375447


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 452
Silhouette Coefficient:   0.9775836845371536
Davies-Bouldin Index:     3.0640589981568724
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   0.9776737832084612
Davies-Bouldin Index:     2.8184238300168327


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   0.9777679429536245
Davies-Bouldin Index:     2.491733713863003
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   0.9779047344365017
Davies-Bouldin Index:     2.2909855927233638


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.9636923520273806
Davies-Bouldin Index:     1.0249541826113706


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   0.9652755538015182
Davies-Bouldin Index:     1.9008946656062211


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 4, PCA with dimensions: 452
Silhouette Coefficient:   0.973295948323976
Davies-Bouldin Index:     2.6724383041746074


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   0.9734006253475453
Davies-Bouldin Index:     2.324959952431135


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   0.9734186643607393
Davies-Bouldin Index:     2.4456091408581653


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   0.9752977564028709
Davies-Bouldin Index:     2.4112696853180577


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 452
Silhouette Coefficient:   0.9635292994567283
Davies-Bouldin Index:     1.014995991504785


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 452
Silhouette Coefficient:   0.9677193332308122
Davies-Bouldin Index:     1.9041573560900387


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 452
Silhouette Coefficient:   0.9701050859131687
Davies-Bouldin Index:     2.2064651507083566


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 452
Silhouette Coefficient:   0.9706451556807103
Davies-Bouldin Index:     2.1305917654947093


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 452
Silhouette Coefficient:   0.9707136068226359
Davies-Bouldin Index:     2.01892714145676


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 452
Silhouette Coefficient:   0.9712998499043246
Davies-Bouldin Index:     1.868033612054329
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.977298201640053
Davies-Bouldin Index:     1.1268951266637819


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.9774609159773885
Davies-Bouldin Index:     3.3854028300882812
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   0.9775811855961765
Davies-Bouldin Index:     2.4105545142305154
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   0.9777540662885061
Davies-Bouldin Index:     2.7787999883228687


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   0.9777727317506194
Davies-Bouldin Index:     2.79716985923015
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 100
Silhouette Coefficient:   0.9778417752701446
Davies-Bouldin Index:     2.504351539561704


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.9629328029896559
Davies-Bouldin Index:     0.9376720283706825


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.9654135686743467
Davies-Bouldin Index:     1.8591925089132548


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   0.9682269311777683
Davies-Bouldin Index:     2.18085335111447


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   0.9682517098864071
Davies-Bouldin Index:     1.8525329783248616


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   0.9746541550251373
Davies-Bouldin Index:     2.3397530235557444


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 7, PCA with dimensions: 100
Silhouette Coefficient:   0.9747156772997672
Davies-Bouldin Index:     2.2301122165690255


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 100
Silhouette Coefficient:   0.9630389443950478
Davies-Bouldin Index:     1.002141748266082


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 100
Silhouette Coefficient:   0.9682177817973124
Davies-Bouldin Index:     1.9422642868243223


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 100
Silhouette Coefficient:   0.9701050857000074
Davies-Bouldin Index:     2.206465150708353


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 100
Silhouette Coefficient:   0.9706451554657408
Davies-Bouldin Index:     2.1305917654947133


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 100
Silhouette Coefficient:   0.9712725399126774
Davies-Bouldin Index:     2.0596417189517537


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 100
Silhouette Coefficient:   0.9719309431934972
Davies-Bouldin Index:     1.9101363800865552


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.9772982015576488
Davies-Bouldin Index:     1.1268951264335245
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   0.9774836713564651
Davies-Bouldin Index:     3.085716034527111


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   0.9775756204891977
Davies-Bouldin Index:     2.9507343260165624
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   0.9776678043574943
Davies-Bouldin Index:     2.775401511216643


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 6, PCA with dimensions: 200
Silhouette Coefficient:   0.9777175594224967
Davies-Bouldin Index:     2.510632504052194
---------------------------------------------------------------------------
Performace for algo: KMeans, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   0.9778830246029415
Davies-Bouldin Index:     2.4813132551594217


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.9690650908303876
Davies-Bouldin Index:     1.1129832085972997


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   0.9690151611844677
Davies-Bouldin Index:     2.419496194086719


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   0.9690353924427839
Davies-Bouldin Index:     2.284215437305933


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   0.9690622610465539
Davies-Bouldin Index:     2.121411598835903


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 6, PCA with dimensions: 200
Silhouette Coefficient:   0.9735785225574616
Davies-Bouldin Index:     2.2632644684773355


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Hierarchical, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   0.9736948148830393
Davies-Bouldin Index:     2.12870148647703


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 2, PCA with dimensions: 200
Silhouette Coefficient:   0.9630295898617974
Davies-Bouldin Index:     0.9892152283714113


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 3, PCA with dimensions: 200
Silhouette Coefficient:   0.9682177817165786
Davies-Bouldin Index:     1.9422642868243145


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 4, PCA with dimensions: 200
Silhouette Coefficient:   0.9706679912591636
Davies-Bouldin Index:     2.2324020282869177


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 5, PCA with dimensions: 200
Silhouette Coefficient:   0.9706451553840642
Davies-Bouldin Index:     2.13059176549471


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 6, PCA with dimensions: 200
Silhouette Coefficient:   0.9712725398309496
Davies-Bouldin Index:     2.0596417189517497
---------------------------------------------------------------------------
Performace for algo: Spectral, No. of clusters: 7, PCA with dimensions: 200
Silhouette Coefficient:   0.9713434681980162
Davies-Bouldin Index:     1.8117750137365518
Silhouette Coefficient:   0.9629328029896559
Calinski-Harabasz Index:  0
Davies-Bouldin Index:     0.9376720283706825
Algorithm:            Hierarchical
No. of clusters:      2
Dimension Reduction:  100


  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances


In [9]:
if Best_Model[2] != Matrix.shape[1]:
    Reduced_Matrix = PCA_Transform(Matrix, Best_Model[2])
else:
    Reduced_Matrix = Matrix
    
Model_Predict, Labels, Evaluation_Metrics = Perform_Clustering(Reduced_Matrix, Best_Model[0], Best_Model[1])


  score = (intra_dists[:, None] + intra_dists) / centroid_distances


In [10]:
print(Labels)

[1 1 1 ... 1 1 1]


In [11]:
UserIDList = list(Matrix_Data['CustomerID'])
Store_Clusters(Model_Predict, Best_Model[2], UserIDList, Labels, "./")

Storing trained clustering model...
Storing clustered UserIDs...


In [12]:
Matrix_FilePath = "./User_Product_Matrix_Test.csv"
Col_List=[str(i) for i in range(1,621)]

Test_Data = pd.read_csv(Matrix_FilePath, sep=',')
Test_Matrix = Test_Data[Col_List]

In [13]:
Model_Predict = None
with open("./Saved_Model.pkl", 'rb') as file:
    Saved_Model = pickle.load(file)

Pca_Dimensions = Saved_Model['Dimensions']
Model_Predict = Saved_Model['Model']

if Pca_Dimensions != Test_Matrix.shape[1]:
    Reduced_Matrix = PCA_Transform(Test_Matrix, Pca_Dimensions)
else:
    Reduced_Matrix = Test_Matrix

Predictions = Model_Predict.predict(Reduced_Matrix)
print(type(Predictions))

<class 'numpy.ndarray'>


In [14]:
print(Predictions.shape)

(400,)


In [15]:
print(Predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
