In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import spatial
import sys
import warnings
# Unless warning options were supplied when the interpreter was started
# (sys.warnoptions is non-empty), ignore every warning for the whole notebook.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by the
# cells below, so problems there will not be reported.
if not sys.warnoptions:
    warnings.simplefilter("ignore")


from scipy.cluster.hierarchy import ward, complete, fcluster
from scipy.spatial.distance import pdist,cosine,cdist
from sklearn.cluster import KMeans
from sklearn import metrics

In [5]:
def ward_clustering(X,vectorColumns,threshold):
    '''Flat clustering of the rows of ``X`` using Ward linkage.

    Parameters:
        X: DataFrame with one row per sample.
        vectorColumns: names of the columns of ``X`` that hold the feature vector.
        threshold: distance cut-off handed to ``fcluster`` (criterion='distance').

    Returns:
        An array with one cluster label per row. When ``X`` has two rows or
        fewer no clustering is run and the index values of ``X`` are returned
        instead, i.e. every row gets its own label.
    '''
    n_rows, _ = X.shape
    if n_rows <= 2:
        # Too few samples: fall back to one distinct label per row.
        return X.index.values

    # Condensed Euclidean distance matrix -> Ward linkage -> flat labels.
    distances = pdist(X[vectorColumns], metric='euclidean')
    linkage_matrix = ward(distances)
    return fcluster(linkage_matrix, t=threshold, criterion='distance')
    
def complete_clustering(X,vectorColumns,threshold):
    '''Flat clustering of the rows of ``X`` using complete linkage.

    Parameters:
        X: DataFrame with one row per sample.
        vectorColumns: names of the columns of ``X`` that hold the feature vector.
        threshold: distance cut-off handed to ``fcluster`` (criterion='distance').

    Returns:
        An array with one cluster label per row. When ``X`` has two rows or
        fewer no clustering is run and the index values of ``X`` are returned
        instead, i.e. every row gets its own label.
    '''
    n_rows, _ = X.shape
    if n_rows <= 2:
        # Too few samples: fall back to one distinct label per row.
        return X.index.values

    # Condensed Euclidean distance matrix -> complete linkage -> flat labels.
    distances = pdist(X[vectorColumns], metric='euclidean')
    linkage_matrix = complete(distances)
    return fcluster(linkage_matrix, t=threshold, criterion='distance')

def kmeans_clustering(X,vectorColumns, n_clusters, random_state=42):
    '''Cluster the rows of ``X`` with KMeans.

    Parameters:
        X: DataFrame with one row per sample.
        vectorColumns: names of the columns of ``X`` that hold the feature vector.
        n_clusters: number of clusters passed to ``KMeans``.
        random_state: seed passed to ``KMeans``.

    Returns:
        The ``labels_`` array of the fitted model. When ``X`` has two rows or
        fewer no model is fitted and the index values of ``X`` are returned
        instead, i.e. every row gets its own label.
    '''
    n_rows, _ = X.shape
    if n_rows <= 2:
        # Too few samples: fall back to one distinct label per row.
        return X.index.values

    # fit() returns the estimator itself, so the labels can be read off directly.
    fitted = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X[vectorColumns])
    return fitted.labels_
    
      
def f_silhouette_score(df, labels, scores, vector_columns=None):
    '''Append the Silhouette score of one clustering to ``scores``.

    Parameters:
        df: DataFrame holding the clustered samples.
        labels: pandas Series with one cluster label per row of ``df``.
        scores: list that is extended in place with the result.
        vector_columns: names of the feature columns of ``df``. When omitted,
            the notebook-level ``vectorColumns`` list is used, which must then
            already be defined.

    Returns:
        None. Exactly one value is appended to ``scores``: the Euclidean
        Silhouette score, or the placeholder 9999 when the labels form fewer
        than two clusters or as many clusters as ``df`` has rows.
    '''
    n_clusters = labels.nunique()
    if n_clusters < 2 or n_clusters == len(df):
        # The score cannot be computed for these labelings; record a placeholder.
        scores.append(9999)
        return

    if vector_columns is None:
        # Backward-compatible fallback for callers that rely on the global list.
        vector_columns = vectorColumns

    score = metrics.silhouette_score(df[vector_columns], labels, metric='euclidean')
    scores.append(score)

In [19]:
# Per-user clustering experiment: every user with a long enough history is
# clustered with Ward, complete-link and two KMeans variants, and the
# Silhouette score of each clustering is collected.
# it takes about 1 hour to run this


# distance cut-offs for the Ward and complete-link flat clusterings
threshold_w = 3
threshold_c = 1

# users with fewer history rows than this are skipped
MIN_HISTORY_SIZE = 100
# number of clusters for the fixed-k KMeans run
KMEANS_K = 10

# lists to aggregate scores over users' clusterings
ward_matrix_score = []
kmeans_ward_matrix_score = []
kmeans_k_matrix_score = []
complete_matrix_score = []

history = 'generate/user_history_19.csv'
df = pd.read_csv(history)

df_news_embedding = pd.read_csv('generate/news_embedding.csv')
df_news_meta = pd.read_csv('generate/news_cleaned.csv')

df = df.merge(df_news_embedding,on='NID')
df = df.merge(df_news_meta,on='NID')

# feature columns are picked by name: every column starting with 'V'
vectorColumns = [c for c in df.columns if c.startswith('V')]

# list of unique users
# NOTE(review): [1:] drops the first user in order of appearance; the reason is
# not visible in this notebook -- confirm that it is intentional.
users = df['UID'].unique().tolist()[1:]
# print(len(users))

# Count the rows per user once, so that short histories are skipped without
# running a boolean filter over the whole frame for each of them.
history_sizes = df['UID'].value_counts()
eligible_users = set(history_sizes[history_sizes >= MIN_HISTORY_SIZE].index)

# create clustering for each user
for user in users:
    if user not in eligible_users:
        continue

    # Work on an independent copy so the label columns below are added to this
    # frame and never to a slice of df.
    df_user = df[df['UID'] == user].copy()

    # WARD clustering
    df_user['labels_ward'] = ward_clustering(df_user,vectorColumns,threshold_w)

    # COMPLETE LINK clustering
    df_user['labels_complete'] = complete_clustering(df_user,vectorColumns,threshold_c)

    # KMEANS clustering with as many clusters as Ward produced
    n = df_user['labels_ward'].nunique()
    df_user['labels_kmeans_ward'] = kmeans_clustering(df_user,vectorColumns,n)

    # KMEANS clustering with k = KMEANS_K
    n = KMEANS_K
    df_user['labels_kmeans_k'] = kmeans_clustering(df_user,vectorColumns,n)

    # WARD silhouette_score
    f_silhouette_score(df_user, df_user['labels_ward'], ward_matrix_score)

    # COMPLETE silhouette_score
    f_silhouette_score(df_user, df_user['labels_complete'], complete_matrix_score)

    # KMEANS based on Ward silhouette_score
    f_silhouette_score(df_user, df_user['labels_kmeans_ward'], kmeans_ward_matrix_score)

    # KMEANS k = KMEANS_K silhouette_score
    f_silhouette_score(df_user, df_user['labels_kmeans_k'], kmeans_k_matrix_score)

In [20]:
# The Silhouette score cannot be calculated when a clustering ends with a single
# cluster for all samples or with one cluster per sample. Such cases are marked
# with the placeholder 9999 and are excluded here.

def delete_9999(lst):
    '''Return a new list with every 9999 placeholder entry removed.'''
    return list(filter(lambda score: score != 9999, lst))

# Drop the 9999 placeholders so that only real Silhouette scores are averaged.
ward_matrix_score = delete_9999(ward_matrix_score)
complete_matrix_score = delete_9999(complete_matrix_score)
kmeans_ward_matrix_score = delete_9999(kmeans_ward_matrix_score)
kmeans_k_matrix_score = delete_9999(kmeans_k_matrix_score)

# Mean score per algorithm. The fixed-k label says k=10 because the clustering
# cell runs KMeans with 10 clusters; keep the two in sync.
for label, user_scores in (
    ('Silhouette score for Ward algorithm:', ward_matrix_score),
    ('Silhouette score for Complete-link algorithm:', complete_matrix_score),
    ('Silhouette score for KMeans based on Ward algorithm:', kmeans_ward_matrix_score),
    ('Silhouette score for KMeans with k=10 algorithm:', kmeans_k_matrix_score),
):
    # An empty list has no mean; report nan explicitly instead of relying on
    # numpy's warning-and-nan behaviour.
    mean_score = np.mean(user_scores) if len(user_scores) > 0 else np.nan
    print(label, round(mean_score, 3))

Silhouette score for Ward algorithm: 0.217
Silhouette score for Complete-link algorithm: 0.221
Silhouette score for KMeans based on Ward algorithm: 0.231
Silhouette score for KMeans with k=5 algorithm: 0.124


In [21]:
# Number of Ward scores left after the 9999 placeholders were removed.
len(ward_matrix_score)

108

In [13]:
# Number of complete-link scores currently in the list.
# NOTE(review): this cell carries an earlier execution count than the
# clustering cell, so its recorded output is presumably stale -- re-run it
# after the clustering and filtering cells.
len(complete_matrix_score)

0