In [None]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.cluster.hierarchy import dendrogram

In [None]:
df = pd.read_csv('vendors_cleaned.csv', index_col=[0])

In [None]:
df.head()


## K-means clustering
### chosing k using 'elbow test'

In [None]:
# data-structure to store Sum-Of-Square-Errors
sse = {}
# Looping over multiple values of k from 1 to 15
for k in range(1, 15):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100).fit(df)
    df["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
# Plotting the curve with 'k'-value vs SSE
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.savefig('elbow_method.png')

Sharp drop of sum of squares error was achieved with k=6

In [None]:
best_k = 6
kmeans = KMeans(n_clusters=best_k, init='k-means++', max_iter=100).fit(df)
df["clusters"] = kmeans.labels_

In [None]:
df['clusters'].value_counts()

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(df.loc[df['clusters'] == 0].corr())

In [None]:
df.loc[df['clusters'] == 0].describe()


## Aglomerative clustering
Using ward measure as linkage criterion, euclidean distance as metric used to compute linkage

In [None]:
agglo_clustering = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(df)
df['agglomerative'] = agglo_clustering.labels_

In [None]:
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    plt.figure(figsize=(20,10))
    dendrogram(linkage_matrix, **kwargs)
    plt.savefig('dendogram.png')

In [None]:
plot_dendrogram(agglo_clustering)

In [None]:
plot_dendrogram(agglo_clustering, truncate_mode='level', p=3)

In [97]:
agglo_clustering = AgglomerativeClustering(n_clusters=7).fit(df)
df['agglomerative'] = agglo_clustering.labels_

In [None]:
df['agglomerative'].value_counts()

In [None]:
def predict_top_n_items_for_item_id(item_id: int, n: int, model: str):
    item_id_row = df.loc[df['id'] == item_id]
    
    if model == "Kmeans":
        cluster_id = item_id_row['clusters'].to_numpy()[0]
        items_with_cluster_id = df.loc[df['clusters'] == cluster_id]
    elif model == "Agglomerative":
        cluster_id = item_id_row['agglomerative'].to_numpy()[0]
        items_with_cluster_id = df.loc[df['agglomerative'] == cluster_id]
    similarities = {}
    for id, item in items_with_cluster_id.iterrows():
        similarities[item['id']] = cosine_similarity(item_id_row.to_numpy().reshape(1, -1), item.to_numpy().reshape(1, -1))
    similarities_sorted = {k: v for k, v in sorted(similarities.items(), key=lambda item: item[1], reverse=True)[1:n+1]}
    result = {k: v.reshape(1,)[0] for k, v in similarities_sorted.items()}
    return result
    

In [None]:
list_predicted_kmeans = predict_top_n_items_for_item_id(13, n=15, model="Kmeans")

In [None]:
predicted_ids = list_predicted_kmeans.keys()
print(predicted_ids)
df.loc[df['id'].isin(predicted_ids)]


In [None]:
list_predicted_aglo = predict_top_n_items_for_item_id(13, n=15, model="Agglomerative")

In [None]:
predicted_ids = list_predicted_aglo.keys()
print(predicted_ids)
df.loc[df['id'].isin(predicted_ids)]


In [None]:
test_user_orders = pd.read_csv('test_user_orders.csv')

In [101]:
from typing import List, Tuple

def validate_recommendations(method: str, N: int, user_ids: List[int]) -> Tuple[int, int]:
    precision_at_N = []
    recall_at_N = []
    
    for user_id in user_ids:
        vendor_list = test_user_orders.loc[test_user_orders['customer_id'] == user_id]['vendor_id'].to_list()
        half = int(len(vendor_list) / 2) + 1
        train_vendors = vendor_list[:half]
        test_vendors = vendor_list[half:]
        
        all_recommended_vendors = []
        for vendor in train_vendors:
            predicted = predict_top_n_items_for_item_id(vendor, N, method)
            predicted = [[k, v] for k, v in predicted.items()]
            all_recommended_vendors.extend(predicted)
        # print(all_recommended_vendors)
        all_rec_filtered = [[id, sim] for [id, sim] in all_recommended_vendors if id not in train_vendors]
        all_rec_filtered.sort(key=lambda x: x[1], reverse=True)
        all_rec_unique = [item for n, item in enumerate(all_rec_filtered) if item[0] not in [i[0] for i in all_rec_filtered[:n]]]
        all_rec_unique_ids = [item[0] for item in all_rec_unique]
        
        n_relevant = len(list(set(test_vendors).intersection(all_rec_unique_ids)))
        n_recommended = len(all_rec_unique)
        precision = n_relevant / n_recommended
        recall = n_relevant/ (100 - len(train_vendors))
            
        precision_at_N.append(precision)
        recall_at_N.append(recall)
    
    
    # print(precision_at_N)
    # print(recall_at_N)
    f_p = sum(precision_at_N) / len(precision_at_N)
    f_r = sum(recall_at_N) / len(recall_at_N)
    
    print(f'Precision at {N}: {round(f_p, 3)}')
    print(f'Recall at {N}: {round(f_r, 3)}')
    return f_p, f_r
    

In [89]:
test_user_ids = test_user_orders['customer_id'].unique().tolist()

In [104]:
validate_recommendations('Kmeans', 20, test_user_ids)


Precision at 20: 0.039
Recall at 20: 0.011


(0.039123510920686784, 0.01068405399222146)

In [105]:

validate_recommendations('Agglomerative', 20, test_user_ids)

Precision at 20: 0.041
Recall at 20: 0.012


(0.04102290173842331, 0.011759322809425762)