In [1]:
# Data Processing Tools
import numpy as np
import pandas as pd

# Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [2]:
# Handles on every Movie, User and Profile record. The `.objects.all()` calls
# look like the Django ORM; the model classes are not imported in the visible
# cells, so they are presumably injected by the notebook environment.
movies = Movie.objects.all()
users = User.objects.all()
profiles = Profile.objects.all()

# Sanity check: record counts for movies, users and profiles, in that order.
print(*(queryset.count() for queryset in (movies, users, profiles)))

3883 200 200


In [3]:
# Rating matrix: one row per user, one column per movie, initialised to 0.0.
users_data = np.zeros(shape=(users.count(), movies.count()), dtype=float)
print(users_data)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
# Fill the user x movie rating matrix.
# Reset it first so that re-running this cell does not add every rating a
# second time on top of the previous run.
users_data.fill(0)

# Column bound taken from the matrix itself instead of a hard-coded movie count.
movie_columns = users_data.shape[1]

for user in users:
    for rating in user.profile.ratings.all():
        # Movie ids beyond the matrix width have no column; skip them.
        if rating.movie.id > movie_columns:
            continue
        # Rows and columns are addressed by (id - 1); this assumes ids start at 1.
        users_data[user.id - 1][rating.movie.id - 1] += rating.rating

print(users_data)

[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 3. 0. ... 0. 0. 0.]
 [0. 0. 3. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [7]:
import random

def kmeans_custom_clustering_users(k, iters, seed=None):
    """Cluster the rows of the global ``users_data`` matrix with a hand-written k-means.

    Centroids start as random integer vectors with entries in ``0..4``. Each
    iteration assigns every row to the centroid with the smallest squared
    Euclidean distance (ties go to the lowest cluster index), then moves each
    non-empty cluster's centroid to the mean of its members, rounded to
    4 decimals. A cluster that ends up empty keeps its previous centroid.

    Args:
        k: Number of clusters; must be at least 1.
        iters: Number of assign/update rounds. With ``iters <= 0`` no
            assignment happens and every label stays ``-1``.
        seed: Optional seed for the centroid initialisation. ``None`` (the
            default) draws from NumPy's global random state.

    Returns:
        1-D integer array holding one cluster index per row of ``users_data``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))

    # Read the dimensions from the matrix that is actually clustered rather
    # than issuing extra count() queries.
    ul, ml = users_data.shape
    clustering_data = np.full(ul, -1)

    # Initialise k centroids randomly (reproducibly when a seed is given).
    rng = np.random if seed is None else np.random.RandomState(seed)
    centroids = rng.randint(5, size=(k, ml)).astype(float)

    for _ in range(iters):
        # Assignment step: squared Euclidean distance of every row to every
        # centroid, one (ul,) column per centroid. argmin always yields a
        # valid cluster, so no row can be left at -1 by a distance cut-off.
        distances = np.stack(
            [((users_data - centroid) ** 2).sum(axis=1) for centroid in centroids],
            axis=1,
        )
        clustering_data[:] = distances.argmin(axis=1)

        # Update step: move each centroid to the mean of its members.
        for cluster in range(k):
            members = users_data[clustering_data == cluster]
            if len(members) == 0:
                # Keep the old centroid instead of collapsing it to the origin.
                continue
            centroids[cluster] = np.round(members.sum(axis=0) / len(members), 4)

    return clustering_data

In [10]:
# Run the hand-written k-means: 7 clusters, 100 assign/update rounds.
clustering_data = kmeans_custom_clustering_users(k=7, iters=100)

print(clustering_data)

[2 2 2 5 2 2 2 2 2 4 2 5 2 5 3 5 0 4 4 5 6 4 0 2 2 4 2 2 2 5 2 5 4 2 2 4 2
 2 5 2 5 4 5 4 2 5 5 4 2 5 5 2 1 5 5 2 5 0 2 5 5 4 5 5 3 5 5 2 2 5 5 5 4 5
 2 2 5 2 5 2 2 3 5 2 5 5 2 2 5 3 2 4 2 6 2 2 2 5 2 2 2 5 3 5 2 2 2 5 2 5 2
 5 5 2 5 5 4 4 5 2 2 2 4 5 2 5 3 5 2 2 4 2 2 3 5 4 3 2 4 5 5 2 2 5 5 4 3 4
 0 4 0 6 5 5 2 2 0 5 5 5 4 2 3 5 2 4 5 5 1 2 5 5 1 2 4 2 2 2 2 5 4 2 2 2 5
 2 0 5 5 2 5 1 2 5 1 5 6 4 0 5]


In [8]:
def user_clustering(method, k):
    """Cluster the rows of the global ``users_data`` matrix with scikit-learn.

    Args:
        method: Which algorithm to run:
            ``'km'`` - K-Means with random initialisation,
            ``'hr'`` - agglomerative (hierarchical) clustering, ward linkage,
            ``'em'`` - Gaussian mixture fitted with EM.
        k: Number of clusters (mixture components for ``'em'``).

    Returns:
        Array with one cluster label per row of ``users_data``.

    Raises:
        ValueError: If ``method`` is not one of ``'km'``, ``'hr'``, ``'em'``.
    """
    # Fail early with a clear message instead of an UnboundLocalError at return.
    if method not in ('km', 'hr', 'em'):
        raise ValueError(
            "Unknown clustering method {!r}; expected 'km', 'hr' or 'em'".format(method)
        )

    # K-Means
    if method == 'km':
        # n_init is pinned to 10 so results do not depend on the installed
        # scikit-learn version's default.
        model = KMeans(n_clusters=k, init="random", n_init=10, random_state=0)
        model.fit(users_data)
        return model.predict(users_data)

    # Hierarchy
    if method == 'hr':
        # Ward linkage only supports Euclidean distance, which is the default,
        # so the deprecated `affinity` argument is not passed.
        model = AgglomerativeClustering(n_clusters=k, linkage='ward')
        return model.fit_predict(users_data)

    # EM
    import warnings

    model = GaussianMixture(n_components=k, init_params='random', random_state=0, max_iter=100)
    # Non-convergence within max_iter is tolerated here; silence only that warning.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        model.fit(users_data)
    return model.predict(users_data)

In [53]:
# K-Means labels for 7 clusters; the bare name below displays the array.
kmeans_data = user_clustering(method='km', k=7)
kmeans_data

array([1, 0, 1, 1, 4, 1, 1, 1, 1, 3, 1, 1, 0, 1, 0, 1, 3, 0, 0, 1, 1, 0,
       3, 4, 1, 0, 1, 4, 0, 1, 1, 1, 4, 4, 4, 0, 1, 1, 1, 1, 1, 3, 1, 0,
       4, 1, 1, 6, 0, 1, 1, 1, 5, 1, 1, 1, 1, 3, 4, 1, 1, 4, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 4, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 0, 1, 1, 1, 1, 6, 1, 1, 1, 0, 1, 0, 0, 4, 1,
       4, 0, 1, 4, 0, 1, 3, 1, 1, 1, 1, 1, 1, 4, 0, 6, 2, 4, 0, 1, 1, 1,
       1, 4, 0, 1, 1, 1, 2, 0, 0, 1, 1, 5, 1, 1, 6, 1, 1, 1, 2, 0, 4, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 4, 1, 1, 4, 1, 0, 4, 1, 2, 1, 1, 0,
       0, 1])

In [54]:
# Hierarchical (ward) labels for 7 clusters; the bare name displays the array.
hr_data = user_clustering(method='hr', k=7)
hr_data

array([6, 4, 6, 6, 4, 6, 4, 4, 6, 1, 6, 6, 6, 6, 4, 6, 4, 1, 1, 6, 6, 1,
       4, 6, 6, 4, 6, 6, 4, 6, 6, 6, 0, 6, 6, 1, 6, 6, 6, 6, 6, 4, 6, 1,
       4, 6, 6, 1, 4, 6, 6, 4, 5, 6, 6, 6, 6, 1, 6, 6, 6, 0, 6, 6, 4, 6,
       6, 6, 6, 6, 6, 6, 1, 6, 1, 6, 6, 6, 6, 6, 6, 4, 6, 6, 6, 6, 6, 6,
       6, 4, 6, 1, 4, 6, 4, 6, 6, 6, 6, 6, 4, 6, 4, 6, 4, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 1, 4, 6, 6, 6, 6, 0, 6, 4, 6, 4, 6, 4, 4, 6, 6,
       6, 4, 6, 0, 4, 6, 1, 6, 6, 6, 4, 6, 6, 0, 4, 1, 2, 0, 1, 6, 6, 6,
       6, 4, 1, 6, 6, 6, 0, 4, 4, 6, 6, 0, 6, 6, 1, 6, 6, 6, 0, 4, 0, 6,
       6, 6, 6, 6, 1, 6, 6, 6, 4, 6, 2, 6, 6, 6, 6, 1, 6, 6, 3, 6, 6, 1,
       1, 6], dtype=int64)

In [55]:
# Gaussian-mixture (EM) labels for 7 components; the bare name displays the array.
em_data = user_clustering(method='em', k=7)
em_data

array([1, 1, 6, 6, 3, 3, 3, 3, 6, 5, 2, 6, 5, 2, 5, 4, 6, 3, 2, 5, 3, 2,
       1, 3, 6, 2, 2, 4, 5, 6, 0, 2, 1, 1, 2, 6, 2, 4, 4, 2, 6, 5, 5, 0,
       2, 3, 5, 5, 2, 6, 5, 2, 4, 6, 4, 1, 3, 2, 2, 6, 1, 1, 4, 1, 3, 4,
       6, 5, 2, 5, 2, 6, 1, 1, 1, 2, 5, 5, 1, 6, 2, 2, 5, 3, 1, 0, 2, 3,
       3, 1, 5, 5, 4, 3, 1, 3, 2, 4, 2, 6, 0, 2, 5, 6, 0, 6, 6, 3, 4, 3,
       4, 5, 6, 3, 6, 1, 1, 6, 1, 3, 5, 3, 6, 4, 2, 3, 1, 0, 0, 2, 2, 6,
       6, 4, 1, 6, 6, 0, 3, 1, 4, 2, 3, 4, 1, 4, 1, 3, 6, 6, 4, 1, 3, 6,
       5, 3, 5, 5, 2, 5, 5, 5, 2, 2, 6, 0, 1, 5, 2, 5, 1, 5, 3, 1, 3, 2,
       4, 3, 2, 0, 5, 1, 2, 4, 3, 3, 3, 5, 3, 3, 2, 6, 1, 2, 3, 1, 0, 3,
       0, 1], dtype=int64)

In [30]:
# Persist each user's K-Means label on their profile.
# The rating matrix was filled at row (user.id - 1), so the label for a user
# lives at that same index of kmeans_data; looking it up by queryset position
# would mislabel users whenever the queryset is not ordered by id.
for user in users:
    profile = user.profile
    # Store a plain Python int rather than a NumPy integer scalar.
    profile.cluster = int(kmeans_data[user.id - 1])
    profile.save()

In [41]:
# Print the username of every profile stored in cluster 3
# (a lookup without a suffix is an exact match).
related_profiles = profiles.filter(cluster=3)
for profile in related_profiles:
    print(profile.user.username)

user10
user17
user23
user42
user58
user117
user139
