In [133]:
# Data Processing Tools
import numpy as np
import scipy as scp
import pandas as pd

# Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

In [134]:
# Collections of every movie and every user (Movie/User are not imported in
# this notebook -- presumably models injected by the shell; TODO confirm).
movies, users = Movie.objects.all(), User.objects.all()

# Sanity check: how many movies and users are available.
print("{} {}".format(movies.count(), users.count()))

3883 200


In [135]:
# Dense rating matrix: one row per movie, one column per user, zero-filled
# until the ratings are written into it.
movies_data = np.zeros(shape=(movies.count(), users.count()))
# Preview the matrix through a DataFrame.
print(pd.DataFrame(data=movies_data))

      0    1    2    3    4    5    6    7    8    9    ...  190  191  192  \
0     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4     0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
3878  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3879  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3880  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3881  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3882  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

      193  194  195  196  197  198  199  
0     0.0  0.0  0.0  

In [136]:
# Fill the movie x user rating matrix.
#
# Rows and columns are addressed by *position* in the `movies` / `users`
# querysets instead of `id - 1`. Primary keys are not guaranteed to be
# contiguous or to start at 1, and the cell that later stores the cluster
# labels pairs label i with the i-th movie of the queryset, so positional
# rows keep both sides aligned. This also removes the hard-coded 3883 guard
# and the risk of an IndexError (or a silent negative-index wrap) on ids that
# fall outside the matrix.
user_col = {user.id: col for col, user in enumerate(users)}

# Rebuild the matrix here so the cell is idempotent: re-running it must not
# accumulate the same ratings a second time.
movies_data = np.zeros((len(movies), len(user_col)))

for row, movie in enumerate(movies):
    for rating in movie.ratings.all():
        # `user_id` reads the foreign-key value stored on the rating itself,
        # so no user object has to be loaded just to get its key
        # (assumes `user` is a ForeignKey -- confirm against the model).
        col = user_col.get(rating.user_id)
        if col is None:
            # Rating from a user that is not part of `users`; no column for it.
            continue
        # Plain assignment: one cell holds one rating, never a sum of ratings.
        movies_data[row, col] = rating.rating

print(movies_data)

[[5. 0. 0. ... 5. 0. 0.]
 [0. 0. 0. ... 3. 0. 0.]
 [0. 0. 0. ... 0. 3. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [172]:
import random


def kmeans_custom_clustering_movies(k, iters, data=None, seed=None):
    """Cluster the rows of a rating matrix with a hand-written k-means.

    Each round assigns every row to the centroid with the smallest squared
    Euclidean distance and then moves every centroid to the mean of its
    members (rounded to 4 decimals).

    Parameters
    ----------
    k : int
        Number of clusters; must be at least 1.
    iters : int
        Maximum number of assign/update rounds. The loop stops earlier once
        the assignment no longer changes, because further rounds would
        reproduce the same result. With 0 rounds every label stays -1.
    data : array-like of shape (n_items, n_features), optional
        Matrix to cluster. Defaults to the notebook-level ``movies_data``.
    seed : int, optional
        Seed for the random centroid initialisation. When omitted, NumPy's
        global random state is used, so results vary between runs unless
        that state was seeded by the caller.

    Returns
    -------
    numpy.ndarray
        1-D integer array with the cluster index (0 .. k-1) of every row.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` is not two-dimensional.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))

    if data is None:
        data = movies_data
    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError("data must be a 2-D matrix, got {} dimension(s)".format(data.ndim))
    n_items, n_features = data.shape

    # Random integer centroids in 0..4, one row per cluster.
    rng = np.random if seed is None else np.random.RandomState(seed)
    centroids = rng.randint(5, size=(k, n_features)).astype(float)

    clustering_data = np.full(n_items, -1)
    distances = np.empty((n_items, k))

    for _ in range(iters):
        # Assignment step: squared Euclidean distance of every row to every
        # centroid. argmin always yields a valid cluster (first one on ties),
        # so no row can be left at -1 because of a too-small distance bound.
        for j in range(k):
            diff = data - centroids[j]
            distances[:, j] = (diff * diff).sum(axis=1)
        new_labels = distances.argmin(axis=1)

        if np.array_equal(new_labels, clustering_data):
            # Converged: the centroids already match this assignment.
            break
        clustering_data[:] = new_labels

        # Update step: move each centroid to the mean of its members.
        for j in range(k):
            members = data[clustering_data == j]
            if len(members) == 0:
                # An empty cluster keeps its previous centroid instead of
                # collapsing to the all-zero vector.
                continue
            centroids[j] = np.round(members.mean(axis=0), 4)

    return clustering_data

In [170]:
# Run the hand-written k-means: 7 clusters, 100 iterations requested.
clustering_data = kmeans_custom_clustering_movies(k=7, iters=100)

# Show the label vector next to its length.
print(clustering_data, len(clustering_data))


[4 0 3 ... 3 3 3] 3883


In [37]:
def movie_clustering(method, k, data=None):
    """Cluster the rating matrix with one of three scikit-learn models.

    Parameters
    ----------
    method : int
        0 -> K-Means (random init, ``random_state=0``),
        1 -> agglomerative (hierarchical) clustering with ward linkage,
        2 -> Gaussian mixture fitted with EM (random init, ``random_state=0``,
             at most 100 iterations).
    k : int
        Number of clusters / mixture components.
    data : array-like of shape (n_items, n_features), optional
        Matrix to cluster. Defaults to the notebook-level ``movies_data``.

    Returns
    -------
    numpy.ndarray
        Cluster label of every row of the matrix.

    Raises
    ------
    ValueError
        If ``method`` is not 0, 1 or 2.
    """
    # Local import keeps this definition self-contained; `warnings` is stdlib.
    import warnings

    # Reject unknown selectors up front instead of failing later with an
    # UnboundLocalError on the return statement.
    if method not in (0, 1, 2):
        raise ValueError(
            "unknown clustering method {!r}; expected 0 (K-Means), "
            "1 (hierarchical) or 2 (EM)".format(method)
        )

    if data is None:
        data = movies_data

    # K-Means
    if method == 0:
        model = KMeans(n_clusters=k, init="random", random_state=0)
        return model.fit_predict(data)

    # Hierarchy
    if method == 1:
        # Ward linkage only works with Euclidean distances, which is also the
        # default, so the distance argument is omitted. Its keyword was renamed
        # from `affinity` to `metric` across scikit-learn releases; leaving it
        # out works with both spellings.
        model = AgglomerativeClustering(n_clusters=k, linkage="ward")
        return model.fit_predict(data)

    # EM
    model = GaussianMixture(n_components=k, init_params="random", random_state=0, max_iter=100)
    # Silence only the convergence warning, and only while fitting. The stdlib
    # context manager replaces sklearn.utils.testing.ignore_warnings, a testing
    # helper whose module was removed from later scikit-learn releases.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        model.fit(data)
    return model.predict(data)

In [127]:
# Method 0 = scikit-learn K-Means, 7 clusters.
kmeans_data = movie_clustering(method=0, k=7)
kmeans_data

array([5, 6, 4, ..., 4, 4, 4])

In [128]:
# Method 1 = agglomerative (hierarchical) clustering, 7 clusters.
hr_data = movie_clustering(method=1, k=7)
hr_data

array([4, 6, 3, ..., 3, 3, 3], dtype=int64)

In [129]:
# Method 2 = Gaussian mixture fitted with EM, 7 components.
em_data = movie_clustering(method=2, k=7)
em_data

array([1, 6, 4, ..., 1, 2, 3], dtype=int64)

In [42]:
# Store the K-Means label of every movie on its database row.
#
# Label i is paired with the i-th movie of the `movies` queryset, exactly as
# the previous `movies[i]` indexing did; iterating once avoids a separate
# lookup per index when the queryset has not been evaluated yet.
if len(kmeans_data) != len(movies):
    # Fail before writing anything rather than stopping halfway through.
    raise ValueError(
        "got {} cluster labels for {} movies".format(len(kmeans_data), len(movies))
    )

for movie, label in zip(movies, kmeans_data):
    # Cast the NumPy scalar to a plain int before handing it to the model.
    movie.cluster = int(label)
    # Write only the cluster column so other fields of a possibly stale
    # in-memory object are not written back.
    movie.save(update_fields=["cluster"])

In [124]:
# Every movie whose stored cluster label is 4 (an exact match is the default lookup).
related_movies = movies.filter(cluster=4)

# Print one title per line.
for movie in related_movies:
    print(movie.title)

Grumpier Old Men (1995)
Waiting to Exhale (1995)
Father of the Bride Part II (1995)
Tom and Huck (1995)
Sudden Death (1995)
Dracula: Dead and Loving It (1995)
Balto (1995)
Nixon (1995)
Ace Ventura: When Nature Calls (1995)
Money Train (1995)
Assassins (1995)
Othello (1995)
Now and Then (1995)
Persuasion (1995)
Dangerous Minds (1995)
Wings of Courage (1995)
Across the Sea of Time (1995)
It Takes Two (1995)
Cry, the Beloved Country (1995)
Richard III (1995)
Dead Presidents (1995)
Restoration (1995)
How to Make an American Quilt (1995)
When Night Is Falling (1995)
Guardian Angel (1994)
Lamerica (1994)
Big Green, The (1995)
Georgia (1995)
Kids of the Round Table (1995)
Home for the Holidays (1995)
Confessional, The (Le Confessionnal) (1995)
Eye for an Eye (1996)
Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)
Two if by Sea (1996)
Bio-Dome (1996)
Lawnmower Man 2: Beyond Cyberspace (1996)
Two Bits (1995)
French Twist (Gazon maudit) (1995)
Friday (1995)
Fair Ga

Relic, The (1997)
Island of Dr. Moreau, The (1996)
First Kid (1996)
Trigger Effect, The (1996)
Sweet Nothing (1995)
Bogus (1996)
Bulletproof (1996)
Talk of Angels (1998)
Land Before Time III: The Time of the Great Giving (1995)
1-900 (1994)
Baton Rouge (1988)
Twelfth Night (1996)
Mother Night (1996)
Liebelei (1933)
Venice/Venice (1992)
Wild Reeds (1994)
For Whom the Bell Tolls (1943)
Philadelphia Story, The (1940)
Singin' in the Rain (1952)
American in Paris, An (1951)
Funny Face (1957)
Breakfast at Tiffany's (1961)
Vertigo (1958)
It Happened One Night (1934)
Gaslight (1944)
Gay Divorcee, The (1934)
North by Northwest (1959)
Meet Me in St. Louis (1944)
Rebecca (1940)
Foreign Correspondent (1940)
Affair to Remember, An (1957)
Ninotchka (1939)
Gigi (1958)
Adventures of Robin Hood, The (1938)
Ghost and Mrs. Muir, The (1947)
Lost Horizon (1937)
To Be or Not to Be (1942)
My Man Godfrey (1936)
Giant (1956)
East of Eden (1955)
Thin Man, The (1934)
His Girl Friday (1940)
Around the World in 80

Eve's Bayou (1997)
Stripes (1981)
Mad City (1997)
Tango Lesson, The (1997)
Deceiver (1997)
Boogie Nights (1997)
Witness (1985)
Incognito (1997)
Starship Troopers (1997)
Critical Care (1997)
Chairman of the Board (1998)
Mortal Kombat: Annihilation (1997)
Truman Show, The (1998)
Wings of the Dove, The (1997)
I Love You, I Love You Not (1996)
Jackal, The (1997)
Man Who Knew Too Little, The (1997)
Alien Escape (1995)
Amistad (1997)
Big Bang Theory, The (1994)
Boys, Les (1997)
Deconstructing Harry (1997)
Flubber (1997)
For Richer or Poorer (1997)
Good Will Hunting (1997)
Harlem River Drive (1996)
Ill Gotten Gains (1997)
Legal Deceit (1997)
Midnight in the Garden of Good and Evil (1997)
Mouse Hunt (1997)
Never Met Picasso (1996)
Office Killer (1997)
Other Voices, Other Rooms (1997)
Scream 2 (1997)
Stranger in the House (1997)
Sweet Hereafter, The (1997)
Twisted (1996)
Education of Little Tree, The (1997)
Horse Whisperer, The (1998)
Jackie Brown (1997)
Mr. Magoo (1997)
Big Lebowski, The (1998

Bats (1999)
Best Man, The (1999)
Crazy in Alabama (1999)
Three to Tango (1999)
Body Shots (1999)
Brother, Can You Spare a Dime? (1975)
Guardian, The (1990)
Melvin and Howard (1980)
For Your Eyes Only (1981)
Licence to Kill (1989)
Live and Let Die (1973)
Rawhead Rex (1986)
Thunderball (1965)
City, The (1998)
House on Haunted Hill, The (1999)
Music of the Heart (1999)
Being John Malkovich (1999)
Dreaming of Joseph Lees (1998)
Man of the Century (1999)
Princess Mononoke, The (Mononoke Hime) (1997)
Suburbans, The (1999)
My Best Fiend (Mein liebster Feind) (1999)
Train of Life (Train De Vie) (1998)
Bone Collector, The (1999)
Insider, The (1999)
American Movie (1999)
Last Night (1998)
Bride of Re-Animator (1990)
General, The (1927)
My Best Girl (1927)
Rough Night in Jericho (1967)
Slaughterhouse (1987)
Taming of the Shrew, The (1967)
Nighthawks (1981)
Yojimbo (1961)
Repossessed (1990)
Omega Man, The (1971)
Spaceballs (1987)
Little Big Man (1970)
Face in the Crowd, A (1957)
Trading Places (19

In [98]:
# Scratch check of the Euclidean distance formula on two small vectors.
a = np.arange(1, 6)       # 1, 2, 3, 4, 5
b = np.arange(2, 11, 2)   # 2, 4, 6, 8, 10
# Square root of the summed squared differences.
print(np.sum((a - b) ** 2) ** 0.5)

7.416198487095663
