In [13]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import math

Load movie dataset

In [14]:
# load dataset
df_movie = pd.read_csv("./data/movie_genres.csv")
df_user = pd.read_csv("./data/user_reviews.csv")

In [15]:
# movie categories
df_movie

Unnamed: 0.1,Unnamed: 0,movie_title,genre_action,genre_adventure,genre_animation,genre_biography,genre_comedy,genre_crime,genre_documentary,genre_drama,...,genre_mystery,genre_news,genre_reality-tv,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_war,genre_western
0,0,The Net,1,0,0,0,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
1,1,Happily N'Ever After,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,Tomorrowland,1,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,3,American Hero,1,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,4,Das Boot,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,Big Fish,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1996,1996,Get Real,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1997,1997,Trading Places,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,1998,DOA: Dead or Alive,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
"""
Ratings for each movie and user combination
Remove unneeded 'Unnamed' column and map user name to an id as the row indices
"""
# user to id mapping
user_ids = {name: i for (i, name) in enumerate(df_user["User"])}
movie_ids = {name: i for (i, name) in enumerate(df_movie["movie_title"])}

# drop unnamed and user columns
df_user = df_user.drop(["Unnamed: 0", "User"], axis=1)

# convert to sparse matrix to save memory
mat_movie_features = csr_matrix(df_user.values)

In [17]:
def reverse_map_movie(idx: int) -> str:
    keys = list(movie_ids.keys())
    values = list(movie_ids.values())

    pos = values.index(idx)
    return keys[pos]

def reverse_map_user(idx: int) -> str:
    keys = list(user_ids.keys())
    values = list(user_ids.values())

    pos = values.index(idx)
    return keys[pos]

In [18]:
df_user = df_user.transpose()
df_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,590,591,592,593,594,595,596,597,598,599
The Net,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
Happily N'Ever After,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tomorrowland,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
American Hero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Das Boot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Big Fish,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Get Real,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Trading Places,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DOA: Dead or Alive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# training data
X = df_user.to_numpy()

In [20]:
#make an object for the NearestNeighbors Class.
model_knn = NearestNeighbors(metric='minkowski', algorithm='brute', n_neighbors=10, n_jobs=-1)
model_knn.fit(X)

NearestNeighbors(algorithm='brute', n_jobs=-1, n_neighbors=10)

In [21]:
def make_recommendation(model_knn, data, fav_movie, n_recommendations):
    # fit
    model_knn.fit(data)
    # get input movie index
    idx = movie_ids[fav_movie]
    print(idx, fav_movie)

    distances, indices = model_knn.kneighbors(
        np.array([data[idx]]), n_neighbors=n_recommendations + 1
    )

    raw_recommends = sorted(
        list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())),
        key=lambda x: x[1],
    )[:0:-1]
    recommends = [(reverse_map_movie(idx), dist) for idx, dist in raw_recommends]

    return recommends


In [30]:
# make recommendations
users = ["Vincent", "Edgar", "Addilyn", "Marlee", "Javier"]
uids = [user_ids[name] for name in users]  # reverse map user names to ids
favs = [
    df_user[uid].nlargest(n=1).index[0] for uid in uids
]  # find favorite movie of user
user_fav_map = dict(zip(users, favs))

recommends = {
    user: make_recommendation(
        model_knn=model_knn,
        data=X,
        fav_movie=fav_movie,
        n_recommendations=10,
    )
    for user, fav_movie in zip(users, favs)
}


324 The Importance of Being Earnest
147 The Lovely Bones
172 The Magic Sword: Quest for Camelot
116 Now You See Me 2
806 The Incredibles


In [31]:
# filter recommendations for already rated movies and trim to five items
filtered = {}
for user, results in recommends.items():
    uid = user_ids[user]
    results = list(
        filter(lambda result: math.isclose(df_user[uid][result[0]], 0.0), results)
    )  # filter already rated movies
    results = results[:5] if len(results) >= 5 else results # trim to five items
    
    filtered[user] = results

In [33]:
for user, results in filtered.items():
    print(f"User: {user}")
    print(f"Favorite movie: {user_fav_map[user]}")
    print("Recommendations:")
    for i, (title, dist) in enumerate(results):
        print(f"{i+1}: {title}")
    print()


User: Vincent
Favorite movie: The Importance of Being Earnest
Recommendations:
1: Goal! The Dream Begins
2: Tarnation
3: Pale Rider
4: The Final Destination
5: Frenzy

User: Edgar
Favorite movie: The Lovely Bones
Recommendations:
1: Frenzy
2: The Ballad of Gregorio Cortez
3: The Men Who Stare at Goats
4: 12 Rounds
5: America Is Still the Place

User: Addilyn
Favorite movie: The Magic Sword: Quest for Camelot
Recommendations:
1: The Man in the Iron Mask
2: Niagara
3: One Missed Call
4: Bleeding Hearts
5: Ida

User: Marlee
Favorite movie: Now You See Me 2
Recommendations:
1: Treasure Planet
2: I Am Love
3: Pitch Black
4: Pretty Woman
5: Diamond Ruff

User: Javier
Favorite movie: The Incredibles
Recommendations:
1: Frenzy
2: The Final Destination
3: The Men Who Stare at Goats
4: The Ballad of Gregorio Cortez
5: 12 Rounds



In [438]:
# test if they users havent seen the movies yet
for user, results in filtered.items():
    for i, (title, dist) in enumerate(results):
        uid = user_ids[user]
        score = df_user[uid][title]
        assert math.isclose(score, 0.0), f"{user} already rated '{title}'"

User: Vincent
User: Edgar
User: Addilyn
User: Marlee
User: Javier
