In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches

In [2]:
# Load the two source tables, ratings first and then the movie catalogue.
ratings, movies = (pd.read_csv(csv_name) for csv_name in ('ratings.csv', 'movies.csv'))

I'll do some simple exploratory analysis to get my head around the data.

In [3]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Number of rating rows per movie, most-rated first.
movie_count = ratings.groupby('movieId').size().sort_values(ascending=False)
# movieIds of the ten most-rated movies.
top10_count = movie_count.head(10).index.tolist()

# The ranking is by how many ratings a movie received, not by rating value,
# so the label says so explicitly.
print('Top 10 movies by number of ratings')
# NOTE: the boolean filter keeps the row order of `movies`, so the table below
# lists the top-10 set but is not sorted by rank.
movies[movies['movieId'].isin(top10_count)][['movieId', 'title']]

Top 10 movies by rating


Unnamed: 0,movieId,title
97,110,Braveheart (1995)
224,260,Star Wars: Episode IV - A New Hope (1977)
257,296,Pulp Fiction (1994)
277,318,"Shawshank Redemption, The (1994)"
314,356,Forrest Gump (1994)
418,480,Jurassic Park (1993)
461,527,Schindler's List (1993)
507,589,Terminator 2: Judgment Day (1991)
510,593,"Silence of the Lambs, The (1991)"
1939,2571,"Matrix, The (1999)"


#### Start by building a baseline using popularity (simple, works, might not be as accurate)

In [6]:
def top_popular(n=10):
    """Return the ``n`` most-rated movies as a popularity baseline.

    Movies are ranked by number of ratings, then by mean rating, both
    descending. Reads the module-level ``ratings`` and ``movies`` frames.

    Returns a DataFrame with columns ``movieId``, ``title``, ``count``, ``avg``.
    """
    per_movie = ratings.groupby('movieId')
    stats = pd.concat(
        [
            per_movie.size().rename('count'),
            per_movie['rating'].mean().rename('avg'),
        ],
        axis=1,
    ).reset_index()

    # Attach titles, then rank: most ratings first, mean rating as tie-breaker.
    ranked = (
        stats
        .merge(movies[['movieId', 'title']], on='movieId', how='left')
        .sort_values(by=['count', 'avg'], ascending=[False, False])
    )

    return ranked.head(n)[['movieId', 'title', 'count', 'avg']]

In [7]:
# Show the five most-rated movies from the popularity baseline.
top_popular(n=5)

Unnamed: 0,movieId,title,count,avg
314,356,Forrest Gump (1994),329,4.164134
277,318,"Shawshank Redemption, The (1994)",317,4.429022
257,296,Pulp Fiction (1994),307,4.197068
510,593,"Silence of the Lambs, The (1991)",279,4.16129
1938,2571,"Matrix, The (1999)",278,4.192446


### Collaborative filtering (cosine on user vectors)
I decided to use collaborative filtering because it focuses on behavior patterns—ratings, watch history, preferences—instead of movie characteristics or genres, i.e. "Which users are most similar to me, and what did they like that I haven't seen yet?"  If two users rate many of the same movies similarly, they're considered "neighbors."  Those neighbors' opinions can be used to recommend new movies.  The implementation below applies the same idea from the movie side (item-item): two movies count as similar when the same users rate them similarly.


##### Cosine similarities

I will also need to center the ratings because some users rate really high or really low, and this can help reduce some bias.
The cosine similarity is based on the centered ratings from all users. If it's a high cosine, then there are similar taste patterns across the users.

I didn't know which movies were in here, so I added a bonus fallback to just use basic popularity in case they picked a movie that wasn't available to use as a recommendation seed.

In [8]:
# Users x movies table of ratings; NaN marks a movie the user has not rated.
user_movie = ratings.pivot_table(values='rating', index='userId', columns='movieId')
# Remember which cells held a real rating before anything gets filled in.
user_movie_mask = user_movie.notna()

# Subtract each user's own mean so generous and harsh raters are comparable.
user_means = user_movie.mean(axis=1)
user_movie_centered = user_movie.sub(user_means, axis=0)
# Flip to movies x users; unrated cells become 0, which means "no signal".
item_user_matrix = user_movie_centered.T.fillna(0)

# NOTE: a dense movies x movies matrix is fine for this small dataset but would
# probably become too slow on a much larger one.
item_ids = item_user_matrix.index.tolist()
sim_matrix = pd.DataFrame(
    data=cosine_similarity(item_user_matrix),
    columns=item_ids,
    index=item_ids,
)

#### I need a function to find the movie title in the data set (also using close matches in case they misspell or don't type it perfectly)

There are probably smarter ways to do this, but I just went with an exact match, then a substring match, then a fuzzy match.  Since there is also a popularity fallback for titles that can't be found, this is probably overkill for this assignment. I mistyped movie titles a ~~few~~ so, so many times while testing, so I figured it would be pretty easy to just use the popular recommender as a fallback.

In [9]:
# Lower-cased title -> movieId lookup; a later duplicate title overwrites an earlier one.
title_to_id = dict(
    (raw_title.lower(), movie_id)
    for movie_id, raw_title in zip(movies['movieId'], movies['title'])
)
# Flat list of the lower-cased titles, used for substring and fuzzy search.
lowercase_titles = [*title_to_id]

def find_movie_id(query, cutoff=0.55):
    """Resolve a free-text movie title to a movieId.

    Matching is case-insensitive and tried in order of strictness:

    1. exact title match,
    2. substring match (shortest title wins, ties broken alphabetically),
    3. fuzzy match via ``difflib.get_close_matches``.

    Parameters
    ----------
    query : str
        Title as typed by the user; surrounding whitespace is ignored.
    cutoff : float, default 0.55
        Minimum similarity ratio accepted by the fuzzy match.

    Returns
    -------
    The movieId stored in ``title_to_id`` for the best match, or ``None`` when
    the query is empty, is not a string, or matches nothing.
    """
    if not isinstance(query, str):
        return None

    q = query.strip().lower()
    # An empty string is a substring of every title, so without this guard a
    # blank query would "match" the shortest title instead of falling through.
    if not q:
        return None

    if q in title_to_id:
        return title_to_id[q]

    substr = [t for t in lowercase_titles if q in t]
    if substr:
        # Prefer the shortest containing title; alphabetical order breaks ties.
        best = min(substr, key=lambda s: (len(s), s))
        return title_to_id[best]

    close = get_close_matches(q, lowercase_titles, n=1, cutoff=cutoff)
    if close:
        return title_to_id[close[0]]

    return None

#### This function should take a movie, find it in the matrix, then get anything similar

In [10]:
def recommend_similar_movies(movie_title, top_n=10, exclude_self=True):
    """Recommend the movies most similar to ``movie_title``.

    The title is resolved with ``find_movie_id`` and neighbours are read from
    the precomputed ``sim_matrix``. If the title cannot be resolved, or the
    resolved movie has no row in ``sim_matrix``, the popularity baseline
    ``top_popular(top_n)`` is returned instead and a notice is printed.

    Parameters
    ----------
    movie_title : str
        Title typed by the user.
    top_n : int, default 10
        Number of recommendations to return.
    exclude_self : bool, default True
        Drop the seed movie from its own neighbour list.

    Returns
    -------
    DataFrame with columns ``movieId, title, genres, similarity`` on the
    similarity path, or ``movieId, title, count, avg`` on the fallback path.
    """
    mid = find_movie_id(movie_title)

    if mid is None:
        print(f"No good match found for '{movie_title}'. Showing popular movies instead.")
        return top_popular(top_n)

    if mid not in sim_matrix.index:
        # The title resolved to a catalogue entry, but that movie has no row in
        # the similarity matrix, so say that rather than claiming no match.
        print(f"Found '{movie_title}' but it has no ratings to compare against. Showing popular movies instead.")
        return top_popular(top_n)

    sims = sim_matrix.loc[mid].sort_values(ascending=False)
    if exclude_self and mid in sims.index:
        sims = sims.drop(index=mid)

    top = sims.head(top_n).rename_axis('movieId').reset_index(name='similarity')
    top = top.merge(movies[['movieId', 'title', 'genres']], on='movieId', how='left')

    return top[['movieId', 'title', 'genres', 'similarity']]

#### Test both a match and the popular recommenders

In [13]:
movie = input("Type a movie you like: ")

recs = recommend_similar_movies(movie, top_n=10)

# recommend_similar_movies returns the popularity baseline when the title cannot
# be used as a seed; only the similarity result has a 'similarity' column, so
# its presence tells us which header is accurate.
rec_source = "item-item CF" if 'similarity' in recs.columns else "popularity fallback"
print(f"\nRecommendations ({rec_source}):")
print(recs.to_string(index=False))

Type a movie you like:  Braveheart



Recommendations (item-item CF):
 movieId                                     title                           genres  similarity
     589         Terminator 2: Judgment Day (1991)                    Action|Sci-Fi    0.342890
     356                       Forrest Gump (1994)         Comedy|Drama|Romance|War    0.337623
     457                      Fugitive, The (1993)                         Thriller    0.318631
     318          Shawshank Redemption, The (1994)                      Crime|Drama    0.316303
    2571                        Matrix, The (1999)           Action|Sci-Fi|Thriller    0.249881
     480                      Jurassic Park (1993) Action|Adventure|Sci-Fi|Thriller    0.246818
    2028                Saving Private Ryan (1998)                 Action|Drama|War    0.245672
      32 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)          Mystery|Sci-Fi|Thriller    0.244681
     778                      Trainspotting (1996)               Comedy|Crime|Drama    0.236235
    232

In [12]:
movie = input("Type a movie you like: ")

recs = recommend_similar_movies(movie, top_n=10)

# recommend_similar_movies returns the popularity baseline when the title cannot
# be used as a seed; only the similarity result has a 'similarity' column, so
# its presence tells us which header is accurate.
rec_source = "item-item CF" if 'similarity' in recs.columns else "popularity fallback"
print(f"\nRecommendations ({rec_source}):")
print(recs.to_string(index=False))

Type a movie you like:  Totally borked not-real movie


No good match found for 'Totally borked not-real movie'. Showing popular movies instead.

Recommendations (item-item CF):
 movieId                                     title  count      avg
     356                       Forrest Gump (1994)    329 4.164134
     318          Shawshank Redemption, The (1994)    317 4.429022
     296                       Pulp Fiction (1994)    307 4.197068
     593          Silence of the Lambs, The (1991)    279 4.161290
    2571                        Matrix, The (1999)    278 4.192446
     260 Star Wars: Episode IV - A New Hope (1977)    251 4.231076
     480                      Jurassic Park (1993)    238 3.750000
     110                         Braveheart (1995)    237 4.031646
     589         Terminator 2: Judgment Day (1991)    224 3.970982
     527                   Schindler's List (1993)    220 4.225000
