In [43]:
import pandas as pd

### Reading in movie data ###

In [44]:
# Downloaded data from https://files.grouplens.org/datasets/movielens/ml-25m.zip

In [45]:
# Load the movie catalogue (see the download link in the cell above for where the data came from)
movies = pd.read_csv("movies.csv")

In [46]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


### Using Regex to clean the movie titles ###

##### Stripping parentheses, punctuation and other special characters from the titles so that they are easier to search #####

In [47]:
import re

def clean_movie_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Args:
        title: Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns:
        The cleaned title, e.g. ``"Toy Story 1995"``.
    """
    # The range must be A-Z: "A-z" also spans the ASCII characters between
    # "Z" and "a" ([ \ ] ^ _ `), which would then survive the cleaning.
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)

In [48]:
# Create a new column "clean_title" that holds the cleaned version of each movie title,
# produced by applying clean_movie_title to every value in the "title" column.
# Note: this adds the column to the existing `movies` dataframe rather than creating a copy.

movies["clean_title"] = movies["title"].apply(clean_movie_title)

In [49]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2) indexes single words AND pairs of adjacent words, so "Toy Story 1995"
# contributes the pairs "Toy Story" and "Story 1995" in addition to each word on its own.
# The word pairs help make the search more accurate.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"]) # learns the vocabulary and turns every cleaned title into a row of TF-IDF weights

In [51]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_movie_title``, vectorized with the
    module-level ``vectorizer`` and compared against the module-level
    ``tfidf`` matrix using cosine similarity.

    Args:
        title: Free-text movie title typed by the user.
        top_n: Maximum number of rows to return (defaults to 5).

    Returns:
        The rows of ``movies`` for the ``top_n`` most similar titles, ordered
        from most to least similar. Empty if there is nothing to rank.
    """
    title = clean_movie_title(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises when kth is out of bounds.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the largest
    # similarities, in no particular order, so rank that small candidate set explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    # Creates a dataframe with the matching rows, most similar title first
    return movies.iloc[ranked]

In [52]:
import ipywidgets as widgets
from IPython.display import display

# Area below the text box where the matching movies get rendered
list_of_movies = widgets.Output()

# Free-text search box, pre-filled with an example title
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False,
)

def on_type(data):
    """Redraw the search results each time the text box value changes.

    `data` is the change dictionary handed over by the widget; the text
    currently in the box is stored under its "new" key.
    """
    with list_of_movies:
        list_of_movies.clear_output()
        title = data["new"]
        if len(title) <= 5:
            # Too short to search on yet: leave the output area empty
            return
        display(search(title))

# Call on_type whenever the `value` of the text box changes
movie_input.observe(on_type, names='value')

display(movie_input, list_of_movies)

Text(value='Toy Story', description='Movie Title:')

Output()

### Reading in movie ratings data ###

In [53]:
# Load the per-user movie ratings that the recommendations below are computed from
ratings = pd.read_csv("ratings.csv")

In [54]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [55]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Finding users who liked the same movie ###

In [56]:
movie_id = 1 # Hard-coded to movieId 1 ("Toy Story (1995)") for this walkthrough; similar_users below is derived from it

In [57]:
# Users with similar taste: everyone who rated this particular movie (movieId == movie_id)
# strictly higher than 4.0. Each user id appears once.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4.0)]["userId"].unique()

In [58]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

In [59]:
# Finding the movies that these users liked (rated above 4.0).
# This is one movieId per qualifying rating row, so a movie liked by many of these users
# appears many times; the selected movie itself is included as well.
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4.0)]["movieId"]

In [60]:
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [61]:
# Keep only the movies liked by more than 10% of the similar users
# (the users who rated the movie with movieId = 1 above 4).
# After the first line, similar_user_recs maps movieId -> fraction of similar users who liked it.
# NOTE: this cell overwrites its own input, so it is not safe to re-run on its own;
# re-run the previous cell first to rebuild the raw list of movie ids.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .1]  
similar_user_recs

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

### Finding how much all users like movies ###

In [62]:
# create a dataframe with every rating above 4 (from any user) given to the movies that are in similar_user_recs
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [63]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [64]:
# find the fraction of users who recommended each movie in similar_user_recs.
# The denominator is the number of distinct users in all_users, i.e. users who liked
# at least one of these movies, not every user in the ratings file.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [65]:
all_user_recs

movieId
318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: count, Length: 113, dtype: float64

### Creating a recommendation score ###

In [66]:
# I want to compare the two fractions movie by movie,
# so combine similar_user_recs and all_user_recs into one dataframe
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1) # axis=1 because each series will be a column
rec_percentages.columns = ["similar", "all"] # same order as the series passed to concat

In [67]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [68]:
# How to read rec_percentages:
#   "similar" - the fraction of users who are similar to us (who like movieId 1) who like the given movie.
#   "all"     - the fraction of the users in all_users (anyone who liked at least one of these movies)
#               who like the given movie.
#
# We want movies that have a big difference between the "similar" and "all" values.
#
# Explanation:
# Suppose we are looking for recommendations for people who liked "Avengers".
# If 100% of those similar users like "Toy Story" but 100% of all users like "Toy Story" as well,
# then "Toy Story" is not a good recommendation: everyone likes it, so it says nothing about "Avengers" fans.
# We want movies whose numbers show that people who liked "Avengers" also liked "Thor",
# while not everyone liked "Thor".




In [69]:
# find the ratio between the two fractions: the higher the score, the more the movie is
# liked by similar users relative to users in general
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [70]:
# We want the movies with the highest recommendation score to be at the top
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [71]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [72]:
# Look up title and genres for the first 10 rows of rec_percentages by matching its movieId index
# against the "movieId" column of movies
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


### Building a recommendation function ###

In [73]:
# I will be using code that I wrote in previous cells for this function
def find_similar_movies(movie_id, rating_threshold=4.0, min_similar_share=0.1, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    "Liking" a movie means rating it strictly above ``rating_threshold``.
    A candidate's score is the share of similar users (fans of ``movie_id``)
    who liked it, divided by the share of all candidate-liking users who liked it.

    Args:
        movie_id: ``movieId`` of the movie to base the recommendations on.
        rating_threshold: A rating must be strictly greater than this to count as a like.
        min_similar_share: A candidate is kept only if more than this fraction
            of the similar users liked it.
        top_n: Maximum number of recommendations to return.
        ratings_df: Ratings table with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the notebook-level ``ratings``.
        movies_df: Movie table with ``movieId``, ``title`` and ``genres``
            columns. Defaults to the notebook-level ``movies``.

    Returns:
        A dataframe with the columns ``score``, ``title`` and ``genres``,
        highest score first. Empty when nobody liked ``movie_id`` or no
        candidate passes ``min_similar_share``.
    """
    ratings_df = ratings if ratings_df is None else ratings_df
    movies_df = movies if movies_df is None else movies_df
    empty_result = pd.DataFrame(columns=["score", "title", "genres"])

    # Every step below only looks at likes, so filter the (large) ratings table once
    liked = ratings_df[ratings_df["rating"] > rating_threshold]

    # finding recommendations from users similar to us
    similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to base recommendations on (and avoids dividing by zero)
        return empty_result
    similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]

    # keep only the movies that a large enough share of the similar users liked
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_similar_share]
    if similar_user_recs.empty:
        return empty_result

    # finding how common the same recommendations are among all of the users
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # creating the recommendation score
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top_recs = rec_percentages.head(top_n)
    return top_recs.merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

### Creating an interactive recommendation widget ###

In [75]:
movie_input_title = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

# This widget gets its own output area and callback names. Reusing the names
# `list_of_movies` / `on_type` from the search widget would rebind those globals,
# and the earlier search box would then start drawing its results into this output.
recommendation_output = widgets.Output()

def on_recommendation_type(data):
    """Show recommendations for the best title match whenever the text box changes.

    `data` is the change dictionary passed by the widget; the text currently in
    the box is stored under its "new" key.
    """
    with recommendation_output:
        recommendation_output.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            if results.empty:
                # No titles to match against, so there is nothing to recommend
                return
            # search() is relied on to return its closest match in the first row
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

# ties the callback to the text box so that it runs every time the box's value changes
movie_input_title.observe(on_recommendation_type, names='value')

display(movie_input_title, recommendation_output)

Text(value='Toy Story', description='Movie Title:')

Output()