In [11]:
import pandas as pd
import re

# Load the movie catalogue from the CSV file that sits next to the notebook.
movies = pd.read_csv("movies.csv")
# A bare name on the last line of a cell renders the DataFrame as the cell output.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [12]:
def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII letter, digit, or space.

    Parameters
    ----------
    title : str
        Raw title text, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The filtered text, e.g. ``"Toy Story 1995"``. Removed characters are
        dropped outright (not replaced), so the surrounding spaces are kept as-is.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

# Store the cleaned version of every title in a new column next to the original one.
movies["clean_title"] = movies["title"].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


### Term Frequency Matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Index single words as well as pairs of consecutive words (unigrams + bigrams),
# so that multi-word queries are matched more accurately than with single words alone.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the cleaned titles and encode them as a TF-IDF matrix.
tfidf = vectorizer.fit_transform(movies["clean_title"])

### Search Function

In [92]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=10, min_similarity=0.05):
    """Find the movies whose titles are most similar to a free-text query.

    The query is cleaned with ``clean_title``, transformed with the fitted
    ``vectorizer`` and compared with every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        The text typed by the user.
    top_n : int, default 10
        Maximum number of matches to return. Values larger than the number of
        titles are clamped to it; values below 1 are treated as 1.
    min_similarity : float, default 0.05
        If the best similarity is below this value the query is treated as
        "not found".

    Returns
    -------
    pandas.DataFrame or str
        The ``movieId``, ``title`` and ``genres`` columns of the matching rows
        of ``movies``, ordered from most to least similar (best match in the
        first row), or a message string when nothing is similar enough.
    """
    # clean the query the same way the indexed titles were cleaned
    title = clean_title(title)

    # use vectorizer to transform the title into a set of numbers
    query_vector = vectorizer.transform([title])

    # similarity between the query and each title in the dataset
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # if nothing is similar enough (or there is nothing to search), return message
    if similarity.size == 0 or similarity.max() < min_similarity:
        return "Movie not found. Try typing the whole title or searching for another movie."

    # never ask argpartition for more elements than exist
    top_n = max(1, min(top_n, similarity.size))

    # argpartition only guarantees that the top_n largest values end up in the
    # last top_n slots; their order inside that slice is undefined ...
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    # ... so sort just those candidates explicitly, most similar first
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[ranked][["movieId", "title", "genres"]]

### Interactive Search Widget

In [93]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into (starts out empty).
movie_input = widgets.Text(value="", description="Movie Title:")

# Output area that the search results are rendered into.
movie_output = widgets.Output()

def type_in(data):
    """Refresh the search results when the text box changes.

    ``data`` is the change dictionary passed by the widget observer; its
    ``"new"`` entry holds the current text. Any previous output is cleared
    first, and a search is only run once more than one character was typed.
    """
    with movie_output:
        movie_output.clear_output()
        query = data["new"]
        if len(query) <= 1:
            return
        # look the title up and show the result inside the output widget
        display(search(query))

# Call type_in whenever the text box value changes, then render both widgets.
movie_input.observe(type_in, names="value")
display(movie_input, movie_output)

Text(value='', description='Movie Title:')

Output()

### Finding Similar Movies based on Ratings

In [35]:
# Load the user ratings from the CSV file that sits next to the notebook.
ratings = pd.read_csv("ratings.csv")
# A bare name on the last line of a cell renders the DataFrame as the cell output.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [36]:
# Inspect the dtype pandas assigned to each column of the ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [37]:
def recommend_movies(movie_id, min_rating=4, min_share=0.10, top_n=25):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    A rating counts as a "like" when it is strictly greater than ``min_rating``.
    "Similar users" are the users who liked ``movie_id``. For every movie liked
    by at least ``min_share`` of the similar users, the score is the share of
    similar users who liked it divided by the share of all users (who liked any
    of the candidate movies) who liked it. A higher score means the movie is
    more specific to fans of ``movie_id``.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.10
        Minimum fraction of similar users that must like a candidate movie.
    top_n : int, default 25
        Number of highest-scoring candidates that are merged with ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, ordered by descending
        score. Empty when nobody liked ``movie_id``.
    """
    # filter the "liked" ratings once; every later step works on this subset
    # instead of re-scanning the whole ratings table
    liked = ratings.loc[ratings["rating"] > min_rating, ["userId", "movieId"]]

    # find users who also liked the same movie
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # nothing to compare against; also avoids dividing by zero below
        return pd.DataFrame(columns=["score", "title", "genres"])

    # find other movies that similar users liked
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # keep only the movies liked by at least `min_share` of the similar users
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs >= min_share]

    # find how much all users in the dataset like those candidate movies
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    # share of those users that like each candidate movie
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # put both percentages side by side, aligned on movieId
    recs = pd.concat([similar_user_recs, all_users_recs], axis=1)
    recs.columns = ["similar", "all"]

    # ratio between the two percentages (higher score means stronger recommendation)
    recs["score"] = recs["similar"] / recs["all"]
    recs = recs.sort_values("score", ascending=False)

    # attach title and genres to the best candidates
    top = recs.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

### Interactive Recommendation Widget

In [None]:
# Text box for the title of a movie to base the recommendations on.
movie_title_input = widgets.Text(value="", description="Movie Title:", placeholder="Enter a movie title...")

# Output area that the recommendations are rendered into.
rec_list = widgets.Output()

def type_in(data):
    """Refresh the recommendations when the text box changes.

    ``data`` is the change dictionary passed by the widget observer; its
    ``"new"`` entry holds the current text. Any previous output is cleared
    first. Once more than one character was typed, the title is looked up
    with ``search``: a string result (no match) is shown as-is, otherwise
    the ``movieId`` of the first returned row is passed to
    ``recommend_movies`` and the resulting table is displayed.
    """
    with rec_list:
        rec_list.clear_output()
        query = data["new"]
        if len(query) <= 1:
            return

        matches = search(query)
        if isinstance(matches, str):
            # search returned its "not found" message instead of a table
            display(matches)
            return

        # use the first row of the search results as the reference movie
        first_match_id = matches.iloc[0]["movieId"]
        display(recommend_movies(first_match_id))

# Call type_in whenever the text box value changes, then render both widgets.
movie_title_input.observe(type_in, names="value")
display(movie_title_input, rec_list)

Text(value='', description='Movie Title:', placeholder='Enter a movie title...')

Output()