In [35]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

In [36]:
# load the movie catalogue (movieId, title, genres) from Google Drive
movies_csv_path = "/content/drive/MyDrive/Colab Notebooks/movie/movies.csv"
movies = pd.read_csv(movies_csv_path)

In [41]:
# strips extra characters (e.g., parentheses, dashes, colons) from movie titles
def clean_title(title):
    """Return `title` keeping only ASCII letters, digits and spaces.

    Every other character is deleted (not replaced by a space), so
    "Ant-Man (2015)" becomes "AntMan 2015".
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)

# add a column holding the cleaned version of every title
movies["clean_title"] = movies["title"].map(clean_title)

In [38]:
# fit a TF-IDF model on the cleaned titles, using single words and word pairs;
# the fitted vectorizer is kept so search queries can be projected into the same space
ngram_range = (1, 2)
vectorizer = TfidfVectorizer(ngram_range=ngram_range)
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [40]:
# outputs the most similar movies to the search title, best match first
def search(title, n=5):
    """Return the `n` movies whose cleaned titles are most similar to `title`.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    n : int, default 5
        Maximum number of matches to return. It is clamped to the number of
        indexed titles.

    Returns
    -------
    pandas.DataFrame
        Rows of `movies`, ordered from highest to lowest cosine similarity,
        so `.iloc[0]` is the best match.
    """
    # clean the search title the same way the indexed titles were cleaned
    title = clean_title(title)

    # project the search title into the fitted TF-IDF space
    query_vec = vectorizer.transform([title])

    # cosine similarity between the query and every indexed title
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # never ask for more rows than exist (argpartition raises otherwise)
    n = min(n, len(similarity))
    if n <= 0:
        return movies.iloc[0:0]

    # argpartition isolates the n largest scores but leaves them in
    # unspecified order, so sort just those n by descending similarity
    top = np.argpartition(similarity, -n)[-n:]
    top = top[np.argsort(-similarity[top], kind="stable")]
    return movies.iloc[top]

In [None]:
# sanity check: query with a cleaned title and inspect the matches that come back
search("Toy Story 1995")

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [56]:
# look up the catalogue row for a single movieId
has_target_id = movies["movieId"] == 260
movies.loc[has_target_id]

Unnamed: 0,movieId,title,genres,clean_title
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,Star Wars Episode IV A New Hope 1977


### Interactive Widget 

In [None]:
# create an input widget: a text box pre-filled with an example title
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    # the keyword is `disabled` (as used by the other Text widget in this notebook);
    # the previous spelling `disable` is not that keyword, so the setting was never applied
    disabled=False
)

In [None]:
# output area that shows the search results
movie_list = widgets.Output()

def on_type(data):
    """Refresh `movie_list` with search results for the newly typed title.

    `data` is the change event passed by `observe`; the updated text is read
    from its "new" key.
    """
    with movie_list:
        # drop the results shown for the previous input
        movie_list.clear_output()

        typed_title = data["new"]
        # ignore queries of 5 characters or fewer
        if len(typed_title) <= 5:
            return
        display(search(typed_title))

# call 'on_type' every time the text box's 'value' changes
movie_input.observe(on_type, names='value')

# show the text box followed by the results area
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [None]:
# load the user ratings table from Google Drive and preview it
ratings_csv_path = "/content/drive/MyDrive/Colab Notebooks/movie/ratings.csv"
ratings = pd.read_csv(ratings_csv_path)
ratings

### Similar Users

In [49]:
movie_id = 1

movie = movies[movies["movieId"] == movie_id]

# a rating above 4 counts as "liked"
liked = ratings["rating"] > 4

# find users who "liked" the same movie as the user
rated_this_movie = ratings["movieId"] == movie_id
similar_users = ratings[rated_this_movie & liked]["userId"].unique()

# find other movies these users liked
rated_by_similar_user = ratings["userId"].isin(similar_users)
similar_user_recs = ratings[rated_by_similar_user & liked]["movieId"]

# show how many times each movie appears among those liked ratings
similar_user_recs.value_counts()

1         18835
318        8393
260        7605
356        6973
296        6918
          ...  
128478        1
125125        1
119701        1
107563        1
7625          1
Name: movieId, Length: 19282, dtype: int64

The movie with ID 1 appears 18835 times, the movie with ID 318 appears 8393 times, etc. 

In [50]:
# turn the counts into the share of similar users who liked each movie
liked_share = similar_user_recs.value_counts() / len(similar_users)

# keep only the movies liked by more than 10% of similar users
similar_user_recs = liked_share[liked_share > .10]
similar_user_recs

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

We have 113 movies that users who are similar to us liked above a certain threshold.

### All users

In [51]:
# find every highly rated (above 4) rating of the movies that were recommended to us
is_recommended = ratings["movieId"].isin(similar_user_recs.index)
is_liked = ratings["rating"] > 4
all_users = ratings[is_recommended & is_liked]

# share of users who liked each movie; the denominator counts only the users
# who liked at least one of the recommended movies
liking_user_count = len(all_users["userId"].unique())
all_user_recs = all_users["movieId"].value_counts() / liking_user_count

all_user_recs

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

In [52]:
# put both shares side by side, one row per movieId
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
    keys=["similar", "all"],
)
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124728
32,0.160711,0.100293
34,0.130555,0.052229
47,0.225909,0.144469
50,0.275604,0.200513
...,...,...
59315,0.104593,0.054269
60069,0.170640,0.076307
68954,0.159172,0.064944
78499,0.152960,0.035131


In [53]:
# score = share among similar users divided by share among all users,
# then order the movies from highest to lowest score
rec_percentages = (
    rec_percentages
    .assign(score=lambda frame: frame["similar"] / frame["all"])
    .sort_values("score", ascending=False)
)
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [None]:
# keep the 10 highest-scoring movieIds and attach their titles and genres from `movies`
top_recs = rec_percentages.head(10)
top_recs.merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
17067,1.0,0.040459,24.716368,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
20513,0.103711,0.005289,19.610199,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,Thor The Dark World 2013
25058,0.241054,0.012367,19.49177,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
19678,0.216534,0.012119,17.867419,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
16725,0.215043,0.012052,17.843074,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
16312,0.175447,0.010142,17.299824,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011
21348,0.287608,0.016737,17.183667,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
25071,0.214049,0.012856,16.649399,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
25061,0.136017,0.008573,15.865628,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
14628,0.242876,0.015517,15.651921,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,Iron Man 2 2010


In [42]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies liked disproportionately by users who liked `movie_id`.

    Parameters
    ----------
    movie_id : int
        `movieId` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating strictly above this value counts as "liked".
    min_share : float, default .10
        Keep only movies liked by more than this share of similar users.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns "score", "title" and "genres", sorted by descending score.
        Empty when no user liked `movie_id`.
    """
    # every step below only looks at "liked" ratings, so filter once
    liked = ratings[ratings["rating"] > min_rating]

    # find similar users: those who liked the given movie
    similar_users = liked[liked["movieId"] == movie_id]["userId"].unique()
    if len(similar_users) == 0:
        # nobody liked this movie: skip the 0/0 division and the merge on an
        # empty index, and report "no recommendations" explicitly
        return pd.DataFrame(columns=["score", "title", "genres"])

    # share of similar users who liked each movie
    similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # how common those same movies are among everyone who liked any of them
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # score = share among similar users relative to share among all users
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # attach titles and genres by matching the movieId index against `movies`
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [43]:
# text box where the user types the movie to base recommendations on
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# output area that holds the recommendation table
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the first search hit of the newly typed title.

    `data` is the change event passed by `observe`; the updated text is read
    from its "new" key.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # ignore queries of 5 characters or fewer
        if len(query) <= 5:
            return
        # use the first row returned by the search as the movie the user meant
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))

# call 'on_type' every time the text box's 'value' changes
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()