In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# load the movie metadata (columns: movieId, title, genres)
# NOTE(review): absolute Colab path — only resolves after drive.mount('/content/drive') has run
movies = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/movie_yt/movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [6]:
# load the user ratings (columns: userId, movieId, rating, timestamp)
# NOTE(review): absolute Colab path — only resolves after drive.mount('/content/drive') has run
ratings = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/movie_yt/ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [8]:
def clean_title(title):
    """Strip everything except ASCII letters, digits and spaces from a movie title.

    Punctuation such as parentheses, commas, apostrophes and dashes is removed,
    e.g. "Toy Story (1995)" becomes "Toy Story 1995".

    Args:
        title: the raw title string.

    Returns:
        The title with all other characters deleted (nothing is inserted in
        their place, so "Spider-Man" becomes "SpiderMan").
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)

# add a "clean_title" column to `movies` holding the punctuation-free version of each title
movies["clean_title"] = movies["title"].apply(clean_title)

In [9]:
# create a TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1,2)), so that
# two-word sequences such as "Toy Story" are scored as a unit and not only word by word
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# fit on the cleaned titles; `tfidf` has one row per row of `movies`, in the same order
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [26]:
def search(title, n=1):
    """Return the movies whose cleaned titles are most similar to a search string.

    The query is cleaned with ``clean_title``, projected into the TF-IDF space of
    the fitted ``vectorizer`` and compared against every row of ``tfidf`` using
    cosine similarity.

    Args:
        title: free-text search string, e.g. "Toy Story 1995".
        n: how many matches to return. Defaults to 1 (the single best match).
            Values below 1 are treated as 1 and values above the number of
            movies are capped.

    Returns:
        The matching rows of ``movies``, ordered from most to least similar.
    """
    # clean the search title the same way the indexed titles were cleaned
    query_vec = vectorizer.transform([clean_title(title)])
    # cosine similarity between the query and every movie title
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    n = max(1, min(int(n), similarity.size))
    # argpartition selects the n best candidates without sorting the whole array,
    # but leaves them in arbitrary order ...
    top = np.argpartition(similarity, -n)[-n:]
    # ... so rank just those n candidates by similarity, best first
    top = top[np.argsort(similarity[top])[::-1]]
    return movies.iloc[top]

In [27]:
search("Toy Story 1995")

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


In [21]:
movies[movies["movieId"]== 1]

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995


### Similar Users

In [14]:
movie_id = 1

# step-by-step walkthrough of the logic that find_similar_movies(movie_id) wraps up later on
movie = movies[movies["movieId"] == movie_id]

# find users who "liked" this movie, i.e. rated it strictly above 4
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

# find every movie these users liked (rating > 4); one entry per (user, movie) rating,
# so this still includes the query movie itself
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

# show how many of the similar users liked each movie
similar_user_recs.value_counts()

1         18835
318        8393
260        7605
356        6973
296        6918
          ...  
128478        1
125125        1
119701        1
107563        1
7625          1
Name: movieId, Length: 19282, dtype: int64

The movie with ID 1 (the query movie itself) appears 18835 times, the movie with ID 318 appears 8393 times, etc.

In [15]:
# turn the like-counts into the share of similar users who liked each movie
# NOTE: this overwrites `similar_user_recs` (until now a Series of movieIds), so the cell
# is not idempotent — re-run the previous cell before executing this one a second time
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
# keep only the movies liked by more than 10% of the similar users
similar_user_recs = similar_user_recs[similar_user_recs > .10]
similar_user_recs

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

We have 113 movies that more than 10% of the users similar to us liked.

### All users

In [16]:
# find all ratings above 4, from any user, for the movies shortlisted above
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

# share of these users who liked each shortlisted movie; the denominator counts only
# users who liked at least one shortlisted movie, not every user in `ratings`
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

all_user_recs

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

In [17]:
# put the two shares side by side, aligned on movieId (the index of both Series):
#   "similar" = share of similar users who liked the movie
#   "all"     = share of all users (who liked any shortlisted movie) who liked it
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124728
32,0.160711,0.100293
34,0.130555,0.052229
47,0.225909,0.144469
50,0.275604,0.200513
...,...,...
59315,0.104593,0.054269
60069,0.170640,0.076307
68954,0.159172,0.064944
78499,0.152960,0.035131


In [18]:
# recommendation score = share among similar users divided by share among all users;
# the higher the score, the more a movie is over-represented among similar users
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

# sort descending so the strongest recommendations come first
rec_percentages = rec_percentages.sort_values("score", ascending=False)
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


## Evaluating our Recommender System

In [19]:
# take the top 10 recommendations and join them to `movies` to show titles next to the scores;
# the index of rec_percentages holds the movieId, hence left_index=True / right_on="movieId"
# note: the query movie itself (movieId 1) ranks first, since every similar user liked it
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [47]:
def find_similar_movies(movie_id, *, min_rating=4, min_share=0.10, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies that fans of ``movie_id`` like disproportionately often.

    "Similar users" are the users who rated ``movie_id`` above ``min_rating``.
    Each candidate movie is scored as

        score = (share of similar users who liked it) / (share of all users who liked it)

    where "all users" are the users who liked at least one candidate movie.
    The query movie itself is a candidate and normally ranks first, because
    every similar user liked it.

    Args:
        movie_id: ``movieId`` of the movie to find recommendations for.
        min_rating: a rating counts as a "like" when it is strictly greater
            than this value. Defaults to 4.
        min_share: only movies liked by more than this fraction of the similar
            users become candidates. Defaults to 0.10.
        top_n: maximum number of recommendations to return. Defaults to 10.
        ratings_df: frame with ``userId``, ``movieId`` and ``rating`` columns.
            Defaults to the notebook-level ``ratings`` frame.
        movies_df: frame with ``movieId``, ``title`` and ``genres`` columns.
            Defaults to the notebook-level ``movies`` frame.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, sorted by
        ``score`` descending. It is empty when no user rated ``movie_id`` above
        ``min_rating``.
    """
    # fall back to the notebook globals only when no frame is passed explicitly
    ratings_df = ratings if ratings_df is None else ratings_df
    movies_df = movies if movies_df is None else movies_df

    # the "liked" mask is needed three times; compute it once over the ratings frame
    liked = ratings_df["rating"] > min_rating

    # find similar users and the movies they liked
    similar_users = ratings_df[(ratings_df["movieId"] == movie_id) & liked]["userId"].unique()
    if len(similar_users) == 0:
        # nobody liked this movie: nothing to recommend, and avoids dividing by zero below
        return pd.DataFrame(columns=["score", "title", "genres"])
    similar_user_recs = ratings_df[ratings_df["userId"].isin(similar_users) & liked]["movieId"]

    # keep movies liked by more than `min_share` of the similar users
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # find how common those movies are among all users who liked any of them
    all_users = ratings_df[ratings_df["movieId"].isin(similar_user_recs.index) & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # create the recommendation score (both Series are indexed by movieId)
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # attach titles and genres to the best `top_n` candidates
    top = rec_percentages.head(top_n)
    return top.merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

### Example: Marvel Cinematic Universe

In [45]:
search("The Avengers 2012")

Unnamed: 0,movieId,title,genres,clean_title
17067,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012


In [48]:
movie_id = 89745
find_similar_movies(movie_id)

Unnamed: 0,score,title,genres
17067,24.716368,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
20513,19.610199,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX
25058,19.49177,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi
19678,17.867419,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX
16725,17.843074,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War
16312,17.299824,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX
21348,17.183667,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX
25071,16.649399,Captain America: Civil War (2016),Action|Sci-Fi|Thriller
25061,15.865628,Ant-Man (2015),Action|Adventure|Sci-Fi
14628,15.651921,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX


### Example: American Drama

In [49]:
search("Moonlight 2016")

Unnamed: 0,movieId,title,genres,clean_title
41769,162414,Moonlight,Drama,Moonlight


In [50]:
movie_id = 162414
find_similar_movies(movie_id)

Unnamed: 0,score,title,genres
41769,200.064474,Moonlight,Drama
48904,72.682599,The Florida Project (2017),Drama
55938,69.773375,Roma (2018),Drama
43222,62.173883,Manchester by the Sea (2016),Drama
44566,60.646722,Call Me by Your Name (2017),Drama|Romance
51758,48.471678,The Favourite,Drama
48887,44.145682,Lady Bird (2017),Comedy
18487,42.355992,"Master, The (2012)",Drama
40153,41.454801,The Handmaiden (2016),Drama|Romance|Thriller
51773,40.287583,Hereditary (2018),(no genres listed)


### Example: Animation

In [51]:
search("Finding Nemo 2003")

Unnamed: 0,movieId,title,genres,clean_title
6258,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003


In [52]:
movie_id = 6377
find_similar_movies(movie_id)

Unnamed: 0,score,title,genres
6258,13.957482,Finding Nemo (2003),Adventure|Animation|Children|Comedy
4780,6.526457,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
5110,6.300361,Ice Age (2002),Adventure|Animation|Children|Comedy
7734,6.231726,Shrek 2 (2004),Adventure|Animation|Children|Comedy|Musical|Ro...
11361,5.74275,Ratatouille (2007),Animation|Children|Drama
8246,5.703435,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
14813,5.433025,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
1818,5.319992,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...
7923,4.915663,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
2264,4.637759,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy


### Example: Horror

In [53]:
search("Halloween 1978")

Unnamed: 0,movieId,title,genres,clean_title
1893,1982,Halloween (1978),Horror,Halloween 1978


In [54]:
movie_id = 1982
find_similar_movies(movie_id)

Unnamed: 0,score,title,genres
1893,84.811914,Halloween (1978),Horror
1885,44.128026,Friday the 13th (1980),Horror|Mystery|Thriller
2368,36.428522,"Texas Chainsaw Massacre, The (1974)",Horror
1312,31.295663,"Nightmare on Elm Street, A (1984)",Horror|Thriller
1315,24.119373,"Omen, The (1976)",Horror|Mystery|Thriller
1310,23.985966,Carrie (1976),Drama|Fantasy|Horror|Thriller
7262,21.456689,Dawn of the Dead (1978),Action|Drama|Horror
947,20.334086,Night of the Living Dead (1968),Horror|Sci-Fi|Thriller
1905,19.922605,Poltergeist (1982),Horror|Thriller
4001,19.801901,"Evil Dead, The (1981)",Fantasy|Horror|Thriller
