<a href="https://colab.research.google.com/github/intheblueside/machine_learning_projects/blob/main/movie_rec/movie_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

import pandas as pd


In [None]:
# The access token is a credential and must not be committed with the notebook,
# so it is read from the GITHUB_RAW_TOKEN environment variable instead.
# Leave the variable unset when the CSV is publicly readable.
url = 'https://raw.githubusercontent.com/intheblueside/movie-rec/main/movies.csv'
github_raw_token = os.environ.get("GITHUB_RAW_TOKEN")
if github_raw_token:
  url = f"{url}?token={github_raw_token}"
movies = pd.read_csv(url)

In [None]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
import re

#function to clean dataset, using regex
def clean_title(title):
  """Return ``title`` with every character other than ASCII letters, digits and spaces removed."""
  return re.sub("[^a-zA-Z0-9 ]", "", title)

In [None]:
# add a cleaned copy of every title (see clean_title); this column is the text that gets vectorised below
movies["clean_title"] = movies["title"].apply(clean_title)

In [None]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


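Building the model
- Term frequency (TF): each title is turned into a vector that records which terms it contains (0/1 per term)
- Inverse document frequency (IDF): terms that appear in many titles are weighted down
- Each title's final vector is TF * IDF
- Similar titles are found by comparing a query vector against this TF-IDF matrix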

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF (term frequency - inverse document frequency) vectorizer;
# ngram_range=(1,2) makes it consider single words and pairs of adjacent words
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# learn the vocabulary from the cleaned titles and build the TF-IDF matrix used by search()
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_k=5):
  """Return the movies whose titles best match ``title``, best match first.

  The query is normalised with ``clean_title``, vectorised with the fitted
  ``vectorizer`` and compared against every row of ``tfidf`` by cosine
  similarity.

  Args:
    title: Free-text movie title to look up.
    top_k: Maximum number of rows to return. Defaults to 5, the size that was
      previously hard-coded.

  Returns:
    Rows of ``movies`` ordered by decreasing similarity, so ``.iloc[0]`` is the
    closest match. Empty when ``top_k`` is not positive or there are no titles.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])  # vectorise the single query string
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Never ask for more rows than exist; argpartition raises on an out-of-range kth.
  top_k = max(0, min(top_k, len(similarity)))
  if top_k == 0:
    return movies.iloc[[]]

  # argpartition selects the top_k scores cheaply but leaves them in arbitrary
  # order, so the candidates are ranked explicitly afterwards.
  candidates = np.argpartition(similarity, -top_k)[-top_k:]
  ranked = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[ranked]

In [None]:
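# indices    = positions of the 5 most similar titles (the number is set inside search)
# similarity = vector of similarity scores between the search term and every title
# query_vec  = the search term transformed into a TF-IDF vector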

In [None]:
# create widgets
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text ( # creates an input box, pre-filled with "Toy Story"
    value="Toy Story",
    description ="Movie Title: ",
    disabled=False
)
# output area that on_type clears and refills with search results
movie_list = widgets.Output()

def on_type(data):
  """Observer callback: refresh the search results when the text box value changes.

  ``data`` is the change notification; its ``'new'`` entry holds the current
  text. Inputs of five characters or fewer only clear the output area.
  """
  with movie_list:
    movie_list.clear_output()
    typed = data['new']
    if len(typed) <= 5:
      return
    matches = search(typed)
    display(matches)

# call on_type whenever the text box's "value" changes
movie_input.observe(on_type, names="value")

# show the text box together with the results area
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title: ')

Output()

In [None]:
# mount Google Drive (Colab only) so that ratings.csv can be read from it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# user ratings (userId, movieId, rating, timestamp); the path requires the Drive mount above
ratings = pd.read_csv('/content/drive/MyDrive/datasets/ratings.csv')

In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [None]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [None]:
# find users who liked the same movie, then look at which other movies those users liked

In [None]:
# seed movie for the step-by-step walkthrough; movieId 1 is Toy Story (1995) in the movies table
movie_id = 1

In [None]:
# users who rated movie_id 5 or higher; unique() keeps each user id only once
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)] ["userId"].unique()

In [None]:
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530])

In [None]:
# ids of every movie those similar users rated above 4 (one entry per rating)
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [None]:
similar_user_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [None]:
similar_user_recs.value_counts()

1         13506
318        5599
260        5464
356        4690
296        4628
          ...  
27306         1
71732         1
4739          1
190187        1
97957         1
Name: movieId, Length: 16797, dtype: int64

In [None]:
# turn the counts into the share of similar users who liked each movie
# NOTE(review): this cell reassigns similar_user_recs, so re-running it on its own
# applies value_counts() to the already-normalised shares - re-run the cell above first
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# keep only movies liked by more than 10% of the similar users
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [None]:
similar_user_recs

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

In [None]:
# every rating above 4, from any user, for one of the candidate movies kept above
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [None]:
# share of the distinct users in all_users who liked each candidate movie
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [None]:
all_users_recs

318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: movieId, Length: 92, dtype: float64

In [None]:
# put both shares side by side, aligned on the movie id index
rec_percent = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percent.columns = ["similar", "all"]

In [None]:
rec_percent

Unnamed: 0,similar,all
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195


In [None]:
# score above 1: the movie is liked by a larger share of the similar users than of the all_users group
rec_percent["score"] = rec_percent["similar"] / rec_percent["all"]

In [None]:
# highest score first
rec_percent = rec_percent.sort_values("score", ascending=False)

In [None]:
rec_percent #higher score, better rec

Unnamed: 0,similar,all,score
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [None]:
# attach title and genres to the 10 highest-scoring movie ids (the index holds the movieId)
rec_percent.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125844,7.946323,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.295498,0.054186,5.453383,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.124685,0.025316,4.925186,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.138161,0.035445,3.897906,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.233674,0.068117,3.43048,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.198949,0.060514,3.287671,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.158226,0.052696,3.002602,34,Babe (1995),Children|Drama,Babe 1995
4780,0.210647,0.071444,2.94841,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1047,0.143418,0.049202,2.914882,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
729,0.108322,0.037362,2.899227,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995


In [None]:
def find_similar_movies(movie_id, min_share=0.1, top_n=10):
  """Recommend movies liked by the users who rated ``movie_id`` 5 or higher.

  Each candidate is scored as the share of those "similar" users who rated it
  above 4, divided by the share of the comparison group (all users who rated
  any candidate above 4) who did the same. A score above 1 means the movie is
  liked more often by the similar users than by the comparison group.

  Args:
    movie_id: ``movieId`` of the seed movie.
    min_share: A candidate must be liked by more than this fraction of the
      similar users. Defaults to 0.1, the previously hard-coded value.
    top_n: Maximum number of recommendations returned. Defaults to 10, the
      previously hard-coded value.

  Returns:
    DataFrame with columns ``score``, ``title`` and ``genres``, best score
    first. Empty when no user rated ``movie_id`` 5 or higher.
  """
  # users who rated the seed movie 5 or higher
  similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)]["userId"].unique()
  if len(similar_users) == 0:
    # nobody to learn from: return an explicit empty result with the usual columns
    return pd.DataFrame(columns=["score", "title", "genres"])

  # the "rating above 4" filter is needed twice below, so apply it once
  liked = ratings[ratings["rating"] > 4]

  # share of similar users who liked each movie, keeping only sufficiently common ones
  similar_user_recs = liked[liked["userId"].isin(similar_users)]["movieId"]
  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > min_share]

  # share of the comparison group who liked each of those candidates
  all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
  all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  rec_percent = pd.concat([similar_user_recs, all_users_recs], axis=1)
  rec_percent.columns = ["similar", "all"]

  rec_percent["score"] = rec_percent["similar"] / rec_percent["all"]
  rec_percent = rec_percent.sort_values("score", ascending=False)

  # only show 3 columns
  return rec_percent.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]



In [None]:
# text box for the movie title, pre-filled with "Toy Story"
movie_name_input = widgets.Text(
    value="Toy Story",
    description="Movie Title: ",
    disabled=False
)

# output area that on_type clears and refills with recommendations
recommendation_list = widgets.Output()

def on_type(data):
  """Observer callback: refresh the recommendations when the text box value changes.

  ``data`` is the change notification; its ``"new"`` entry holds the current
  text. Inputs of five characters or fewer only clear the output area;
  otherwise the first row returned by ``search`` seeds ``find_similar_movies``.
  """
  with recommendation_list:
    recommendation_list.clear_output()
    typed = data["new"]
    if len(typed) <= 5:
      return
    first_hit = search(typed).iloc[0]
    display(find_similar_movies(first_hit["movieId"]))

# call on_type whenever the text box's "value" changes, then show both widgets
movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title: ')

Output()