In [7]:
# https://files.grouplens.org/datasets/movielens/ml-25m.zip

In [17]:
import pandas as pd
movies = pd.read_csv("movies.csv")

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [19]:
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title) #cleaning and removing all titles with special characters don't really understand
    return title

movies["clean_title"] = movies["title"].apply(clean_title) #mapping each title in data frame to clean title function

movies


Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [21]:
#turned each thing title into a vector by using vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2)) #gives the language model more context by using bigrams for language processing

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [37]:
#now, turning this into a function

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title) #first cleaning up the title for special characters
    query_vec = vectorizer.transform([title]) #transform into a vector
    similarity = cosine_similarity(query_vec, tfidf).flatten() #comparing similarity between the title with all the titles
    indices = np.argpartition(similarity, -5)[-5:] #picking top 5 
    search_results = movies.iloc[indices].iloc[::-1]
    
    return search_results



In [54]:
#Visualize the search results by creating a widget

import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value="",
    description = "Movie Title",
    disabled = False
    

)

#calls the search function
movie_list = widgets.Output()

def on_type(data):
     with movie_list:
            movie_list.clear_output();
            title = data["new"]
            if len(title)>= 5:
                display(search(title)); 

            
movie_input.observe(on_type, names='value')


display(movie_input, movie_list)
            

Text(value='', description='Movie Title')

Output()

In [6]:
import pandas as pd

ratings = pd.read_csv("ratings.csv")

In [12]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [48]:
#find users who have liked the same movie as what is searched 

movieId = 0

similar_users = ratings[(ratings["movieId"] == movieId) & (ratings["rating"] > 4)]["userId"].unique()

In [47]:
# finding movie recs that more then 10% or more users similar to us also liked

similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
similar_user_recs #generated all the similar user recomendations

Series([], Name: movieId, dtype: int64)

In [23]:
#movie recs that users similar to us also liked (# of time liked liked)
similar_user_recs.value_counts()

count
0.000053    5890
0.000106    2252
0.000159    1257
0.000212     872
0.000265     643
            ... 
0.044279       1
0.043642       1
0.018158       1
0.043589       1
1.000000       1
Name: count, Length: 947, dtype: int64

In [37]:
#movie recs that 50% or more users similar to us also liked

similar_user_recs = similar_user_recs.value_counts() / len(similar_users) #number of times liked/total number of similar users

similar_user_recs = similar_user_recs[similar_user_recs > .5]

In [45]:
similar_user_recs

Series([], Name: count, dtype: float64)