In [1]:
import pandas as pd

# Read the movie table from ml-25m/movies.csv (path is relative to the working directory).
movies = pd.read_csv("ml-25m/movies.csv")
# Plain-text dump of the frame; pandas abbreviates the middle rows with "..." when printing.
print(movies)

import re 
def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Used to normalise titles (e.g. dropping parentheses and apostrophes)
    before they are vectorised or compared.

    title: the raw title string.
    Returns the title with all other characters removed; spaces are kept as-is.
    """
    unwanted_chars = re.compile("[^a-zA-Z0-9 ]")
    return unwanted_chars.sub("", title)


# Add a "clean_title" column: every value of "title" passed through clean_title().
# Note: this modifies `movies` in place rather than creating a new frame.
movies["clean_title"] = movies["title"].apply(clean_title)
print(movies)


from sklearn.feature_extraction.text import TfidfVectorizer 

# ngram_range=(1, 2): use both single words and two-word pairs as features,
# not pairs only.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the cleaned titles and transform them in one step;
# `tfidf` is what search() later compares query vectors against.
tfidf = vectorizer.fit_transform(movies["clean_title"])

#to compute the similarity between two titles  

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``.

    The search term is cleaned with clean_title(), vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    title: free-text search term.
    top_n: maximum number of rows to return (default 5, the previous fixed
        value). It is clamped to the number of available titles.
    Returns a slice of ``movies`` ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # Similarity between the search term and every title in the data.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist, so clamp.
    top_n = max(0, min(top_n, similarity.size))
    if top_n == 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the top_n largest scores land in the
    # last top_n slots; it does NOT order them among themselves. Sort the
    # candidates explicitly so the best match really comes first.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[ranked]

import ipywidgets as widgets
from IPython.display import display

# Text box for the search term, pre-filled with "Toy Story".
movie_input = widgets.Text(
    value= "Toy Story",
    description = "Movie Title:",
    disabled = False
)
# Output area that on_type() renders the search results into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the result area when the observed text value changes.

    data: change notification passed by the widget; its "new" entry is the
        current text of the box.

    The previous results are always cleared. A new search is only run when
    the text is longer than five characters, so shorter input leaves the
    result area empty.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))

# Call on_type() whenever the 'value' of the text box changes.
movie_input.observe(on_type, names= 'value')

# Show the text box together with the results area.
display(movie_input, movie_list)

    



       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

Text(value='Toy Story', description='Movie Title:')

Output()

In [2]:

# Find the users who liked the selected movie, then collect what else they liked.

ratings = pd.read_csv("ml-25m/ratings.csv")

ratings.dtypes  # not shown: only the last expression of a cell is displayed

movie_id = 1

# Users who rated the selected movie at least 5.
rated_selected_movie = ratings["movieId"] == movie_id
gave_top_rating = ratings["rating"] >= 5
similar_users = ratings.loc[rated_selected_movie & gave_top_rating, "userId"].unique()

# Every movieId those users rated above 4 (one entry per rating, so ids repeat).
# NOTE(review): the threshold here (> 4) differs from the one above (>= 5) -- confirm intended.
by_similar_user = ratings["userId"].isin(similar_users)
rated_above_four = ratings["rating"] > 4
similar_users_recs = ratings.loc[by_similar_user & rated_above_four, "movieId"]
similar_users_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [3]:
# Turn the raw list of liked movieIds into, per movie, the fraction of the
# similar users who liked it, and keep only movies above 10%.
# Caution: this cell overwrites `similar_users_recs`, which is also its input,
# so re-running it on its own would operate on the already-reduced series.
liked_counts = similar_users_recs.value_counts()
liked_share = liked_counts / len(similar_users)

similar_users_recs = liked_share[liked_share > .1]

similar_users_recs

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

In [4]:
# How much do users across the whole dataset like these same movies?

# Ratings above 4 for any of the candidate movies, from any user.
is_candidate_movie = ratings["movieId"].isin(similar_users_recs.index)
rated_above_four = ratings["rating"] > 4
all_users = ratings[is_candidate_movie & rated_above_four]

# Per movie: number of such ratings divided by the number of distinct users
# who gave at least one of them.
distinct_user_count = len(all_users["userId"].unique())
all_users_recs = all_users["movieId"].value_counts() / distinct_user_count

all_users_recs


318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: movieId, Length: 92, dtype: float64

In [5]:
# Put both fractions side by side, aligned on movieId: "similar" is the share
# among the similar users, "all" the share among all users.
rec_percentages = pd.concat(
    [similar_users_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)

rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195
