# In [1]:
import pandas as pd
# Load the movie table from disk; code further down reads its "title" and
# "movieId" columns.
movies = pd.read_csv("movies/movies.csv")

# print(movies)

import re


def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space.

    The same normalisation is applied to the indexed titles and to search
    queries so that both are compared in the same form.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)


# Store a normalised copy of every title; the search index is built on it.
movies["clean_title"] = movies["title"].apply(clean_title)

# Libraries used to index the titles and to score a query against them.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ngram_range=(1, 2) makes the vectorizer use single words and pairs of
# adjacent words as features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit the vectorizer on the cleaned titles and turn them into a TF-IDF matrix
# (one row per entry of movies["clean_title"]).
tfidf = vectorizer.fit_transform(movies["clean_title"])


# creating a search function
def search(title, top_n=5):
    """Return the rows of ``movies`` whose cleaned titles best match ``title``.

    The query is normalised with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Args:
        title: Free-text movie title typed by the user.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        A slice of ``movies`` with at most ``top_n`` rows, ordered from the
        most similar title to the least similar one, so ``.iloc[0]`` is the
        best match.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more entries than the array holds, so
    # never request more matches than there are indexed titles.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the k largest scores end up in the
    # last k slots; it does not order them. Sort those k candidates explicitly
    # so the result really is ranked best-first.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]


# print(results)

# building interactive search box
import ipywidgets as widgets
from IPython.display import display

# Text box for the title to search for, pre-filled with "Toy Story".
movie_input = widgets.Text(value="Toy Story", description="Movie Title", disabled=False)

# Print the widget's textual representation.
print(movie_input)

# Output area that the search results are rendered into.
movie_list = widgets.Output()
def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text. Queries of five characters or fewer
    only clear the output area.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))


# Call on_type for every change of the text box's "value" trait, then show
# the text box together with its result area.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)
#reading in movie data ratings
# Load the ratings table; the columns used below are userId, movieId, rating.
ratings = pd.read_csv("movies/ratings.csv")

# Reference movie for this walkthrough.
movie_id = 1

# Users who rated the reference movie 5 or higher.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5), "userId"
].unique()

# Every movie those users rated above 4 ...
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4), "movieId"
]
# ... expressed as the share of similar users who rated it that highly,
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
# keeping only movies picked by more than 10% of them.
similar_user_recs = similar_user_recs[similar_user_recs > .1]

# Ratings above 4, from any user, for those candidate movies.
all_users = ratings[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]
# Share of those users who rated each candidate above 4.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

# Put both shares side by side, indexed by movieId.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

# Score = share among similar users relative to share among all users;
# the highest ratios come first.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
rec_percentages = rec_percentages.sort_values("score", ascending=False)

# Attach the movie columns to the ten best-scoring candidates.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")
def find_similar_movies(movieId, *, ratings_df=None, movies_df=None):
    """Recommend movies favoured by users who rated ``movieId`` 5 or higher.

    A candidate's score is the share of those "similar" users who rated it
    above 4, divided by the share of all users (among those who rated any
    candidate above 4) who did the same. A high score means the movie is
    liked disproportionately by fans of ``movieId``.

    Args:
        movieId: Identifier of the reference movie.
        ratings_df: Ratings table with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the module-level ``ratings`` frame.
        movies_df: Movie table with ``movieId``, ``title`` and ``genres``
            columns. Defaults to the module-level ``movies`` frame.

    Returns:
        A DataFrame with columns ``score``, ``title`` and ``genres`` holding
        up to ten recommendations, highest score first.
    """
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    liked = ratings_df["rating"] > 4

    # Users who rated the requested movie 5 or higher. The filter uses the
    # function argument, not the module-level ``movie_id``.
    is_fan = (ratings_df["movieId"] == movieId) & (ratings_df["rating"] >= 5)
    similar_users = ratings_df[is_fan]["userId"].unique()

    # Share of similar users who liked each movie; keep movies liked by more
    # than 10% of them.
    similar_user_recs = ratings_df[ratings_df["userId"].isin(similar_users) & liked]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    # Share of all users (who liked at least one candidate) that liked each
    # candidate movie.
    all_users = ratings_df[ratings_df["movieId"].isin(similar_user_recs.index) & liked]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both shares on movieId and rank by their ratio.
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(10).merge(movies_df, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]
    
#create a widget
import ipywidgets as widgets
from IPython.display import display

# Text box for the title to get recommendations for, pre-filled with
# "Toy Story".
movie_name_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

# Output area that the recommendation table is rendered into.
recommendation_list = widgets.Output()
def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text. For queries longer than five
    characters, the first row returned by ``search`` is taken as the
    reference movie and its recommendations are displayed. Shorter queries
    only clear the output area.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Measure the string length, then compare; ``len(title > 5)`` would
        # compare a str with an int and raise TypeError.
        if len(title) > 5:
            results = search(title)
            if results.empty:
                return
            # The id column is "movieId", the same key used when merging
            # with ``movies``.
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))


# Call on_type for every change of the text box's "value" trait, then show
# the text box together with its recommendation area.
movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)

# Cell output: ModuleNotFoundError: No module named 'sklearn'
# (the scikit-learn package must be installed before this cell can run)