In [14]:
# import the pandas library for dataframe use
import pandas as pd

# Data source: the MovieLens ml-25m archive. Download and extract the zip below;
# movies.csv and ratings.csv are read from the notebook's working directory.
# https://files.grouplens.org/datasets/movielens/ml-25m.zip


# load each csv file into its own dataframe
movieData = pd.read_csv("movies.csv")
ratingData = pd.read_csv("ratings.csv")



In [15]:
# Another way to clean the data is to drop duplicate rows from the dataframe.
# We do not do this on the actual dataset the recommender runs on; it is shown
# here for future consideration.

# Build a demo frame that deliberately contains duplicates by re-adding rows 20-29
# of movieData. pd.concat is used because DataFrame.append was removed in pandas 2.0.
remove_dup = pd.concat([movieData, movieData.iloc[20:30, :]])
remove_dup.duplicated().sum()   # number of rows that repeat an earlier row


10

In [16]:
# Drop the repeated rows (the first occurrence of each row is kept), then
# count the duplicates again to confirm none remain.
remove_dup = remove_dup.drop_duplicates(keep="first")
remove_dup.duplicated(keep="first").sum()

0

In [17]:
# preview the movie dataframe: head() returns the first n rows (5 when n is not given)
movieData.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
import re

# cleans the title string of any character other than a letter (lowercase or uppercase), space, or number 
def cleanTitle(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter (a-z, A-Z), a digit (0-9) or a space.

    Punctuation such as parentheses, commas and apostrophes is dropped, and so
    are accented or other non-ASCII letters.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [19]:
# Store the cleaned version of every title in a new "cleanTitle" column so the
# effect of cleanTitle() can be inspected next to the original title.
movieData["cleanTitle"] = movieData["title"].map(cleanTitle)

In [20]:
# Display the dataframe to compare "title" with the new "cleanTitle" column:
# cleanTitle() removed the parentheses around the year from the original title strings
movieData

Unnamed: 0,movieId,title,genres,cleanTitle
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [21]:
# creates the tfidf matrix
# computers can't compare raw text directly, so each movie title is converted
# into a vector of numbers that can be searched and compared
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))     # initialize the class
# ngram_range=(1,2) uses single words AND pairs of consecutive words as features,
# which makes the title search more accurate than single words alone


# fit the vectorizer on the cleaned titles and transform them into a matrix
# with one row per entry of movieData["cleanTitle"]
tfidf = vectorizer.fit_transform(movieData["cleanTitle"])

In [22]:

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# compute the similarity between a term we enter and all the movies in the list
def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is cleaned with ``cleanTitle``, turned into a TF-IDF vector with
    the already-fitted ``vectorizer`` and compared against every row of
    ``tfidf`` using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text search term.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movieData``, ordered from most to least
        similar. Fewer rows are returned when the dataset is smaller than
        ``top_n``.
    """
    title = cleanTitle(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition only guarantees WHICH entries are the largest, not their
    # order, so reversing its output does not give a best-first ranking (and it
    # raises when there are fewer rows than requested). Sort explicitly instead;
    # a stable sort on the negated scores keeps dataset order among ties.
    ranked = np.argsort(-similarity, kind="stable")
    indices = ranked[:max(top_n, 0)]

    # tfidf was built from movieData["cleanTitle"], so positions line up with
    # movieData rows and positional indexing is correct.
    return movieData.iloc[indices]

In [23]:
def find_similar_movies(movie_id):
    """Recommend movies based on users who rated ``movie_id`` above 4.

    Steps:
      1. Collect the unique users who rated ``movie_id`` above 4.
      2. For every movie those users rated above 4, compute the fraction of
         them who did so and keep movies above 10 percent ("similar").
      3. For the kept movies, divide each movie's count of above-4 ratings by
         the number of distinct users who rated any kept movie above 4 ("all").
      4. Score each movie as ``similar / all`` and sort descending.

    Returns the ten best-scoring movies joined with ``movieData``, with the
    columns ``score``, ``title`` and ``genres``.
    """
    liked = ratingData["rating"] > 4

    # users with similar taste: they rated the requested movie above 4
    watched_it = ratingData["movieId"] == movie_id
    similar_users = ratingData.loc[watched_it & liked, "userId"].unique()

    # other movies those users liked, as a fraction of the similar users
    from_similar = ratingData["userId"].isin(similar_users)
    liked_by_similar = ratingData.loc[from_similar & liked, "movieId"]
    similar_share = liked_by_similar.value_counts() / len(similar_users)
    similar_share = similar_share[similar_share > .10]  # keep movies liked by more than 10 percent

    # how popular the same movies are among everyone who liked any of them
    is_candidate = ratingData["movieId"].isin(similar_share.index)
    candidate_likes = ratingData[is_candidate & liked]
    overall_share = candidate_likes["movieId"].value_counts() / len(candidate_likes["userId"].unique())

    shares = pd.concat([similar_share, overall_share], axis=1)
    shares.columns = ["similar", "all"]

    # a high score means the movie is liked disproportionately by similar users
    shares["score"] = shares["similar"] / shares["all"]
    ranked = shares.sort_values("score", ascending=False)

    # attach title and genres to the ten best-scoring movies
    top_ten = ranked.head(10).merge(movieData, left_index=True, right_on="movieId")
    return top_ten[["score", "title", "genres"]]

def userInput():
    """Prompt for a movie title and return the matching ``movieId``.

    The entered text (surrounding whitespace removed) is compared with the
    ``title`` column of ``movieData``. An exact match is tried first; if there
    is none, the comparison is repeated ignoring letter case. When several
    rows match, the first one is used.

    Returns
    -------
    The ``movieId`` value of the first matching row.

    Raises
    ------
    IndexError
        If no title matches the input. The exception type is the one the
        unguarded ``.values[0]`` lookup used to raise, now with a message
        that explains the expected title format.
    """
    # user input
    search_value = input("Movie Name: ").strip()

    titles = movieData["title"]
    mask = titles == search_value
    if not mask.any():
        # tolerate differences in capitalisation, e.g. "toy story (1995)"
        mask = titles.str.lower() == search_value.lower()
    if not mask.any():
        raise IndexError(
            f"No movie titled {search_value!r} was found; enter the full title "
            "including the year, e.g. 'Toy Story (1995)'."
        )

    # retrieve the movieId of the first matching row
    return movieData.loc[mask, "movieId"].values[0]




In [24]:
# Ask for a movie title; it has to match a title in movies.csv, year included
# (e.g. "Toy Story (1995)")
movie_id = userInput()
# Find movie recommendations given the movie you inputted
find_similar_movies(movie_id)

Unnamed: 0,score,title,genres
0,8.017414,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3021,5.225654,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,4.405452,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
14813,4.354038,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
4780,3.320783,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
580,3.208539,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
6258,3.156862,Finding Nemo (2003),Adventure|Animation|Children|Comedy
587,2.99115,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
8246,2.972889,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
359,2.954762,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
