In [233]:
import pandas as pd

# Source data (MovieLens 25M): https://files.grouplens.org/datasets/movielens/ml-25m.zip
# Both files are read via relative paths, so the extracted CSVs must sit in the working directory.
movieData = pd.read_csv("movies.csv")    # movie table: movieId, title, genres
ratingData = pd.read_csv("ratings.csv")  # rating table; userId, movieId and rating are used below



In [234]:
# Another way to clean the data is to drop duplicate rows from the DataFrame.
# This is not applied to the dataset the recommender actually uses; it is kept for future consideration.

# Manufacture ten duplicate rows by re-attaching rows 20-29 to the frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
remove_dup = pd.concat([movieData, movieData.iloc[20:30, :]])
remove_dup.duplicated().sum()                       # outputs the number of duplicated rows


10

In [235]:
# Keep only the first occurrence of each row, then confirm no duplicates are left.
remove_dup = remove_dup.loc[~remove_dup.duplicated()]
remove_dup.duplicated().sum()

0

In [236]:
# Preview the first rows of the movie table to check which columns were loaded.
movieData.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [237]:
import re

def cleanTitle(title):
    """Strip punctuation from a movie title, keeping letters, digits and spaces.

    The previous ASCII-only character class deleted every accented or
    non-Latin letter, so a title written entirely in another script was
    reduced to an empty string. ``\\w`` on a ``str`` pattern is Unicode-aware,
    so those characters are now preserved. Output for plain ASCII titles is
    unchanged.

    Parameters
    ----------
    title : str
        Raw title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with every character other than a letter, digit or space
        removed, e.g. ``"Toy Story 1995"``.
    """
    # \w also matches "_", which the original pattern removed, so drop it explicitly.
    title = re.sub(r"[^\w ]|_", "", title)
    return title

In [238]:
# Store a punctuation-free copy of every title next to the original; the TF-IDF search is built on it.
movieData["cleanTitle"] = movieData["title"].map(cleanTitle)

In [239]:
# Display the frame to confirm the new cleanTitle column sits alongside the original title.
movieData

Unnamed: 0,movieId,title,genres,cleanTitle
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [240]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2) makes the vocabulary contain single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vocabulary on the cleaned titles and keep the resulting matrix (one row per movie);
# search() compares a query vector against it.
tfidf = vectorizer.fit_transform(movieData["cleanTitle"])

In [241]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, n=5):
    """Return the movies whose titles are most similar to ``title``.

    The query is cleaned with ``cleanTitle``, vectorised with the fitted
    ``vectorizer`` and compared to every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows of ``movieData``, best match first.
    """
    title = cleanTitle(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if kth is out of bounds.
    n = min(n, similarity.size)
    if n <= 0:
        return movieData.iloc[[]]

    # argpartition only guarantees that the n largest scores end up in the last n
    # slots, in arbitrary order, so the selected indices must be sorted explicitly.
    indices = np.argpartition(similarity, -n)[-n:]
    indices = indices[np.argsort(-similarity[indices], kind="stable")]
    results = movieData.iloc[indices]

    return results

In [242]:
def find_similar_movies(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A movie's score is the share of "similar users" (users who rated
    ``movie_id`` above ``min_rating``) that liked it, divided by the share of
    all users who liked it. A high score means the movie is liked far more
    often by fans of ``movie_id`` than by the general audience.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the reference movie.
    min_rating : float, default 4
        A rating strictly greater than this counts as "liked".
    min_share : float, default 0.10
        Candidates must be liked by more than this fraction of similar users.
    top_n : int, default 10
        Number of top-scoring candidates merged with ``movieData``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when nobody liked ``movie_id``.
    """
    # Build the "liked" mask once; it spans the whole ratings table and was
    # previously recomputed for each of the three selections below.
    liked = ratingData["rating"] > min_rating

    # Users who liked the reference movie, and everything those users liked.
    similar_users = ratingData[(ratingData["movieId"] == movie_id) & liked]["userId"].unique()
    similar_user_recs = ratingData[ratingData["userId"].isin(similar_users) & liked]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # How popular the same candidates are among every user who liked any of them.
    all_users = ratingData[ratingData["movieId"].isin(similar_user_recs.index) & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movieData, left_index=True, right_on="movieId")[["score", "title", "genres"]]

def userInput():
    """Prompt for a movie title and return the matching ``movieId``.

    An exact match against ``movieData["title"]`` is tried first. If that
    finds nothing, the comparison is repeated ignoring case and surrounding
    whitespace, so ``"toy story (1995) "`` still resolves.

    Returns
    -------
    The ``movieId`` of the first matching row.

    Raises
    ------
    IndexError
        If no title matches the entered text. The exception type is the one
        the bare ``.values[0]`` lookup raised before, now with a readable
        message.
    """
    # User input
    search_value = input("Movie Name: ")

    # Search for the value in the title column
    titles = movieData["title"]
    mask = titles == search_value
    if not mask.any():
        # Fall back to a forgiving comparison before giving up.
        wanted = search_value.strip().casefold()
        mask = titles.str.strip().str.casefold() == wanted
    result = movieData.loc[mask]

    if result.empty:
        raise IndexError(
            f"No movie titled {search_value!r} was found; "
            "enter the full title including the year, e.g. 'Jumanji (1995)'."
        )

    # Retrieve the movieId of the first matching row
    movie_id = result["movieId"].values[0]
    return movie_id




In [243]:
# Prompt for a movie title and look up its movieId.
movie_id = userInput()
# Find movie recommendations for the movie that was entered
find_similar_movies(movie_id)

Unnamed: 0,score,title,genres
1,57.008249,Jumanji (1995),Adventure|Children|Fantasy
156,18.757121,Casper (1995),Adventure|Children
313,14.88039,"Santa Clause, The (1994)",Comedy|Drama|Fantasy
578,9.382034,Home Alone (1990),Children|Comedy
495,8.71198,Mrs. Doubtfire (1993),Comedy|Drama
362,8.666058,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
2526,7.959267,"Mummy, The (1999)",Action|Adventure|Comedy|Fantasy|Horror|Thriller
721,7.539414,Twister (1996),Action|Adventure|Romance|Thriller
579,6.375923,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller
312,6.231007,Stargate (1994),Action|Adventure|Sci-Fi
