In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

In [2]:
# Load datasets
movies = pd.read_csv('movies.csv', sep=',')
ratings = pd.read_csv('ratings.csv', sep=',')

# Merge movies and ratings dataframes
data = pd.merge(ratings, movies, on='movieId')

# Check for missing values
#print(movies.isnull().sum())
#print(ratings.isnull().sum())

In [3]:
# Extract features from the movies' genres
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Calculate cosine similarity between movies
movie_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
movie_similarity_df = pd.DataFrame(movie_similarity, index=movies['title'], columns=movies['title'])

# Display similarity matrix
movie_similarity_df.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,0.813578,0.152769,0.135135,0.267586,0.0,0.152769,0.654698,0.0,0.262413,...,0.360397,0.465621,0.196578,0.516225,0.0,0.680258,0.755891,0.0,0.421037,0.267586
Jumanji (1995),0.813578,1.0,0.0,0.0,0.0,0.0,0.0,0.804715,0.0,0.322542,...,0.0,0.0,0.0,0.0,0.0,0.341376,0.379331,0.0,0.0,0.0
Grumpier Old Men (1995),0.152769,0.0,1.0,0.884571,0.570915,0.0,1.0,0.0,0.0,0.0,...,0.162848,0.0,0.419413,0.0,0.0,0.181883,0.202105,0.0,0.0,0.570915
Waiting to Exhale (1995),0.135135,0.0,0.884571,1.0,0.505015,0.0,0.884571,0.0,0.0,0.0,...,0.144051,0.201391,0.68744,0.0,0.0,0.160888,0.178776,0.466405,0.0,0.505015
Father of the Bride Part II (1995),0.267586,0.0,0.570915,0.505015,1.0,0.0,0.570915,0.0,0.0,0.0,...,0.28524,0.0,0.734632,0.0,0.0,0.318581,0.354002,0.0,0.0,1.0


In [4]:
# Calculate average ratings for each movie
average_ratings = data.groupby('title')['rating'].mean()
average_ratings_df = pd.DataFrame(average_ratings)

# Display average ratings
average_ratings_df.head()


Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'71 (2014),4.0
'Hellboy': The Seeds of Creation (2004),4.0
'Round Midnight (1986),3.5
'Salem's Lot (2004),5.0
'Til There Was You (1997),4.0


In [19]:
# Function to get movie recommendations based on a given movie
def get_movie_recommendations(user_input, num_recommendations):
    # Find the best match for the user input
    best_match = process.extractOne(user_input, movie_similarity_df.columns, scorer=fuzz.partial_ratio)
    
    if best_match is None or best_match[1] < 60:  # Use a threshold to filter poor matches
        return "Movie not found."

    movie_title = best_match[0]
    
    # Get similar movies
    similar_movies = movie_similarity_df[movie_title].sort_values(ascending=False).index[1:]
    similar_scores = movie_similarity_df[movie_title].sort_values(ascending=False).values[1:]

    # Combine similarity scores with average ratings
    recommendations = []
    for movie, score in zip(similar_movies, similar_scores):
        if movie in average_ratings_df.index:
            avg_rating = average_ratings_df.loc[movie, 'rating']
            recommendations.append((movie, score, avg_rating))

    # Sort recommendations by a combined score (e.g., similarity + average rating)
    recommendations.sort(key=lambda x: (x[1], x[2]), reverse=True)

    # Return top N recommendations
    return movie_title, recommendations[:num_recommendations]

# Get user input for the movie title
user_movie = input("Enter the movie title you like: ")

def get_integer_input(prompt):
    while True:
        try:
            # Attempt to convert the input to an integer
            user_input = int(input(prompt))
            return user_input
        except ValueError:
            # If input is not a valid integer, prompt the user again
            print("Invalid input. Please use only numbers in this field.")

num_recommendations = get_integer_input("How many recommendations would you like? ")
result = get_movie_recommendations(user_movie, num_recommendations)

# Display recommendations
if isinstance(result, str):  # Check if the result is an error message
    print(result)
else:
    movie_title, recommendations = result
    print()
    print(f"Recommended Movies for '{movie_title}':")
    print()
    for movie, score, rating in recommendations:
        print(f"{movie:<70} Similarity: {score:<10.2f} Rating: {rating:.2f}")

# Provides a pause and option to exit when ready
input("Press Enter to exit...")


Enter the movie title you like:  Inception
How many recommendations would you like? 5



Recommended Movies for 'Inception (2010)':

Watchmen (2009)                                                        Similarity: 0.94       Rating: 3.99
Super 8 (2011)                                                         Similarity: 0.88       Rating: 3.60
RoboCop (2014)                                                         Similarity: 0.85       Rating: 2.33
Strange Days (1995)                                                    Similarity: 0.84       Rating: 3.25
V for Vendetta (2006)                                                  Similarity: 0.83       Rating: 3.88


Press Enter to exit... 


''