In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
import warnings
from textblob import TextBlob
from wordcloud import WordCloud
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Load the movie metadata; index_col=0 makes the CSV's first column the row index
# instead of a regular data column.
movies=pd.read_csv('./data/movies.csv', index_col=0)
# Preview the first five rows to sanity-check the columns that were read.
movies.head()

Unnamed: 0,title,overview,original_language,vote_count,vote_average
0,Ad Astra,"The near future, a time when both hope and har...",en,2853,5.9
1,Bloodshot,"After he and his wife are murdered, marine Ray...",en,1349,7.2
2,Bad Boys for Life,Marcus and Mike are forced to confront new thr...,en,2530,7.1
3,Ant-Man,Armed with the astonishing ability to shrink i...,en,13611,7.1
4,Percy Jackson: Sea of Monsters,"In their quest to confront the ultimate evil, ...",en,3542,5.9


In [9]:
# Count missing values in each column of the freshly loaded frame.
movies.isna().sum()

title                 0
overview             30
original_language     0
vote_count            0
vote_average          0
dtype: int64

In [10]:
# Summarise row count, per-column non-null counts, dtypes and memory usage.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              10000 non-null  object 
 1   overview           9970 non-null   object 
 2   original_language  10000 non-null  object 
 3   vote_count         10000 non-null  int64  
 4   vote_average       10000 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 468.8+ KB


In [15]:
# Discard every row that has at least one missing value; the original index
# labels are kept, so the index is no longer contiguous afterwards.
movies = movies.dropna()

In [16]:
# Confirm that no missing values remain in any column.
movies.isna().sum()

title                0
overview             0
original_language    0
vote_count           0
vote_average         0
dtype: int64

In [17]:
# Re-inspect row count, non-null counts and dtypes of the current `movies` frame.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9970 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              9970 non-null   object 
 1   overview           9970 non-null   object 
 2   original_language  9970 non-null   object 
 3   vote_count         9970 non-null   int64  
 4   vote_average       9970 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 467.3+ KB


In [None]:
# Frame used by the recommender. The index is reset to 0..n-1 so that a row's
# index label equals its row/column position in the similarity matrices below
# (rows may have been dropped from `movies`, leaving gaps in its index).
movie_data = movies.reset_index(drop=True)

# Content-Based Filtering (using movie overviews)
# NOTE: cosine_similarity returns a dense n x n float array; for ~10k movies
# that is several hundred MB per matrix.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_data['overview'].fillna(''))
cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Rating-based similarity (stand-in for collaborative filtering)
# NOTE(review): this dataset has no per-user ratings -- there is no 'movie_id'
# or 'rating' column to pivot -- so a true item-item collaborative matrix cannot
# be built from it. Until user-level ratings are available, similarity is
# derived from the aggregate vote statistics that do exist.
vote_features = np.column_stack([
    movie_data['vote_average'].to_numpy(dtype=float),
    # log1p tames the heavy right tail of vote counts.
    np.log1p(movie_data['vote_count'].to_numpy(dtype=float)),
])
# Standardise each feature: cosine similarity on raw, all-positive features
# would be close to 1 for every pair and carry no signal.
vote_feature_std = vote_features.std(axis=0)
vote_feature_std[vote_feature_std == 0] = 1.0  # avoid 0/0 for a constant column
vote_features = (vote_features - vote_features.mean(axis=0)) / vote_feature_std
cosine_sim_collaborative = cosine_similarity(vote_features)

In [None]:
# Content-Enhanced Recommender Function
def content_enhanced_recommender(movie_title, movie_data, cosine_sim_content, cosine_sim_collaborative,
                                 content_weight=0.6, collaborative_weight=0.4, top_n=None):
    """Rank movies by a weighted blend of two similarity scores to one movie.

    Parameters
    ----------
    movie_title : str
        Exact title of the query movie. If several rows share the title, the
        first one is used.
    movie_data : pandas.DataFrame
        Frame with a 'title' column. Row i (by position, not by index label)
        must correspond to row/column i of both similarity matrices.
    cosine_sim_content : numpy.ndarray
        Square similarity matrix built from movie content.
    cosine_sim_collaborative : numpy.ndarray
        Square similarity matrix built from ratings.
    content_weight, collaborative_weight : float, optional
        Multipliers applied to the content and rating scores before summing.
    top_n : int or None, optional
        Number of titles to return; None returns the full ranking.

    Returns
    -------
    pandas.Series
        Titles ordered from highest to lowest hybrid score. The query movie
        itself is part of the ranking.

    Raises
    ------
    IndexError
        If `movie_title` does not occur in `movie_data['title']`.
    """
    # Work with row positions rather than index labels: the two only coincide
    # for a default 0..n-1 index, and the matrices are addressed by position.
    title_positions = np.flatnonzero((movie_data['title'] == movie_title).to_numpy())
    if title_positions.size == 0:
        raise IndexError(f"movie title {movie_title!r} not found in movie_data")
    target_position = title_positions[0]

    # Similarity of every movie to the target, from each signal
    content_scores = cosine_sim_content[target_position]
    collaborative_scores = cosine_sim_collaborative[target_position]

    # Combine content-based and collaborative scores
    hybrid_scores = content_weight * content_scores + collaborative_weight * collaborative_scores

    # argsort is ascending, so reverse it to put the best matches first
    ranked_positions = hybrid_scores.argsort()[::-1]
    if top_n is not None:
        ranked_positions = ranked_positions[:top_n]

    return movie_data['title'].iloc[ranked_positions]

In [None]:
# Example target movie (you should replace this with the actual movie title)
target_movie_title = 'The Dark Knight'
recommended_movies = content_enhanced_recommender(target_movie_title, movie_data, cosine_sim_content, cosine_sim_collaborative)

# Show only the ten best matches. The full ranking covers every movie, and
# printing it displays the lowest-ranked titles too. The query title is filtered
# out so a movie is not listed as a recommendation for itself.
recommended_movies[recommended_movies != target_movie_title].head(10)