In [26]:
#importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

In [27]:
#loading the dataset
movies = pd.read_csv(r'dataset/movies.csv')
ratings = pd.read_csv(r'dataset/ratings.csv')

In [28]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [29]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3908657 entries, 0 to 3908656
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
 3   tstamp   object 
dtypes: float64(1), int64(2), object(1)
memory usage: 119.3+ MB


In [30]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [31]:
ratings.head()

Unnamed: 0,userId,movieId,rating,tstamp
0,206,4803,4.0,2003-04-07 13:52:01
1,5073,72731,4.0,2020-02-19 16:07:53
2,4739,91653,4.0,2020-12-28 15:35:58
3,535,3005,3.0,2008-12-26 05:38:11
4,465,4776,3.0,2008-08-13 20:22:36


In [32]:
# Merge movie and rating data
movie_ratings = pd.merge(ratings, movies, on='movieId')

In [33]:
movie_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3756004 entries, 0 to 3756003
Data columns (total 6 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
 3   tstamp   object 
 4   title    object 
 5   genres   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 200.6+ MB


In [34]:
movie_ratings=movie_ratings.head(1000000)

In [35]:
# Create a user-movie rating matrix
user_movie_ratings = movie_ratings.pivot_table(index='userId', columns='title', values='rating')

In [36]:
# Fill missing values with 0
user_movie_ratings = user_movie_ratings.fillna(0)

In [37]:
# Transpose the matrix to get movie-user ratings
movie_user_ratings = user_movie_ratings.T

In [38]:
# Calculate similarity between movies using cosine similarity
movie_similarity = cosine_similarity(movie_user_ratings)

In [39]:
# Create a DataFrame from the similarity matrix
movie_similarity_df = pd.DataFrame(movie_similarity, index=user_movie_ratings.columns, columns=user_movie_ratings.columns)

# Function to recommend movies for a given movie title
def recommend_movies(movie_title, num_recommendations=5):
    similar_scores = movie_similarity_df[movie_title]
    similar_movies = list(similar_scores.index)
    similar_movies.remove(movie_title)
    top_similar_movies = similar_scores.sort_values(ascending=False).head(num_recommendations)
    return top_similar_movies

In [41]:
recommend_movies("Toy Story (1995)")

title
Toy Story (1995)           1.000000
Toy Story 2 (1999)         0.812507
Finding Nemo (2003)        0.782345
Monsters, Inc. (2001)      0.780289
Incredibles, The (2004)    0.768641
Name: Toy Story (1995), dtype: float64