In [2]:
import os
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, KNNBasic
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split

# Load movies.csv data
movies_df = pd.read_csv(r"C:\Users\karrt\Documents\MLProjects\Movie_recommendation\movies.csv")

# Load ratings.csv data
ratings_df = pd.read_csv(r"C:\Users\karrt\Documents\MLProjects\Movie_recommendation\ratings.csv")

In [3]:
# Remove rows where genre is "(no genres listed)"
movies_df = movies_df[movies_df['genres'] != "(no genres listed)"]

# Verify the removal by printing the shape of the cleaned DataFrame
print("Shape of cleaned DataFrame:", movies_df.shape)

Shape of cleaned DataFrame: (57361, 3)


In [4]:
# Ask for user ID
user_id = int(input("Enter your user ID: "))

# Filter ratings for the user and sort by timestamp in descending order
user_ratings = ratings_df[ratings_df['userId'] == user_id].sort_values(by='timestamp', ascending=False)

# Get the recent 10 movies watched by the user
recent_movies = user_ratings.head(10)

In [5]:
# Get the genres of the recent movies
recent_movie_ids = recent_movies['movieId'].tolist()
recent_movie_genres = movies_df[movies_df['movieId'].isin(recent_movie_ids)]['genres']
print(recent_movie_genres)

# # Count the occurrences of each genre in the recent movies
# genre_counts = ' '.join(recent_movie_genres).split('|')
# print(genre_counts)
# genre_counts = pd.Series(genre_counts).value_counts().head(5)
# print(genre_counts)
# Count the occurrences of each genre in the recent movies
genre_counts = recent_movie_genres.str.split('|').explode().value_counts().head(5)
print(genre_counts)

# # Get the top 5 most watched genres from the recent movies (without duplicates)
top_genres = genre_counts.index.unique().tolist()[:5]

seen = set()
top_genres_unique = [x for x in top_genres if not (x in seen or seen.add(x))]

211                                             Drama
287                                       Crime|Drama
522                                         Drama|War
536                            Action|Sci-Fi|Thriller
582                           Adventure|Drama|Western
972                                      Comedy|Drama
1164                                            Drama
1329                                            Drama
1363                                            Drama
1422    Crime|Drama|Fantasy|Film-Noir|Mystery|Romance
Name: genres, dtype: object
Drama     9
Crime     2
War       1
Action    1
Sci-Fi    1
Name: genres, dtype: int64


In [6]:
print(top_genres_unique)
print(recent_movie_genres)

['Drama', 'Crime', 'War', 'Action', 'Sci-Fi']
211                                             Drama
287                                       Crime|Drama
522                                         Drama|War
536                            Action|Sci-Fi|Thriller
582                           Adventure|Drama|Western
972                                      Comedy|Drama
1164                                            Drama
1329                                            Drama
1363                                            Drama
1422    Crime|Drama|Fantasy|Film-Noir|Mystery|Romance
Name: genres, dtype: object


In [7]:
# Filter movies by genres
genre_movies = movies_df[movies_df['genres'].apply(lambda x: any(genre in x for genre in top_genres))]

# Convert genres into a single string
genre_string = ' '.join(genre_movies['genres'])

In [8]:
print(len(genre_movies))

34220


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
print(tfidf)

# Fit and transform the genres column
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

TfidfVectorizer(stop_words='english')


In [10]:
# Recommend movies based on the closest genres
def recommend_movies_based_on_genres(closest_genres, df, tfidf_vectorizer, top_n=10):
    # Combine closest genres into a single string
    combined_genres = ' '.join(closest_genres)
    # Transform combined genres into TF-IDF vector
    input_tfidf = tfidf_vectorizer.transform([combined_genres])
    # Compute cosine similarity between combined genres and all movies
    cosine_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    print(cosine_scores)
    # Get indices of top n movies with highest cosine similarity
    top_indices = cosine_scores.argsort()[-top_n:][::-1]
    # Retrieve movie titles based on top indices
    recommended_movies = df.iloc[top_indices]['title']
    
    return recommended_movies


In [11]:
recommended_movies = recommend_movies_based_on_genres(top_genres, movies_df, tfidf)

[0.         0.         0.         ... 0.         0.13426906 0.29257331]


In [12]:
print("For user ID:",user_id,"recommended movies are:")
print(recommended_movies)







# # Use CountVectorizer to create a genre matrix
# vectorizer = CountVectorizer()
# genre_matrix = vectorizer.fit_transform([genre_string])

# # Compute cosine similarity between genre matrix and each movie
# cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

# # Get indices of top 10 movie recommendations based on cosine similarity
# top_indices = cosine_sim[0].argsort()[-11:-1][::-1]

# # Print top 10 movie recommendations
# print("\nTop 10 movie recommendations based on recent genres:")
# for idx in top_indices:
#     print("Movie ID:", genre_movies.iloc[idx]['movieId'], "| Title:", genre_movies.iloc[idx]['title'])


For user ID: 100 recommended movies are:
29750    Flight World War II (2015)
17842          Zone Troopers (1985)
44769           G.I. Samurai (1979)
2231                 Soldier (1998)
16125    Battle: Los Angeles (2011)
22553         Whip Hand, The (1951)
30121               Criminal (2016)
31447            Tokyo Tribe (2014)
61195         Men Must Fight (1933)
45950                The Fog (2010)
Name: title, dtype: object
