In [1]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load movies.csv data
movies_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\movies.csv")

# Load ratings.csv data
ratings_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\ratings.csv")


In [2]:
# Ask for user ID
user_id = int(input("Enter your user ID: "))

# Filter ratings for the user
user_ratings = ratings_df[ratings_df['userId'] == user_id]

# Sort ratings by rating value (descending) and timestamp (descending)
user_ratings_sorted = user_ratings.sort_values(by=['rating', 'timestamp'], ascending=[False, False])

In [3]:
print(user_ratings_sorted.head())

       userId  movieId  rating  timestamp
13280     100     1193     5.0  862169731
13293     100     1466     5.0  862169186
13285     100     1358     5.0  862169132
13283     100     1354     5.0  862169101
13261     100      714     5.0  862169065


In [4]:
# Select top 10 highest rated movies by the user
top_rated_movies = []
for rating_value in sorted(user_ratings['rating'].unique(), reverse=True):
    movies_with_rating = user_ratings_sorted[user_ratings_sorted['rating'] == rating_value]
    top_rated_movies.extend(movies_with_rating.head(10 - len(top_rated_movies)).index)

# Extract genres of selected movies
selected_genres = movies_df.loc[top_rated_movies, 'genres']

In [5]:
print(selected_genres)

13280                         Western
13293             Action|Comedy|Crime
13285    Action|Adventure|Sci-Fi|IMAX
13283            Crime|Drama|Thriller
13261                          Comedy
13282                     Documentary
13268                    Comedy|Drama
13247                           Drama
13276                    Comedy|Drama
13249                Action|Drama|War
Name: genres, dtype: object


In [6]:

# Count occurrences of each genre
genre_counts = selected_genres.str.split('|').explode().value_counts().head(5)
print(genre_counts)

# Get the top 5 most watched genres
top_genres = genre_counts.index.tolist()
print(top_genres)

Drama      5
Comedy     4
Action     3
Crime      2
Western    1
Name: genres, dtype: int64
['Drama', 'Comedy', 'Action', 'Crime', 'Western']


In [7]:

# Filter movies by selected genres
genre_movies = movies_df[movies_df['genres'].apply(lambda x: any(genre in x for genre in top_genres))]

# Convert genres into a single string
genre_string = ' '.join(genre_movies['genres'])


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
print(tfidf)

# Fit and transform the genres column
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

TfidfVectorizer(stop_words='english')


In [9]:
# Recommend movies based on the closest genres
def recommend_movies_based_on_genres(closest_genres, df, tfidf_vectorizer, top_n=10):
    # Combine closest genres into a single string
    combined_genres = ' '.join(closest_genres)
    # Transform combined genres into TF-IDF vector
    input_tfidf = tfidf_vectorizer.transform([combined_genres])
    # Compute cosine similarity between combined genres and all movies
    cosine_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    print(cosine_scores)
    # Get indices of top n movies with highest cosine similarity
    top_indices = cosine_scores.argsort()[-top_n:][::-1]
    # Retrieve movie titles based on top indices
    recommended_movies = df.iloc[top_indices]['title']
    
    return recommended_movies

In [10]:
recommended_movies = recommend_movies_based_on_genres(top_genres, movies_df, tfidf)

[0.08742898 0.         0.18840437 ... 0.40696759 0.         0.35117956]


In [11]:
print("For user ID:",user_id,", recommended movies are:")
print(recommended_movies)


For user ID: 100 , recommended movies are:
10594                             Bandidas (2006)
52664    A Minute To Pray, A Second To Die (1968)
11811                         3:10 to Yuma (2007)
42133                       Devil's Canyon (1953)
10888                       Mad Dog Morgan (1976)
24807                         Outrage, The (1964)
31006                     The Female Bunch (1971)
7274                             Ned Kelly (1970)
51328                          Boss Nigger (1975)
42289         The Legend of Nigger Charley (1972)
Name: title, dtype: object
