In [1]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load movies.csv data
movies_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\movies.csv")

# Load ratings.csv data
ratings_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\ratings.csv")


In [12]:
# Ask for user ID
user_id = int(input("Enter your user ID: "))

# Filter ratings for the user
user_ratings = ratings_df[ratings_df['userId'] == user_id]

# Sort ratings by rating value (descending) and timestamp (descending)
user_ratings_sorted = user_ratings.sort_values(by=['rating', 'timestamp'], ascending=[False, False])

In [13]:
print(user_ratings_sorted)

       userId  movieId  rating  timestamp
13280     100     1193     5.0  862169731
13293     100     1466     5.0  862169186
13285     100     1358     5.0  862169132
13283     100     1354     5.0  862169101
13261     100      714     5.0  862169065
...       ...      ...     ...        ...
13266     100      778     2.0  862168580
13235     100        5     2.0  862168434
13277     100     1060     1.0  862169132
13275     100      880     1.0  862168751
13267     100      780     1.0  862168373

[61 rows x 4 columns]


In [14]:
# Select top 10 highest rated movies by the user
top_rated_movies = []
for rating_value in sorted(user_ratings['rating'].unique(), reverse=True):
    movies_with_rating = user_ratings_sorted[user_ratings_sorted['rating'] == rating_value]
    top_rated_movies.extend(movies_with_rating.head(10 - len(top_rated_movies)).index)

# Extract genres of selected movies
selected_genres = movies_df.loc[top_rated_movies, 'genres']

In [15]:
print(selected_genres)

13280                         Western
13293             Action|Comedy|Crime
13285    Action|Adventure|Sci-Fi|IMAX
13283            Crime|Drama|Thriller
13261                          Comedy
13282                     Documentary
13268                    Comedy|Drama
13247                           Drama
13276                    Comedy|Drama
13249                Action|Drama|War
Name: genres, dtype: object


In [16]:

# Count occurrences of each genre
genre_counts = selected_genres.str.split('|').explode().value_counts().head(5)
print(genre_counts)

# Get the top 5 most watched genres
top_genres = genre_counts.index.tolist()
print(top_genres)

Drama      5
Comedy     4
Action     3
Crime      2
Western    1
Name: genres, dtype: int64
['Drama', 'Comedy', 'Action', 'Crime', 'Western']


In [19]:

# Filter movies by selected genres
genre_movies = movies_df[movies_df['genres'].apply(lambda x: any(genre in x for genre in top_genres))]

# Convert genres into a single string
genre_string = ' '.join(genre_movies['genres'])


In [20]:
print(genre_string)

Adventure|Animation|Children|Comedy|Fantasy Comedy|Romance Comedy|Drama|Romance Comedy Action|Crime|Thriller Comedy|Romance Action Action|Adventure|Thriller Comedy|Drama|Romance Comedy|Horror Drama Action|Adventure|Romance Crime|Drama Drama|Romance Comedy Comedy Action|Comedy|Crime|Drama|Thriller Comedy|Crime|Thriller Crime|Drama|Horror|Mystery|Thriller Action|Crime|Thriller Drama|Sci-Fi Drama|Romance Drama Children|Drama Drama|Romance Adventure|Drama|Fantasy|Mystery|Sci-Fi Crime|Drama Drama Children|Drama Drama|Romance Crime|Drama Children|Comedy Comedy|Romance Drama Drama|War Action|Crime|Drama Drama Action|Adventure|Fantasy Comedy|Drama|Thriller Drama|Romance Animation|Children|Drama|Musical|Romance Drama|Romance Crime|Mystery|Thriller Action|Drama|Thriller Comedy|Drama|Romance Adventure|Drama Children|Comedy Drama Adventure|Children|Comedy|Fantasy Drama Comedy|Drama|Romance Drama|Mystery Drama|Thriller Drama Comedy|Crime Comedy|Romance Comedy Action|Sci-Fi|Thriller Drama Comedy|Rom

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
print(tfidf)

# Fit and transform the genres column
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])
print(tfidf_matrix)

TfidfVectorizer(stop_words='english')
  (0, 8)	0.4967483702845257
  (0, 4)	0.27771718920269134
  (0, 3)	0.48808437174545455
  (0, 2)	0.48833048769293214
  (0, 1)	0.44656600888161224
  (1, 8)	0.6004535115193032
  (1, 3)	0.5899807477262311
  (1, 1)	0.5397946811673262
  (2, 18)	0.8011493881971549
  (2, 4)	0.5984644164788115
  (3, 7)	0.44022013245613556
  (3, 18)	0.7193439273049612
  (3, 4)	0.5373551425545094
  (4, 4)	1.0
  (5, 20)	0.5370772735955626
  (5, 5)	0.6249107985241872
  (5, 0)	0.5665990611314318
  (6, 18)	0.8011493881971549
  (6, 4)	0.5984644164788115
  (7, 3)	0.7377898038839786
  (7, 1)	0.6750305217431583
  (8, 0)	1.0
  (9, 20)	0.5220827968697404
  (9, 0)	0.5507803757155867
  (9, 1)	0.6512069801063765
  :	:
  (62408, 8)	0.771028790321875
  (62409, 22)	1.0
  (62410, 7)	1.0
  (62411, 4)	1.0
  (62412, 6)	0.6432580604344854
  (62412, 2)	0.7656494417721884
  (62413, 7)	1.0
  (62414, 12)	0.6946781180206175
  (62414, 5)	0.7193207298162155
  (62415, 14)	0.7071067811865475
  (62415, 11)	

In [22]:
# Recommend movies based on the closest genres
def recommend_movies_based_on_genres(closest_genres, df, tfidf_vectorizer, top_n=10):
    # Combine closest genres into a single string
    combined_genres = ' '.join(closest_genres)
    # Transform combined genres into TF-IDF vector
    input_tfidf = tfidf_vectorizer.transform([combined_genres])
    # Compute cosine similarity between combined genres and all movies
    cosine_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    print(cosine_scores)
    # Get indices of top n movies with highest cosine similarity
    top_indices = cosine_scores.argsort()[-top_n:][::-1]
    # Retrieve movie titles based on top indices
    recommended_movies = df.iloc[top_indices]['title']
    
    return recommended_movies

In [23]:
recommended_movies = recommend_movies_based_on_genres(top_genres, movies_df, tfidf)

[0.08742898 0.         0.18840437 ... 0.40696759 0.         0.35117956]


In [26]:
print("For user ID:",user_id,", recommended movies are:")
print(recommended_movies)


For user ID: 100 , recommended movies are:
10594                             Bandidas (2006)
52664    A Minute To Pray, A Second To Die (1968)
11811                         3:10 to Yuma (2007)
42133                       Devil's Canyon (1953)
10888                       Mad Dog Morgan (1976)
24807                         Outrage, The (1964)
31006                     The Female Bunch (1971)
7274                             Ned Kelly (1970)
51328                          Boss Nigger (1975)
42289         The Legend of Nigger Charley (1972)
Name: title, dtype: object
