In [2]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load movies.csv data
movies_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\movies.csv")

# Load ratings.csv data
ratings_df = pd.read_csv(r"C:\Users\karrt\Documents\ProjectGRO\Diff_proj\ratings.csv")

In [25]:
# Remove rows where genre is "(no genres listed)"
movies_df = movies_df[movies_df['genres'] != "(no genres listed)"]

# Verify the removal by printing the shape of the cleaned DataFrame
print("Shape of cleaned DataFrame:", movies_df.shape)

Shape of cleaned DataFrame: (57361, 3)


In [3]:
# Ask for user ID
user_id = int(input("Enter your user ID: "))

# Filter ratings for the user and sort by timestamp in descending order
user_ratings = ratings_df[ratings_df['userId'] == user_id].sort_values(by='timestamp', ascending=False)

# Get the recent 10 movies watched by the user
recent_movies = user_ratings.head(10)

In [21]:
# Get the genres of the recent movies
recent_movie_ids = recent_movies['movieId'].tolist()
recent_movie_genres = movies_df[movies_df['movieId'].isin(recent_movie_ids)]['genres']
print(recent_movie_genres)

# # Count the occurrences of each genre in the recent movies
# genre_counts = ' '.join(recent_movie_genres).split('|')
# print(genre_counts)
# genre_counts = pd.Series(genre_counts).value_counts().head(5)
# print(genre_counts)
# Count the occurrences of each genre in the recent movies
genre_counts = recent_movie_genres.str.split('|').explode().value_counts().head(5)
print(genre_counts)

# # Get the top 5 most watched genres from the recent movies (without duplicates)
top_genres = genre_counts.index.unique().tolist()[:5]

seen = set()
top_genres_unique = [x for x in top_genres if not (x in seen or seen.add(x))]

211                                             Drama
287                                       Crime|Drama
522                                         Drama|War
536                            Action|Sci-Fi|Thriller
582                           Adventure|Drama|Western
972                                      Comedy|Drama
1164                                            Drama
1329                                            Drama
1363                                            Drama
1422    Crime|Drama|Fantasy|Film-Noir|Mystery|Romance
Name: genres, dtype: object
Drama     9
Crime     2
War       1
Action    1
Sci-Fi    1
Name: genres, dtype: int64


In [22]:
print(top_genres_unique)
print(recent_movie_genres)

['Drama', 'Crime', 'War', 'Action', 'Sci-Fi']
211                                             Drama
287                                       Crime|Drama
522                                         Drama|War
536                            Action|Sci-Fi|Thriller
582                           Adventure|Drama|Western
972                                      Comedy|Drama
1164                                            Drama
1329                                            Drama
1363                                            Drama
1422    Crime|Drama|Fantasy|Film-Noir|Mystery|Romance
Name: genres, dtype: object


In [23]:
# Filter movies by genres
genre_movies = movies_df[movies_df['genres'].apply(lambda x: any(genre in x for genre in top_genres))]

# Convert genres into a single string
genre_string = ' '.join(genre_movies['genres'])

In [24]:
print(len(genre_movies))
print(genre_string)

34220
Comedy|Drama|Romance Action|Crime|Thriller Action Action|Adventure|Thriller Comedy|Drama|Romance Drama Action|Adventure|Romance Crime|Drama Drama|Romance Action|Comedy|Crime|Drama|Thriller Comedy|Crime|Thriller Crime|Drama|Horror|Mystery|Thriller Action|Crime|Thriller Drama|Sci-Fi Drama|Romance Drama Children|Drama Drama|Romance Adventure|Drama|Fantasy|Mystery|Sci-Fi Crime|Drama Drama Mystery|Sci-Fi|Thriller Children|Drama Drama|Romance Crime|Drama Drama Drama|War Action|Crime|Drama Drama Action|Adventure|Fantasy Comedy|Drama|Thriller Drama|Romance Animation|Children|Drama|Musical|Romance Drama|Romance Crime|Mystery|Thriller Action|Drama|Thriller Comedy|Drama|Romance Adventure|Drama Drama Drama Comedy|Drama|Romance Drama|Mystery Drama|Thriller Drama Comedy|Crime Action|Sci-Fi|Thriller Drama Action|Comedy|Horror|Thriller Action Comedy|Drama Drama|War Drama|Romance Comedy|Drama Action|Sci-Fi|Thriller Action|Crime|Drama|Thriller Drama|Thriller Children|Drama Crime|Drama|Romance Come

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
print(tfidf)

# Fit and transform the genres column
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])
print(tfidf_matrix)

TfidfVectorizer(stop_words='english')
  (0, 8)	0.4976968060348268
  (0, 4)	0.27365872132978053
  (0, 3)	0.4888347543899718
  (0, 2)	0.4890864963803857
  (0, 1)	0.4463673100281161
  (1, 8)	0.6009422978573427
  (1, 3)	0.5902418440577317
  (1, 1)	0.5389646743240835
  (2, 16)	0.8038846390656949
  (2, 4)	0.5947852444994054
  (3, 7)	0.4350568647922227
  (3, 16)	0.7238201237000819
  (3, 4)	0.5355464059481306
  (4, 4)	1.0
  (5, 18)	0.5359538126070666
  (5, 5)	0.6261673282681712
  (5, 0)	0.5662755404937154
  (6, 16)	0.8038846390656949
  (6, 4)	0.5947852444994054
  (7, 3)	0.7384554608732243
  (7, 1)	0.6743022558960589
  (8, 0)	1.0
  (9, 18)	0.5205601448645955
  (9, 0)	0.5500109719506804
  (9, 1)	0.6530735535239652
  :	:
  (57345, 6)	1.0
  (57346, 6)	1.0
  (57347, 6)	1.0
  (57348, 6)	0.6351272445974119
  (57348, 8)	0.7724075240247854
  (57349, 20)	1.0
  (57350, 7)	1.0
  (57351, 4)	1.0
  (57352, 6)	0.6417275449405034
  (57352, 2)	0.7669326946118769
  (57353, 7)	1.0
  (57354, 11)	0.6943586936948832

In [32]:
# Recommend movies based on the closest genres
def recommend_movies_based_on_genres(closest_genres, df, tfidf_vectorizer, top_n=10):
    # Combine closest genres into a single string
    combined_genres = ' '.join(closest_genres)
    # Transform combined genres into TF-IDF vector
    input_tfidf = tfidf_vectorizer.transform([combined_genres])
    # Compute cosine similarity between combined genres and all movies
    cosine_scores = cosine_similarity(input_tfidf, tfidf_matrix).flatten()
    print(cosine_scores)
    # Get indices of top n movies with highest cosine similarity
    top_indices = cosine_scores.argsort()[-top_n:][::-1]
    # Retrieve movie titles based on top indices
    recommended_movies = df.iloc[top_indices]['title']
    
    return recommended_movies


In [33]:
recommended_movies = recommend_movies_based_on_genres(top_genres, movies_df, tfidf)

[0.         0.         0.         ... 0.         0.13426906 0.29257331]


In [34]:
print("Recommended Movies:")
print(recommended_movies)







# # Use CountVectorizer to create a genre matrix
# vectorizer = CountVectorizer()
# genre_matrix = vectorizer.fit_transform([genre_string])

# # Compute cosine similarity between genre matrix and each movie
# cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

# # Get indices of top 10 movie recommendations based on cosine similarity
# top_indices = cosine_sim[0].argsort()[-11:-1][::-1]

# # Print top 10 movie recommendations
# print("\nTop 10 movie recommendations based on recent genres:")
# for idx in top_indices:
#     print("Movie ID:", genre_movies.iloc[idx]['movieId'], "| Title:", genre_movies.iloc[idx]['title'])


Recommended Movies:
29750    Flight World War II (2015)
17842          Zone Troopers (1985)
44769           G.I. Samurai (1979)
2231                 Soldier (1998)
16125    Battle: Los Angeles (2011)
22553         Whip Hand, The (1951)
30121               Criminal (2016)
31447            Tokyo Tribe (2014)
61195         Men Must Fight (1933)
45950                The Fog (2010)
Name: title, dtype: object
