In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from itertools import chain

In [None]:
# Mount Google Drive at /content/drive; the movie CSV loaded further down is
# read from a path under this mount point. Requires the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# TODO(ashish, harshit): replace these hard-coded values with real user input.
pref_era='latest'  # compared against movie1['Release_Era'] to filter the catalogue
userid=1  # id of the user to recommend for
curr_mood='Happy'  # passed as current_mood to get_top_movie_ids

In [None]:
# TODO(ashish, harshit): fetch user_data and watched_data from the database;
# the movie catalogue is read from a CSV file.

# Sample user data: one row per user, with a list of preferred genres for each
# mood (happy / sad / neutral) and a favourite movie title.
user_data = {
    'id': [1, 2, 3],
    'happy_movie': [['Drama', 'Comedy'], ['Romance', 'Family'], ['Adventure']],
    'sad_movie': [['Drama', 'Thriller'], ['Crime'], ['Horror', 'Science Fiction']],
    'neutral_movie': [['Comedy'], ['Action', 'Adventure'], ['Crime']],
    'fav_movie': ['The Godfather', "Schindler's List", 'The Shawshank Redemption']
}
user = pd.DataFrame(user_data)


# Load movie data from the specified file; the first CSV column becomes the index.
file_path = '/content/drive/MyDrive/final project sample data/tmdb2024.csv'
movie1 = pd.read_csv(file_path, index_col=0)
# NOTE(review): later cells read the columns 'Release_Era', 'id', 'title',
# 'Genres', 'popularity' and 'vote_average' from this frame - confirm the CSV
# provides all of them.



# Sample watched data: one row per (user, movie) viewing with the rating the
# user gave and the mood they were in while watching.
watched_data = {
    'userid': [1, 2, 3, 1, 2],
    'movieid': [157336, 155, 673629, 673629, 19995],
    'rating_given': [8.0, 5.5, 9.5, 7.0, 6.5],
    'watching_mood': ['Happy', 'Sad', 'Neutral', 'Happy', 'Neutral']
}
watched = pd.DataFrame(watched_data)

In [None]:
# Keep only the movies from the user's preferred release era. The explicit
# .copy() gives `movie` its own data, so assigning to its columns afterwards
# does not operate on a slice of `movie1` (avoids SettingWithCopyWarning).
movie = movie1[movie1['Release_Era'] == pref_era].copy()

In [None]:
def _parse_genre_list(raw):
    """Return one movie's genres as a plain Python list.

    Strings are parsed as Python literals (the CSV stores the list as text),
    values that are already list-like are passed through so the cell can be
    re-run safely, and anything else (e.g. a missing value) becomes an empty
    list.
    """
    if isinstance(raw, str):
        return list(ast.literal_eval(raw))
    if isinstance(raw, (list, tuple, set)):
        return list(raw)
    return []


# Rebind `movie` to a new frame instead of assigning into a filtered slice.
movie = movie.assign(Genres=movie['Genres'].apply(_parse_genre_list))

In [None]:
# Work on an independent copy so the rename / re-indexing applied to
# `movie_genres` in the following cells leaves `movie` untouched.
movie_genres = movie.copy()

In [None]:
# Expose the catalogue's `id` column under the name `Movie_id`, which the
# merge and indexing cells below rely on.
movie_genres = movie_genres.rename(columns={'id': 'Movie_id'})

In [None]:
# Attach the movie details to every watch record (inner join on the movie id).
watched_movies = watched.merge(
    movie_genres,
    left_on='movieid',
    right_on='Movie_id',
)

In [None]:
def expand_user_genres(user_df, mood_column):
    """Flatten one mood's genre lists into a long (id, genre, mood) table.

    Args:
        user_df: frame with an 'id' column and a list-valued `mood_column`.
        mood_column: column name such as 'happy_movie'; the text before the
            first underscore, capitalised, becomes the value of 'mood'.

    Returns:
        A new DataFrame with columns 'id', 'genre' and 'mood', holding one row
        per (user, genre) pair. The source row labels are kept as the index.
    """
    mood_label = mood_column.split('_')[0].capitalize()
    return (
        user_df[['id', mood_column]]
        .explode(mood_column)
        .assign(mood=mood_label)
        .rename(columns={mood_column: 'genre'})
    )

In [None]:
# Build one long (id, genre, mood) table per mood column, then stack the three
# tables into `all_genres` with a fresh 0..n-1 index.
happy_genres, sad_genres, neutral_genres = (
    expand_user_genres(user, mood_col)
    for mood_col in ('happy_movie', 'sad_movie', 'neutral_movie')
)
all_genres = pd.concat(
    [happy_genres, sad_genres, neutral_genres],
    ignore_index=True,
)

In [None]:
# One-hot encode every movie's genre list: one column per genre, one row per
# movie, labelled by Movie_id. This cell reads 'Movie_id' as a column, so it
# must run before the cell that moves 'Movie_id' into the index.
# NOTE(review): recommend_movies_with_weighted_genres fits its own binarizer;
# within the cells visible here these three names are not read again.
mlb = MultiLabelBinarizer()
movie_genres_encoded = mlb.fit_transform(movie_genres['Genres'])
movie_genres_df = pd.DataFrame(movie_genres_encoded, columns=mlb.classes_, index=movie_genres['Movie_id'])

In [None]:
# Index the catalogue by movie id so rows can be looked up by id. The guard
# makes the cell safe to re-run: once 'Movie_id' is the index it is no longer
# a column, and calling set_index again would raise KeyError.
if movie_genres.index.name != 'Movie_id':
    movie_genres.set_index('Movie_id', inplace=True)

In [None]:
def recommend_movies_with_weighted_genres(user_id, current_mood, top_n=5):
    """Rank unwatched movies for a user's current mood and return their titles.

    Reads the notebook-level frames ``all_genres`` (id / genre / mood),
    ``watched_movies`` (watch records joined with movie details) and
    ``movie_genres`` (catalogue indexed by movie id).

    Each movie's score is the sum of:
      * the cosine similarity between the user's preferred genres for this
        mood and the movie's genres, multiplied by the summed weights of the
        preferred genres the movie has (a genre's weight is the user's average
        rating for it in this mood, divided by the best such average);
      * min-max normalised ``popularity``;
      * min-max normalised ``vote_average``.

    Args:
        user_id: value matched against ``all_genres['id']`` and
            ``watched_movies['userid']``.
        current_mood: mood name; surrounding whitespace and letter case are
            ignored ('happy' and 'Happy' are equivalent).
        top_n: maximum number of titles to return.

    Returns:
        Up to ``top_n`` titles of movies the user has not watched, best first.
        An empty list when ``top_n`` is not positive, the catalogue is empty,
        or every movie has already been watched.
    """
    # A non-positive top_n must not fall through to a `[-0:]`-style slice,
    # which would select every movie.
    if top_n <= 0 or movie_genres.empty:
        return []

    # Moods in all_genres are stored capitalised; normalise the input likewise.
    current_mood = str(current_mood).strip().capitalize()

    pref_mask = (all_genres['id'] == user_id) & (all_genres['mood'] == current_mood)
    # dropna: a user with an empty genre list explodes to a NaN genre.
    user_pref_genres = all_genres.loc[pref_mask, 'genre'].dropna().unique()

    history_mask = (
        (watched_movies['userid'] == user_id)
        & (watched_movies['watching_mood'].str.capitalize() == current_mood)
    )
    mood_watched_movies = watched_movies[history_mask]

    # Average rating the user gave, in this mood, to movies of each preferred
    # genre. watched_movies already carries the joined 'Genres' column, so no
    # further merge with the catalogue is needed.
    genre_ratings = {}
    if not mood_watched_movies.empty:
        for genre in user_pref_genres:
            has_genre = mood_watched_movies['Genres'].apply(
                lambda genres, g=genre: g in genres
            ).astype(bool)
            avg_rating = mood_watched_movies.loc[has_genre, 'rating_given'].mean()
            genre_ratings[genre] = 0.0 if pd.isna(avg_rating) else float(avg_rating)

    best_rating = max(genre_ratings.values(), default=0)
    if best_rating > 0:
        genre_weights = {g: rating / best_rating for g, rating in genre_ratings.items()}
    else:
        # No usable history for this mood (or every preferred genre averages
        # zero): weight all preferred genres equally instead of dividing by 0.
        genre_weights = {genre: 1.0 for genre in user_pref_genres}

    mlb = MultiLabelBinarizer()
    movie_genres_encoded = mlb.fit_transform(movie_genres['Genres'])
    # Preferred genres absent from the catalogue contribute nothing; dropping
    # them up front keeps the binarizer from warning about unknown classes.
    known_genres = set(mlb.classes_)
    genre_prefs_encoded = mlb.transform(
        [[g for g in user_pref_genres if g in known_genres]]
    )
    similarity_scores = cosine_similarity(genre_prefs_encoded, movie_genres_encoded)[0]

    # Per movie: similarity * sum of the weights of the preferred genres it has.
    class_weights = np.array(
        [genre_weights.get(genre, 0.0) for genre in mlb.classes_], dtype=float
    )
    weighted_similarity_scores = similarity_scores * (movie_genres_encoded @ class_weights)

    # MinMaxScaler scales each column independently. Missing values stay NaN
    # after scaling and would sort to the top of the ranking, so count them
    # as 0 instead.
    quality = MinMaxScaler().fit_transform(movie_genres[['popularity', 'vote_average']])
    quality = np.nan_to_num(quality, nan=0.0)
    final_scores = weighted_similarity_scores + quality[:, 0] + quality[:, 1]

    watched_movie_ids = watched_movies.loc[
        watched_movies['userid'] == user_id, 'movieid'
    ].unique()
    unwatched_positions = np.flatnonzero(~movie_genres.index.isin(watched_movie_ids))
    if unwatched_positions.size == 0:
        return []

    # Highest score first, then keep at most top_n.
    ranked = np.argsort(final_scores[unwatched_positions])[::-1][:top_n]
    # Positional lookup: one title per selected row even if ids repeat.
    return movie_genres['title'].iloc[unwatched_positions[ranked]].tolist()

In [None]:
def get_top_movie_ids(user_id, current_mood, top_n=100, final_n=5):
    """Return the ids of the most popular movies among the top recommendations.

    Takes the ``top_n`` titles from ``recommend_movies_with_weighted_genres``,
    looks them up in the notebook-level ``movie_genres`` frame (indexed by
    movie id), orders them by ``popularity`` (highest first) and keeps the
    first ``final_n``.

    Args:
        user_id: user to recommend for.
        current_mood: the user's current mood.
        top_n: size of the candidate pool requested from the recommender.
        final_n: number of ids to return.

    Returns:
        A list of movie ids without repeats, most popular first. Empty when
        the recommender returns nothing.
    """
    recommended_titles = recommend_movies_with_weighted_genres(user_id, current_mood, top_n)
    if not recommended_titles:
        # pd.concat / sorting on nothing is pointless; report "no results".
        return []

    # One vectorised membership test instead of scanning the catalogue once
    # per title.
    # NOTE(review): the lookup is by title, so distinct movies sharing a
    # recommended title are all picked up as candidates.
    candidates = movie_genres[movie_genres['title'].isin(recommended_titles)]
    # Each movie id may appear only once in the result.
    candidates = candidates[~candidates.index.duplicated(keep='first')]

    by_popularity = candidates.sort_values(by='popularity', ascending=False)
    return by_popularity.head(final_n).index.tolist()

In [None]:
# Run the recommender for the configured user and mood (default pool of 100
# candidates, top 5 by popularity) and print the resulting movie ids.
top_movie_ids = get_top_movie_ids(userid, curr_mood)
print(top_movie_ids)

[565770, 980489, 937278, 626332, 615777]
