In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, concatenate
import ast

In [None]:
# Load movie data from the specified file
file_path = '/content/drive/MyDrive/final project sample data/10000 Movies Data'
minfo = pd.read_csv(file_path)
movie = minfo[['Movie_id', 'title', 'Genres', 'popularity', 'vote_average', 'vote_count']]

In [None]:
# Sample user data
user_data = {
    'id': [1, 2, 3],
    'happy_movie': [['Drama', 'Comedy'], ['Romance', 'Family'], ['Adventure']],
    'sad_movie': [['Drama', 'Thriller'], ['Crime'], ['Horror', 'Science Fiction']],
    'neutral_movie': [['Comedy'], ['Action', 'Adventure'], ['Crime']],
    'fav_movie': ['The Godfather', "Schindler's List", 'The Shawshank Redemption']
}
user = pd.DataFrame(user_data)

In [None]:
# Sample watched data
watched_data = {
    'userid': [1, 2, 3, 1, 2],
    'movieid': [238, 278, 240, 19404, 122],
    'rating_given': [8.0, 5.5, 9.5, 7.0, 6.5],
    'watching_mood': ['Happy', 'Sad', 'Neutral', 'Happy', 'Neutral']
}
watched = pd.DataFrame(watched_data)

In [None]:
# Normalize movie features
scaler = MinMaxScaler()
movie[['popularity', 'vote_average', 'vote_count']] = scaler.fit_transform(movie[['popularity', 'vote_average', 'vote_count']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie[['popularity', 'vote_average', 'vote_count']] = scaler.fit_transform(movie[['popularity', 'vote_average', 'vote_count']])


In [None]:
movie['Genres']

0       [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
1       [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
2       [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...
3       [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4       [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...
                              ...                        
9975    [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
9976                       [{'id': 27, 'name': 'Horror'}]
9977    [{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...
9978                       [{'id': 28, 'name': 'Action'}]
9979                       [{'id': 27, 'name': 'Horror'}]
Name: Genres, Length: 9980, dtype: object

In [None]:
def extract_genres(genres_str):
    # Convert the string representation of list of dictionaries into actual list of dictionaries
    genres_list = ast.literal_eval(genres_str)
    # Extract the 'name' value from each dictionary
    genres = [genre['name'] for genre in genres_list]
    return genres

In [None]:
movie['Genres'] = movie['Genres'].apply(extract_genres)

ValueError: malformed node or string: ['Drama', 'Crime']

In [None]:
movie['Genres']

0                   [Drama, Crime]
1                   [Drama, Crime]
2                   [Drama, Crime]
3         [Comedy, Drama, Romance]
4            [Drama, History, War]
                   ...            
9975     [Action, Crime, Thriller]
9976                      [Horror]
9977    [Comedy, Family, TV Movie]
9978                      [Action]
9979                      [Horror]
Name: Genres, Length: 9980, dtype: object

In [None]:
mlb = MultiLabelBinarizer()
movie_genres = mlb.fit_transform(movie['Genres'])
movie_genres_df = pd.DataFrame(movie_genres, columns=mlb.classes_, index=movie['Movie_id'])


In [None]:
# Encode user and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user['encoded_id'] = user_encoder.fit_transform(user['id'])
movie['encoded_id'] = movie_encoder.fit_transform(movie['Movie_id'])
watched['encoded_userid'] = user_encoder.transform(watched['userid'])
watched['encoded_movieid'] = movie_encoder.transform(watched['movieid'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['encoded_id'] = movie_encoder.fit_transform(movie['Movie_id'])


In [None]:
# Merge watched data with user and movie data
merged_df = pd.merge(watched, user, left_on='encoded_userid', right_on='encoded_id', how='left')
merged_df = pd.merge(merged_df, movie, left_on='encoded_movieid', right_on='encoded_id', how='left')

# Prepare inputs and outputs for the DNN
user_ids = merged_df['encoded_userid'].values.reshape(-1, 1)
movie_ids = merged_df['encoded_movieid'].values.reshape(-1, 1)

In [None]:
if not set(merged_df['encoded_movieid']).issubset(set(movie_genres_df.index)):
    missing_ids = set(merged_df['encoded_movieid']) - set(movie_genres_df.index)
    print(f"Missing IDs: {missing_ids}")
    # Handle missing IDs as needed, e.g., filtering out rows with missing IDs or adjusting the indexing



Missing IDs: {160, 190}


In [None]:
genres_input = movie_genres_df.to_numpy()


In [None]:
genres_input


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:

# Define the DNN model
def build_model(num_users, num_movies, num_genres):
    user_input = Input(shape=(1,), name='user_input')
    user_embedding = Embedding(num_users, 8, name='user_embedding')(user_input)
    user_vec = Flatten(name='user_flatten')(user_embedding)

    movie_input = Input(shape=(1,), name='movie_input')
    movie_embedding = Embedding(num_movies, 8, name='movie_embedding')(movie_input)
    movie_vec = Flatten(name='movie_flatten')(movie_embedding)

    genres_input = Input(shape=(num_genres,), name='genres_input')
    concat = concatenate([user_vec, movie_vec, genres_input], axis=-1)

    dense = Dense(128, activation='relu')(concat)
    dropout = Dropout(0.5)(dense)
    output = Dense(1, activation='linear')(dropout)

    model = Model(inputs=[user_input, movie_input, genres_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Instantiate and train the model
num_users = user['encoded_id'].nunique()
num_movies = movie['encoded_id'].nunique()
num_genres = len(mlb.classes_)

model = build_model(num_users, num_movies, num_genres)

selected_genres_input = genres_input[movie_ids.flatten()]

history = model.fit([user_ids, movie_ids, selected_genres_input], merged_df['rating_given'].values, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:


# Assuming user_id and current_mood are given
user_id = 2  # Example user ID, dynamically set
current_mood = 'happy'  # User's current mood, dynamically set

# Function to retrieve preferred genres for a given user_id and mood
def get_preferred_genres(user_id, current_mood, user_data):
    mood_to_column = {
        'happy': 'happy_movie',
        'sad': 'sad_movie',
        'neutral': 'neutral_movie'
    }
    mood_column = mood_to_column[current_mood]
    preferred_genres = user_data.loc[user_data['id'] == user_id, mood_column].values[0]
    return preferred_genres

# Retrieve preferred genres for the current user and mood
preferred_genres = get_preferred_genres(user_id, current_mood, user)
print(f"User {user_id}'s preferred genres when {current_mood}: {preferred_genres}")



User 2's preferred genres when happy: ['Romance', 'Family']


In [None]:

# Encode user ID and watched movie IDs
encoded_user_id = user_encoder.transform([user_id])[0]
watched_movie_ids = watched[watched['userid'] == user_id]['movieid'].tolist()
encoded_watched_movie_ids = movie_encoder.transform(watched_movie_ids)

In [None]:
# Identify all possible encoded movie IDs and filter out watched ones to find unwatched movie IDs
unwatched_movie_ids = np.setdiff1d(movie['encoded_id'].values, encoded_watched_movie_ids)

# Intersection with movie_genres_df index to ensure valid access
valid_unwatched_movie_ids = np.intersect1d(unwatched_movie_ids, movie_genres_df.index)

# Filter unwatched movies by preferred genres
unwatched_and_preferred = movie_genres_df.loc[valid_unwatched_movie_ids]
unwatched_and_preferred_indices = movie_genres_df.columns.intersection(preferred_genres)
unwatched_and_preferred = unwatched_and_preferred[(unwatched_and_preferred[unwatched_and_preferred_indices].sum(axis=1) > 0)].index.values

# Prepare model inputs
user_ids_input = np.array([encoded_user_id] * len(unwatched_and_preferred))
movie_ids_input = unwatched_and_preferred
genres_input_filtered = genres_input[movie_ids_input]

In [None]:
# Predict ratings using the model
predicted_ratings = model.predict([user_ids_input, movie_ids_input, genres_input_filtered])

# Pair each predicted rating with its corresponding movie ID
predicted_ratings_with_ids = list(zip(movie_ids_input, predicted_ratings.flatten()))

# Filter out movies that do not match the preferred genres AFTER prediction
# This ensures recommendations are aligned with the user's genre preferences
filtered_predictions_with_ids = [pair for pair in predicted_ratings_with_ids if movie_genres_df.loc[pair[0], preferred_genres].any()]

# Sort the filtered movies based on predicted ratings to find top recommendations
filtered_predictions_with_ids.sort(key=lambda x: x[1], reverse=True)  # Sort by rating in descending order
top_recommendations = filtered_predictions_with_ids[:5]  # Adjust the number of recommendations as needed

# Extract the movie IDs from the top recommendations
top_movie_ids = [pair[0] for pair in top_recommendations]

# Decode the recommended movie IDs to their original IDs for presentation
recommended_movie_ids = movie_encoder.inverse_transform(top_movie_ids)

# Fetch recommended movie titles and genres
recommended_movies_info = movie[movie['Movie_id'].isin(recommended_movie_ids)][['title', 'Genres']]
print("Top recommended movies for user when happy, after filtering by genre:")
print(recommended_movies_info.to_string(index=False))


Top recommended movies for user when happy, after filtering by genre:
                  title                             Genres
                  Given [Animation, Drama, Music, Romance]
          Me Before You                   [Drama, Romance]
                  Moxie             [Comedy, Drama, Music]
              The Cured   [Horror, Drama, Science Fiction]
Approaching the Unknown [Thriller, Drama, Science Fiction]


In [None]:


# # Identify all possible encoded movie IDs and filter out watched ones to find unwatched movie IDs
# unwatched_movie_ids = np.setdiff1d(movie['encoded_id'].values, encoded_watched_movie_ids)

# # Intersection with movie_genres_df index to ensure valid access
# valid_unwatched_movie_ids = np.intersect1d(unwatched_movie_ids, movie_genres_df.index)

# # Filter unwatched movies by preferred genres
# unwatched_and_preferred = movie_genres_df.loc[valid_unwatched_movie_ids]
# unwatched_and_preferred_indices = movie_genres_df.columns.intersection(preferred_genres)
# unwatched_and_preferred = unwatched_and_preferred[(unwatched_and_preferred[unwatched_and_preferred_indices].sum(axis=1) > 0)].index.values

# # Prepare model inputs
# user_ids_input = np.array([encoded_user_id] * len(unwatched_and_preferred))
# movie_ids_input = unwatched_and_preferred
# genres_input_filtered = genres_input[movie_ids_input]

# # Predict ratings
# predicted_ratings = model.predict([user_ids_input, movie_ids_input, genres_input_filtered])

# # Sort movies based on predicted ratings
# top_indices = np.argsort(-predicted_ratings.flatten())[:5]  # Adjust the number of recommendations as needed
# top_movie_ids = movie_ids_input[top_indices]

# # Decode recommended movie IDs to original IDs
# recommended_movie_ids = movie_encoder.inverse_transform(top_movie_ids)

# # Fetch recommended movie titles and genres
# recommended_movies_info = movie[movie['Movie_id'].isin(recommended_movie_ids)][['title', 'Genres']]
# print("Top recommended movies for user when happy:")
# print(recommended_movies_info.to_string(index=False))

In [1]:
# Extract the year from 'Release_Date'
df['Release_Year'] = pd.to_datetime(df['release_date']).dt.year


#Creating 'Release_Era' based on 'Release_Year'
def categorize_year(year):
    if year >= 2023:
        return 'latest'
    elif 2010 <= year < 2023:
        return 'mid'
    else:
        return 'old'

df['Release_Era'] = df['Release_Year'].apply(categorize_year)

NameError: name 'pd' is not defined