In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout, concatenate

# Load movie data from the specified file
file_path = '/content/drive/MyDrive/final project sample data/10000 Movies Data'
minfo = pd.read_csv(file_path)
movie = minfo[['Movie_id', 'title', 'Genres', 'popularity', 'vote_average', 'vote_count']]

# Sample user data
user_data = {
    'id': [1, 2, 3],
    'happy_movie': [['Drama', 'Comedy'], ['Romance', 'Family'], ['Adventure']],
    'sad_movie': [['Drama', 'Thriller'], ['Crime'], ['Horror', 'Science Fiction']],
    'neutral_movie': [['Comedy'], ['Action', 'Adventure'], ['Crime']],
    'fav_movie': ['The Godfather', "Schindler's List", 'The Shawshank Redemption']
}
user = pd.DataFrame(user_data)

# Sample watched data
watched_data = {
    'userid': [1, 2, 3, 1, 2],
    'movieid': [238, 278, 240, 19404, 424],
    'rating_given': [8.0, 5.5, 9.5, 7.0, 6.5],
    'watching_mood': ['Happy', 'Sad', 'Neutral', 'Happy', 'Neutral']
}
watched = pd.DataFrame(watched_data)

# Normalize movie features
scaler = MinMaxScaler()
movie[['popularity', 'vote_average', 'vote_count']] = scaler.fit_transform(movie[['popularity', 'vote_average', 'vote_count']])

# Encode genres
def extract_genres(genres_str):
    genres = [g.strip() for g in genres_str.strip("[]").replace("'", "").split(",")]
    return genres

movie['Genres'] = movie['Genres'].apply(extract_genres)

mlb = MultiLabelBinarizer()
movie_genres = mlb.fit_transform(movie['Genres'])
movie_genres_df = pd.DataFrame(movie_genres, columns=mlb.classes_, index=movie['Movie_id'])

# Encode user and movie IDs
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user['encoded_id'] = user_encoder.fit_transform(user['id'])
movie['encoded_id'] = movie_encoder.fit_transform(movie['Movie_id'])
watched['encoded_userid'] = user_encoder.transform(watched['userid'])
watched['encoded_movieid'] = movie_encoder.transform(watched['movieid'])

# Merge watched data with user and movie data
merged_df = pd.merge(watched, user, left_on='encoded_userid', right_on='encoded_id', how='left')
merged_df = pd.merge(merged_df, movie, left_on='encoded_movieid', right_on='encoded_id', how='left')

# Prepare inputs and outputs for the DNN
user_ids = merged_df['encoded_userid'].values.reshape(-1, 1)
movie_ids = merged_df['encoded_movieid'].values.reshape(-1, 1)

# Assuming the above adjustments have been made
# Ensure all encoded_movieid values exist in movie_genres_df's index
if not set(merged_df['encoded_movieid']).issubset(set(movie_genres_df.index)):
    missing_ids = set(merged_df['encoded_movieid']) - set(movie_genres_df.index)
    print(f"Missing IDs: {missing_ids}")
    # Handle missing IDs as needed, e.g., filtering out rows with missing IDs or adjusting the indexing

# Assuming no missing IDs or handling them as needed
genres_input = np.stack(movie_genres_df.reindex(merged_df['encoded_movieid'], fill_value=0).values)



# Define the DNN model
def build_model(num_users, num_movies, num_genres):
    user_input = Input(shape=(1,), name='user_input')
    user_embedding = Embedding(num_users, 8, name='user_embedding')(user_input)
    user_vec = Flatten(name='user_flatten')(user_embedding)

    movie_input = Input(shape=(1,), name='movie_input')
    movie_embedding = Embedding(num_movies, 8, name='movie_embedding')(movie_input)
    movie_vec = Flatten(name='movie_flatten')(movie_embedding)

    genres_input = Input(shape=(num_genres,), name='genres_input')
    concat = concatenate([user_vec, movie_vec, genres_input], axis=-1)

    dense = Dense(128, activation='relu')(concat)
    dropout = Dropout(0.5)(dense)
    output = Dense(1, activation='linear')(dropout)

    model = Model(inputs=[user_input, movie_input, genres_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Instantiate and train the model
num_users = user['encoded_id'].nunique()
num_movies = movie['encoded_id'].nunique()
num_genres = len(mlb.classes_)

model = build_model(num_users, num_movies, num_genres)

history = model.fit([user_ids, movie_ids, genres_input], merged_df['rating_given'].values, epochs=10, batch_size=32)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie[['popularity', 'vote_average', 'vote_count']] = scaler.fit_transform(movie[['popularity', 'vote_average', 'vote_count']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['Genres'] = movie['Genres'].apply(extract_genres)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['encoded_id'] =

Missing IDs: {160, 190}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
-------------------------------------------------------------------start from here-------------------

In [None]:
# Extract preferred genres for User 1 when in a happy mood
user_1_happy_genres = user.loc[user['id'] == 1, 'happy_movie'].iloc[0]
print("User 1 prefers these genres when happy:", user_1_happy_genres)


User 1 prefers these genres when happy: ['Drama', 'Comedy']


In [None]:
# Example adjustment for genre matching (simplified for demonstration)
# Assuming movie['Genres'] is a list of genre names
candidate_movies = movie[
    ~movie['Movie_id'].isin(user_1_watched_movies) &
    movie['Genres'].apply(lambda x: bool(set(x) & set(user_1_happy_genres)))
]

print(f"Candidate movies for recommendation: {len(candidate_movies)}")

Candidate movies for recommendation: 0


In [None]:
# Find movies User 1 has watched
user_1_watched_movies = watched[watched['userid'] == 1]['movieid'].unique()

# Filter movies not watched by User 1 and matching the preferred genres
candidate_movies = movie[~movie['Movie_id'].isin(user_1_watched_movies) &
                         movie['Genres'].apply(lambda x: any(genre in x for genre in user_1_happy_genres))]




Candidate movies for recommendation: 0


In [None]:
# Prepare input for prediction
user_1_encoded = user[user['id'] == 1]['encoded_id'].iloc[0]
candidate_movies_encoded_ids = candidate_movies['encoded_id'].values
user_input = np.array([user_1_encoded] * len(candidate_movies_encoded_ids)).reshape(-1, 1)
movie_input = candidate_movies_encoded_ids.reshape(-1, 1)
genres_input = np.stack(movie_genres_df.loc[candidate_movies_encoded_ids].values)

# Predict ratings
predicted_ratings = model.predict([user_input, movie_input, genres_input]).flatten()

# Add predicted ratings to candidate movies
candidate_movies.loc[:, 'predicted_rating'] = predicted_ratings


ValueError: need at least one array to stack