In [None]:
import pandas as pd
import numpy as np
import uuid
from scipy.spatial.distance import pdist, squareform

# File paths (relative, so they resolve against the notebook's working directory)
# The personality CSV is both read and rewritten by the user-data helpers in this notebook.
PERSONALITY_DATA_FILE = '../datasets/2018-personality-data.csv'
RATINGS_DATA_FILE = '../datasets/2018_ratings.csv'
MOVIES_DATA_FILE = '../datasets/movies.csv'

# Minimum number of ratings a movie needs to be kept by filter_movies_by_rating_count
MIN_RATINGS_COUNT = 5


In [None]:
# Data loading and user-data persistence functions

def load_data():
    """Read the three source CSV files into DataFrames.

    Returns:
        tuple: ``(personality_data, ratings_data, movies_data)``, each exactly
        as returned by ``pd.read_csv`` (no preprocessing is applied here).
    """
    csv_paths = (PERSONALITY_DATA_FILE, RATINGS_DATA_FILE, MOVIES_DATA_FILE)
    personality_data, ratings_data, movies_data = (pd.read_csv(path) for path in csv_paths)
    return personality_data, ratings_data, movies_data

def save_new_user_data(new_user_data):
    """Append a new user to the personality CSV and return the updated profiles.

    The file is rewritten with ALL of its original columns plus the new row.
    Previously the preprocessed (six-column, de-duplicated) frame was written
    back, which permanently discarded every other column of the dataset.

    Args:
        new_user_data (dict): Must contain ``'userid'``; the remaining keys are
            written as column values for the new row.

    Returns:
        pandas.DataFrame: The preprocessed personality data (relevant columns,
        one row per userid) with the new user appended.

    Raises:
        ValueError: If ``new_user_data['userid']`` is already present.
    """
    stored_data = pd.read_csv(PERSONALITY_DATA_FILE)
    # Normalise headers so the new row's keys line up with the file's columns.
    stored_data.columns = stored_data.columns.str.strip()
    personality_data = preprocess_personality_data(stored_data)

    # Ensure the new user ID is unique
    if new_user_data['userid'] in personality_data['userid'].values:
        raise ValueError(f"User ID {new_user_data['userid']} already exists.")

    new_user_df = pd.DataFrame([new_user_data])

    # Persist the complete table; columns the new user lacks are left empty.
    pd.concat([stored_data, new_user_df], ignore_index=True).to_csv(PERSONALITY_DATA_FILE, index=False)

    # Callers only need the trimmed profile table.
    return pd.concat([personality_data, new_user_df], ignore_index=True)

def update_user_data(updated_user_data):
    """Overwrite the stored values of an existing user in the personality CSV.

    Args:
        updated_user_data (dict): Must contain ``'userid'``. Every key is
            treated as a column name and every value is written to the row(s)
            whose ``userid`` matches.

    Notes:
        If no row matches the user id, the file is rewritten unchanged apart
        from the header normalisation below.
    """
    personality_data = pd.read_csv(PERSONALITY_DATA_FILE)
    # Strip header padding so the dict keys match the file's columns; labels
    # that differ only by whitespace would otherwise not line up.
    personality_data.columns = personality_data.columns.str.strip()

    user_id = updated_user_data['userid']
    # Materialise the dict views: pandas expects real sequences for the
    # column indexer and for the values being assigned.
    columns = list(updated_user_data.keys())
    values = list(updated_user_data.values())

    personality_data.loc[personality_data['userid'] == user_id, columns] = values
    personality_data.to_csv(PERSONALITY_DATA_FILE, index=False)

def get_user_personality_data(user_id):
    """Look up one user's row in the personality CSV.

    Args:
        user_id: Value compared against the ``userid`` column.

    Returns:
        dict | None: The first matching row as a column -> value dict, or
        ``None`` when no row has that user id.
    """
    stored = pd.read_csv(PERSONALITY_DATA_FILE)
    matches = stored.loc[stored['userid'] == user_id]
    if matches.empty:
        return None
    return matches.iloc[0].to_dict()


In [None]:
# Recommendation Engine

def preprocess_personality_data(data):
    """Reduce the personality table to one trait row per user.

    Column names are stripped of surrounding whitespace, only ``userid`` and
    the five trait columns are kept, and repeated ``userid`` rows are dropped
    (the first occurrence wins).

    The input frame is left untouched; the previous implementation renamed the
    caller's columns in place as a side effect.

    Args:
        data (pandas.DataFrame): Personality data with string column names.

    Returns:
        pandas.DataFrame: A new frame with the relevant columns only.

    Raises:
        KeyError: If any relevant column is missing after stripping.
    """
    relevant_columns = ['userid', 'openness', 'agreeableness', 'emotional_stability', 'conscientiousness', 'extraversion']
    # rename() returns a new frame, so the caller's headers are not modified.
    cleaned = data.rename(columns=lambda name: name.strip())
    return cleaned[relevant_columns].drop_duplicates(subset=['userid'])


def preprocess_ratings_data(data):
    """Collapse repeated ratings into one mean rating per (user, movie) pair.

    Column names are stripped of surrounding whitespace first. The user column
    is named ``'useri'`` throughout this notebook.

    The input frame is left untouched; the previous implementation renamed the
    caller's columns in place as a side effect.

    Args:
        data (pandas.DataFrame): Ratings with ``useri``, ``movie_id`` and
            ``rating`` columns (possibly whitespace-padded).

    Returns:
        pandas.DataFrame: Columns ``useri``, ``movie_id``, ``rating`` with one
        row per pair and ``rating`` holding the mean.
    """
    # rename() returns a new frame, so the caller's headers are not modified.
    cleaned = data.rename(columns=lambda name: name.strip())
    return cleaned.groupby(['useri', 'movie_id']).agg({'rating': 'mean'}).reset_index()

def create_user_profiles(data=None):
    """Index a personality table by ``userid``.

    Args:
        data (pandas.DataFrame, optional): Personality data containing a
            ``userid`` column. When omitted, the notebook-level
            ``personality_data`` variable is used, which is the only behaviour
            the function had before this parameter existed.

    Returns:
        pandas.DataFrame: The table with ``userid`` as its index.
    """
    # Prefer the explicit argument so the function does not rely on hidden
    # notebook state; fall back to the global for existing callers.
    source = personality_data if data is None else data
    return source.set_index('userid')

def calculate_similarity(user_profiles, k):
    """Find each user's ``k`` most similar users by profile correlation.

    ``pdist(..., metric='correlation')`` yields the correlation *distance*
    (1 - r), so it is converted back to a similarity before ranking. Ranking
    the raw distances with ``nlargest`` selected the least similar users, and
    dropping the first entry removed the farthest user, not the user itself.

    Args:
        user_profiles (pandas.DataFrame): One row per user, indexed by user
            id, with numeric trait columns.
        k (int): Number of neighbours to keep per user.

    Returns:
        pandas.DataFrame: Indexed by user id. Row ``u`` holds the similarity
        to each of ``u``'s top-k neighbours under that neighbour's column and
        NaN everywhere else. A user never appears in their own row.
    """
    distance_matrix = squareform(pdist(user_profiles, metric='correlation'))
    similarity_matrix = pd.DataFrame(
        1.0 - distance_matrix,
        index=user_profiles.index,
        columns=user_profiles.index,
    )

    def _top_k_neighbours(row):
        # Remove the user by label: relying on position is wrong whenever
        # another user ties with the self-similarity of 1.0.
        return row.drop(labels=row.name).nlargest(k)

    return similarity_matrix.apply(_top_k_neighbours, axis=1)

def find_unrated_movies(user_id, ratings_data, movies_data):
    """Return the rows of ``movies_data`` the given user has no rating for.

    Args:
        user_id: Value matched against ``ratings_data['useri']``.
        ratings_data (pandas.DataFrame): Needs ``useri`` and ``movie_id``.
        movies_data (pandas.DataFrame): Needs ``movieId``.

    Returns:
        pandas.DataFrame: Subset of ``movies_data`` whose ``movieId`` does not
        appear among the user's rated ``movie_id`` values.
    """
    is_users_rating = ratings_data['useri'] == user_id
    seen_ids = ratings_data.loc[is_users_rating, 'movie_id']
    already_rated = movies_data['movieId'].isin(seen_ids)
    return movies_data[~already_rated]

def predict_movie_ratings(user_id, unrated_movies, top_k_similarities, ratings_data, k):
    """Predict ratings for a user's unrated movies from their neighbours.

    For each candidate movie the prediction is the mean rating given by the
    user's neighbours; when no neighbour rated it, the mean over all users is
    used as a fallback. Movies nobody rated are omitted.

    Changes from the previous implementation:
      * All neighbours in the user's row are used. The row already excludes
        the user, so ``nlargest(k + 1).iloc[1:]`` threw away the top-ranked
        neighbour.
      * Neighbour ratings are averaged without weights. The old weights
        ``1..n`` followed row position in ``ratings_data``, which carries no
        information about how similar a neighbour is.
      * Means are computed once with ``groupby`` instead of re-filtering the
        whole ratings table for every movie.

    Args:
        user_id: Row label in ``top_k_similarities``.
        unrated_movies (pandas.DataFrame): Candidates; needs ``movieId``.
        top_k_similarities (pandas.DataFrame): Per-user neighbour scores with
            NaN for non-neighbours, as produced by ``calculate_similarity``.
        ratings_data (pandas.DataFrame): Needs ``useri``, ``movie_id`` and
            ``rating``.
        k (int): Maximum number of neighbours to use.

    Returns:
        list[tuple]: ``(movie_id, predicted_rating)`` pairs. Neighbour-based
        predictions come first (highest rating first), followed by fallback
        predictions (highest rating first).

    Raises:
        KeyError: If ``user_id`` has no row in ``top_k_similarities``.
    """
    neighbour_ids = top_k_similarities.loc[user_id].dropna().nlargest(k).index

    # Missing ratings cannot contribute to a mean.
    rated = ratings_data.dropna(subset=['rating'])
    neighbour_means = (
        rated[rated['useri'].isin(neighbour_ids)]
        .groupby('movie_id')['rating']
        .mean()
        .to_dict()
    )
    overall_means = rated.groupby('movie_id')['rating'].mean().to_dict()

    collaborative_predictions = []
    fallback_predictions = []
    # Walk the candidates in their given order so ties keep that order after
    # the stable sorts below.
    for movie_id in unrated_movies['movieId']:
        if movie_id in neighbour_means:
            collaborative_predictions.append((movie_id, neighbour_means[movie_id]))
        elif movie_id in overall_means:
            fallback_predictions.append((movie_id, overall_means[movie_id]))

    collaborative_predictions.sort(key=lambda prediction: prediction[1], reverse=True)
    fallback_predictions.sort(key=lambda prediction: prediction[1], reverse=True)

    return collaborative_predictions + fallback_predictions

def filter_movies_by_rating_count(ratings_data, movies_data):
    """Keep only movies that have at least ``MIN_RATINGS_COUNT`` rating rows.

    Args:
        ratings_data (pandas.DataFrame): Needs ``movie_id``.
        movies_data (pandas.DataFrame): Needs ``movieId``.

    Returns:
        pandas.DataFrame: Rows of ``movies_data`` whose ``movieId`` occurs in
        ``ratings_data['movie_id']`` at least ``MIN_RATINGS_COUNT`` times.
    """
    rows_per_movie = ratings_data['movie_id'].value_counts()
    popular_ids = rows_per_movie.index[rows_per_movie >= MIN_RATINGS_COUNT]
    keep = movies_data['movieId'].isin(popular_ids)
    return movies_data[keep]

def recommend_movies_for_new_user(new_user_id, new_user_data, updated_personality_data, ratings_data, movies_data, k, top_n=10):
    """Recommend the ``top_n`` best-predicted unrated movies for a user.

    Changes from the previous implementation:
      * The order returned by ``predict_movie_ratings`` (neighbour-based
        predictions first, global-mean fallbacks after) is kept. Re-sorting
        the combined list by rating let fallback movies displace
        neighbour-based ones.
      * The unused ``similar_users`` local was removed.
      * Titles are looked up through one index instead of scanning the movie
        table once per recommendation.

    Args:
        new_user_id: The user's id; must be present in
            ``updated_personality_data['userid']``.
        new_user_data: Not read by this function; kept so existing callers
            continue to work.
        updated_personality_data (pandas.DataFrame): Preprocessed personality
            data that already includes the user.
        ratings_data (pandas.DataFrame): Preprocessed ratings.
        movies_data (pandas.DataFrame): Needs ``movieId`` and ``title``.
        k (int): Number of similar users to consider.
        top_n (int): Maximum number of recommendations to return.

    Returns:
        list[tuple]: ``(movie_id, title, predicted_rating)`` triples, at most
        ``top_n`` of them.

    Raises:
        KeyError: If the user has no profile row.
    """
    # Use the updated personality data directly to create user profiles
    user_profiles = updated_personality_data.set_index('userid')

    # Only movies with enough ratings are candidates
    filtered_movies_data = filter_movies_by_rating_count(ratings_data, movies_data)
    unrated_movies = find_unrated_movies(new_user_id, ratings_data, filtered_movies_data)

    top_k_similarities = calculate_similarity(user_profiles, k)
    predicted_ratings = predict_movie_ratings(new_user_id, unrated_movies, top_k_similarities, ratings_data, k)

    # Already ordered: neighbour-based predictions, then fallbacks.
    top_predictions = predicted_ratings[:top_n]

    # First title per movieId, matching the previous per-movie `.iloc[0]` lookup.
    titles = filtered_movies_data.drop_duplicates(subset='movieId').set_index('movieId')['title']

    return [
        (int(movie_id), titles.at[movie_id], rating)
        for movie_id, rating in top_predictions
    ]

def recommend_movies_for_old_user(user_id, personality_data, ratings_data, movies_data, k, top_n=10):
    """Recommend movies for a user already present in ``personality_data``.

    Args:
        user_id: Value looked up in ``personality_data['userid']``.
        personality_data (pandas.DataFrame): Preprocessed personality data.
        ratings_data (pandas.DataFrame): Preprocessed ratings.
        movies_data (pandas.DataFrame): Movie metadata.
        k (int): Number of similar users to consider.
        top_n (int): Maximum number of recommendations.

    Returns:
        list[tuple] | None: The result of ``recommend_movies_for_new_user``
        for this user, or ``None`` when the user id is unknown.
    """
    known_ids = personality_data['userid'].values
    if user_id not in known_ids:
        return None

    profile_row = personality_data[personality_data['userid'] == user_id].iloc[0]
    # Existing users are scored against the personality data as-is.
    return recommend_movies_for_new_user(
        user_id, profile_row.to_dict(), personality_data, ratings_data, movies_data, k, top_n
    )

def get_top_genres_from_movies(recommended_movies, movies_data, num_genres):
    """Rank genres by how often they occur among the recommended movies.

    Args:
        recommended_movies (list[tuple]): ``(movie_id, title, rating)``
            triples.
        movies_data (pandas.DataFrame): Needs ``movieId`` and a ``genres``
            column of ``'|'``-separated genre names.
        num_genres (int): Number of genres to return.

    Returns:
        list[str]: Up to ``num_genres`` genre names, most frequent first.
    """
    wanted_ids = [movie_id for movie_id, _title, _rating in recommended_movies]
    is_recommended = movies_data['movieId'].isin(wanted_ids)
    genre_strings = movies_data.loc[is_recommended, 'genres']

    # One row per (movie, genre) so every genre is counted individually.
    per_genre = genre_strings.str.split('|').explode()
    frequencies = per_genre.value_counts()
    return frequencies.index[:num_genres].tolist()

def recommend_genres_for_old_user(user_id, personality_data, ratings_data, movies_data, k, num_genres, top_n=None):
    """Recommend the most frequent genres among a user's recommended movies.

    Args:
        user_id: Value looked up in ``personality_data['userid']``.
        personality_data (pandas.DataFrame): Preprocessed personality data.
        ratings_data (pandas.DataFrame): Preprocessed ratings.
        movies_data (pandas.DataFrame): Needs ``movieId`` and ``genres``.
        k (int): Number of similar users to consider.
        num_genres (int): Number of genres to return.
        top_n (int, optional): How many recommended movies to draw genres
            from. Defaults to ``num_genres``, which is what this function
            always used before the parameter existed; pass a larger value to
            base the genre ranking on more movies.

    Returns:
        list[str] | None: Up to ``num_genres`` genre names, or ``None`` when
        the user is unknown or no movies could be recommended.
    """
    # The movie sample size used to be tied to the genre count; keep that as
    # the default but let callers decouple the two.
    movie_count = num_genres if top_n is None else top_n

    recommended_movies = recommend_movies_for_old_user(user_id, personality_data, ratings_data, movies_data, k, movie_count)
    if not recommended_movies:
        return None
    return get_top_genres_from_movies(recommended_movies, movies_data, num_genres)


In [None]:
# Functions to recommend movies and genres for all users with personality trait scores
def recommend_movies_with_personality_scores_for_all_users(personality_data, ratings_data, movies_data, k, top_n):
    """Build a table of every user's traits plus their recommended titles.

    Args:
        personality_data (pandas.DataFrame): Preprocessed personality data
            (``userid`` plus the five trait columns).
        ratings_data (pandas.DataFrame): Preprocessed ratings.
        movies_data (pandas.DataFrame): Movie metadata.
        k (int): Number of similar users to consider.
        top_n (int): Number of movies to recommend per user.

    Returns:
        pandas.DataFrame: One row per user that received recommendations,
        with ``userid``, the five trait scores, and ``top_movies`` (titles
        joined by ``', '``). Users with no recommendations are skipped.
    """
    trait_columns = ('openness', 'agreeableness', 'emotional_stability', 'conscientiousness', 'extraversion')
    records = []
    for _, profile in personality_data.iterrows():
        current_id = profile['userid']
        suggestions = recommend_movies_for_old_user(current_id, personality_data, ratings_data, movies_data, k, top_n)
        if not suggestions:
            continue
        record = {'userid': current_id}
        record.update((trait, profile[trait]) for trait in trait_columns)
        record['top_movies'] = ', '.join(title for _, title, _ in suggestions)
        records.append(record)
    return pd.DataFrame(records)

In [None]:
def recommend_genres_with_personality_scores_for_all_users(personality_data, ratings_data, movies_data, k, num_genres):
    """Build a table of every user's traits plus their recommended genres.

    Args:
        personality_data (pandas.DataFrame): Preprocessed personality data
            (``userid`` plus the five trait columns).
        ratings_data (pandas.DataFrame): Preprocessed ratings.
        movies_data (pandas.DataFrame): Movie metadata.
        k (int): Number of similar users to consider.
        num_genres (int): Number of genres to recommend per user.

    Returns:
        pandas.DataFrame: One row per user that received genres, with
        ``userid``, the five trait scores, and ``top_genres`` (genre names
        joined by ``', '``). Users with no genres are skipped.
    """
    trait_columns = ('openness', 'agreeableness', 'emotional_stability', 'conscientiousness', 'extraversion')
    records = []
    for _, profile in personality_data.iterrows():
        current_id = profile['userid']
        favourite_genres = recommend_genres_for_old_user(current_id, personality_data, ratings_data, movies_data, k, num_genres)
        if not favourite_genres:
            continue
        record = {'userid': current_id}
        record.update((trait, profile[trait]) for trait in trait_columns)
        record['top_genres'] = ', '.join(favourite_genres)
        records.append(record)
    return pd.DataFrame(records)

In [None]:
# Load the three CSV files, then preprocess the personality and ratings tables
personality_data, ratings_data, movies_data = load_data()
# Keeps userid plus the five trait columns, one row per userid
personality_data = preprocess_personality_data(personality_data)
# Averages repeated (useri, movie_id) ratings into a single row
ratings_data = preprocess_ratings_data(ratings_data)

# Parameters shared by the recommendation cells below
top_n = 10  # Number of top movie recommendations per user
num_genres = 5  # Number of top genres per user
k = 50  # Number of similar users to consider

In [None]:
# Generate movie recommendations for all users
# NOTE: recommendations are computed one user at a time, and each per-user call
# recomputes the user-to-user similarity matrix, so this cell can be slow.
movie_recommendations_df = recommend_movies_with_personality_scores_for_all_users(personality_data, ratings_data, movies_data, k, top_n)

# Save to CSV (written to the notebook's working directory, without the index)
movie_recommendations_df.to_csv('personality_based_users_movie_recommendation_dataset.csv', index=False)

In [None]:
# Generate genre recommendations for all users
# NOTE: this repeats the per-user movie recommendation work of the previous cell
# (with num_genres movies per user) before counting genres, so it can be slow too.
genre_recommendations_df = recommend_genres_with_personality_scores_for_all_users(personality_data, ratings_data, movies_data, k, num_genres)

# Save to CSV (written to the notebook's working directory, without the index)
genre_recommendations_df.to_csv('personality_based_users_genre_recommendation_dataset.csv', index=False)