In [None]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import random

#reproducibility: seed the stdlib RNG so that "Restart Kernel -> Run All" regenerates the same synthetic data
SEED = 42
random.seed(SEED)

#set number of movies/items here
N_MOVIES = 1000

#generate movies/content items with varied genres
genres = ["action", "thriller", "crime", "sci-fi", "comedy", "animation", "drama", "documentary", "educational", "news", "sports"]
#every movie gets a 1-based id and a uniformly drawn genre
movies = [{"id": i+1, "genre": random.choice(genres)} for i in range(N_MOVIES)]

#example personas based on genre preferences - you can think of different profiling angles as well
#weight of a persona's favourite genre vs. every other genre
FAVOURITE_GENRE_WEIGHT = 0.9
OTHER_GENRE_WEIGHT = 0.01

#single-genre personas: persona name -> the one genre that persona strongly prefers
favourite_genre_by_persona = {
    "documentary nerd": "documentary",
    "news watcher": "news",
    "sci-fi binge-watcher": "sci-fi",
    "action movie lover": "action",
    "sports fan": "sports",
}

#persona name -> {genre: weight}; built from the mapping above instead of one hand-copied dict per persona
personas = {
    persona: {
        genre: FAVOURITE_GENRE_WEIGHT if genre == favourite else OTHER_GENRE_WEIGHT
        for genre in genres
    }
    for persona, favourite in favourite_genre_by_persona.items()
}
#the "varied" persona weighs every genre equally
personas["varied content consumer"] = {genre: 1/len(genres) for genre in genres}

languages = ["EN", "NL", "DE", "FR", "JAP", "KOR", "IT", "ARAB"] #mock movie language option

#randomly decide which languages each movie is offered in
def assign_movie_languages(movies, languages):
    """
    Attach an "available_languages" list to every movie dict, in place.

    For each movie a random number (between 1 and len(languages)-1) of entries
    is sampled from `languages`. "EN" is always placed first - not a must, just
    an example! - and any sampled "EN" is dropped so it is not listed twice.

    Returns the same `movies` list that was passed in.
    """
    for entry in movies:
        #draw the sample size first, then the sample itself
        sample_size = random.randint(1, len(languages)-1)
        sampled = random.sample(languages, sample_size)
        non_english = [code for code in sampled if code != "EN"]
        entry["available_languages"] = ["EN"] + non_english
    return movies

#attach language options to the catalogue; assign_movie_languages returns the list it was given,
#so movies_with_languages refers to the same (now updated) list object as movies
movies_with_languages = assign_movie_languages(movies, languages)

#weighted movie pick for a persona - makes sure that personas primarily watch what we would expect them to watch
def select_movie_for_persona(persona):
    """
    Draw one movie from `movies_with_languages` for the given persona.

    Every movie whose genre appears in the persona's preference table is a
    candidate; its weight in the draw is the persona's weight for that genre.
    Raises KeyError if `persona` is not a key of `personas`.
    """
    genre_weights = personas[persona]

    candidates = []
    candidate_weights = []
    for item in movies_with_languages:
        item_genre = item['genre']
        if item_genre in genre_weights:
            candidates.append(item)
            candidate_weights.append(genre_weights[item_genre])

    (pick,) = random.choices(candidates, weights=candidate_weights, k=1)
    return pick

def random_date(start, end):
    """
    Return `start` shifted forward by a random whole number of seconds,
    drawn from 0 up to and including the length of the `start`..`end` window.
    Adjust this to generate more evening/weekend times as needed.
    """
    window_seconds = int((end - start).total_seconds())
    offset_seconds = random.randint(0, window_seconds)
    return start + timedelta(seconds=offset_seconds)

def generate_user_watching_habit():
    """
    Draw one watching habit label at random.
    The labels and their relative weights live in the table below;
    adjust the proportions there to fit the distribution requirements.
    """
    habit_weights = {
        "evening/weekend": 0.5,
        "night": 0.2,
        "morning": 0.15,
        "afternoon": 0.15,
    }
    labels = list(habit_weights)
    (habit,) = random.choices(labels, weights=list(habit_weights.values()), k=1)
    return habit

def generate_watching_datetime(user_habit):
    """
    Generate a "%Y-%m-%d %H:%M:%S" timestamp string based on the user's watching habit.
    Adjust the time ranges as needed for more accurate representations.

    Time-of-day windows (both ends inclusive):
        "evening/weekend": 18:00 - 22:00 (no separate all-day weekend handling is implemented)
        "night":           22:00 - 05:00
        "morning":         06:00 - 10:00
        anything else:     12:00 - 18:00 (treated as afternoon)

    The calendar date is drawn uniformly from the two years starting at
    2021-01-01, i.e. 2021-01-01 through 2022-12-31.
    """
    base_date = datetime(2021, 1, 1)
    if user_habit == "evening/weekend":
        #setting evenings as period from 18:00 to 22:00
        chosen_time = random_date(datetime(2021, 1, 1, 18), datetime(2021, 1, 1, 22))
    elif user_habit == "night":
        #window crosses midnight; only hour/minute/second are kept below, so a
        #time after midnight lands on the same calendar date that is drawn next
        chosen_time = random_date(datetime(2021, 1, 1, 22), datetime(2021, 1, 2, 5))
    elif user_habit == "morning":
        chosen_time = random_date(datetime(2021, 1, 1, 6), datetime(2021, 1, 1, 10))
    else:  #afternoon
        chosen_time = random_date(datetime(2021, 1, 1, 12), datetime(2021, 1, 1, 18))

    #add a random number of days to shift the date within the two-year range
    n_days_in_range = 2 * 365  #2021 and 2022, neither is a leap year
    #randint is inclusive on both ends, so the largest valid offset is n_days_in_range - 1;
    #an offset of 730 would produce 2023-01-01, outside the range
    days_to_add = random.randint(0, n_days_in_range - 1)
    final_datetime = (base_date + timedelta(days=days_to_add)).replace(hour=chosen_time.hour, minute=chosen_time.minute, second=chosen_time.second)

    return final_datetime.strftime("%Y-%m-%d %H:%M:%S")

def generate_complex_user_data_with_percentage_distribution(n_users=1000, persona_distribution_percents=None): #set number of users here
    """
    Generate a synthetic user/movie interaction log as a pandas DataFrame.

    Each persona receives its percentage share of `n_users`. Every user gets one
    watching habit and between 1 and 10 interaction rows.

    Parameters:
        n_users: total number of users the persona percentages are applied to.
        persona_distribution_percents: mapping persona name -> percentage of
            `n_users`. When None, a built-in split summing to 100 is used.
            Each name is passed on to `select_movie_for_persona`.

    Returns:
        DataFrame with one row per interaction and the columns user_id,
        movie_id, genre, persona, rating, shared_with_other_users,
        added_to_playlist, watched_completely, started_to_watch,
        language_selected, Device, "Date & Time of Watching".
        Per-persona user counts are rounded down, so the total number of
        users can be smaller than `n_users`.
    """
    if persona_distribution_percents is None:
        persona_distribution_percents = {
            "documentary nerd": 5,
            "news watcher": 15,
            "sci-fi binge-watcher": 10,
            "action movie lover": 10,
            "sports fan": 10,
            "varied content consumer": 50,
        }

    user_data = []

    devices = ["smartphone", "gaming console", "TV", "web browser"]
    device_weights = [0.05, 0.05, 0.85, 0.05]  #weights adjusted to favor TV

    for persona, percent in persona_distribution_percents.items():
        #multiply before dividing: int((percent / 100.0) * n_users) loses a user to
        #float noise (29% of 100 users evaluates to 28.999... and truncates to 28)
        n_persona_users = int(percent * n_users // 100)

        for user_index in range(n_persona_users):
            user_id = f"user_{persona}_{user_index+1}"
            user_habit = generate_user_watching_habit()  #determine user's watching habit

            num_interactions = random.randint(1, 10)  #randomize the number of interactions (i.e. how many movies/items a user interacted with)
            for _ in range(num_interactions):
                movie = select_movie_for_persona(persona)
                #first draw decides completion; a draw above 0.5 also implies the user started.
                #otherwise a second draw decides whether playback was started at all
                watched_completely_chance = random.random()
                started_to_watch = "yes" if watched_completely_chance > 0.5 or random.random() > 0.3 else "no"

                available_languages = movie["available_languages"]
                #English is strongly favoured over any other offered language
                language_weights = [0.8 if lang == "EN" else 0.025 for lang in available_languages]

                user_interaction = {
                    "user_id": user_id,
                    "movie_id": movie["id"],
                    "genre": movie["genre"],
                    "persona": persona,
                    #roughly 30% of interactions carry no rating
                    "rating": np.nan if random.random() > 0.7 else random.randint(1, 5),
                    "shared_with_other_users": "yes" if random.random() > 0.9 else "no",
                    "added_to_playlist": "yes" if random.random() > 0.8 else "no",
                    "watched_completely": "yes" if started_to_watch == "yes" and watched_completely_chance > 0.5 else "no",
                    "started_to_watch": started_to_watch,
                    "language_selected": random.choices(available_languages, weights=language_weights, k=1)[0],
                    "Device": random.choices(devices, weights=device_weights, k=1)[0],
                    #a timestamp only exists when playback was started
                    "Date & Time of Watching": generate_watching_datetime(user_habit) if started_to_watch == "yes" else None,
                }
                user_data.append(user_interaction)

    return pd.DataFrame(user_data)

#generate and display the synthetic user data
#called with its defaults: n_users=1000 and the built-in persona percentage split
user_data = generate_complex_user_data_with_percentage_distribution() 
#last expression of the cell - shows the first rows of the generated frame
user_data.head()

In [None]:
# Keep only the interactions in which playback was actually started
is_started = user_data['started_to_watch'].eq('yes')
started_watching = user_data.loc[is_started]

# One row per (persona, genre) pair, with the number of started interactions in 'counts'
genre_counts_per_persona = (
    started_watching
    .groupby(['persona', 'genre'])
    .size()
    .reset_index(name='counts')
)

# For every persona, find the row label with the highest count and pull those rows out
top_row_labels = genre_counts_per_persona.groupby('persona')['counts'].idxmax()
most_watched_genre_per_persona = genre_counts_per_persona.loc[top_row_labels]

print(most_watched_genre_per_persona)
