In [1]:
from random import randrange, choices, seed, randint
import pandas as pd
import time
import os

In [2]:
pd.set_option('display.max_columns', None)
current_path = os.path.normpath(os.getcwd() + os.sep + os.pardir)
content = pd.read_csv( current_path + "/scrapping/Final_NPO_Data.csv")

# Format to content name
content.loc[content["Category"] == "Documentaires" ,  "Category"] = "Documentary"
content.loc[content["Category"] == "Films",  "Category"] = "Movie"
content.loc[content["Category"] == "Programmas",  "Category"] = "Tv"

# Renaming column
content = content.rename({"Category":"Type"}, axis = 1)

# Drop titles tha are repeated
content = content.drop_duplicates(subset='Name', keep="first")
content.reset_index(inplace = True, drop = True)

# Create item id
content.insert(0, 'item_id', content.index)

# Save clean data to csv
content.to_csv(current_path + "/content_clean_new.csv",index = False)

display(content)

Unnamed: 0,item_id,Name,Description,Type,URL,Small_Image,Large_Image,Tags
0,0,Wallace & Gromit,Engelse animatiefilm. Wallace en Gromit runnen...,Movie,https://www.npostart.nl/wallace-gromit/21-12-2...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Animatie', 'Komedie']"
1,1,Sissi,Op 16-jarige leeftijd ontmoet Sissi de keizer ...,Movie,https://www.npostart.nl/sissi/25-12-2008/POW_0...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Drama', 'Romantiek']"
2,2,"Sissi, die junge Kaiserin",Sissi heeft het onbezorgde leventje moeten inr...,Movie,https://www.npostart.nl/sissi-die-junge-kaiser...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Drama', 'Historisch']"
3,3,Charlie & Lola specials,Specials over de avonturen van Charlie en Lola.,Movie,https://www.npostart.nl/charlie-lola-specials/...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Misdaad', 'Drama']"
4,4,"Sissi, Schicksalsjahre einer Kaiserin",Franz Joseph en Sissi zijn noodgedwongen regel...,Movie,https://www.npostart.nl/sissi-schicksalsjahre-...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Drama', 'Historisch']"
...,...,...,...,...,...,...,...,...
1492,1492,Trom,Journalist Hannis Martinsson krijgt een myster...,Tv,https://www.npostart.nl/trom/POW_05198986,https://images.npo.nl/tile/320x180/Trom_cdn_ti...,https://images.npo.nl/tile/320x180/Trom_cdn_ti...,"['Drama', 'Misdaad']"
1493,1493,BinnensteBuiten,BinnensteBuiten trakteert je dagelijks op een ...,Tv,https://www.npostart.nl/binnenstebuiten/KN_167...,https://images.npo.nl/tile/320x180/Binnenstebu...,https://images.npo.nl/tile/320x180/Binnenstebu...,"['Animatie', 'Komedie']"
1494,1494,NOS Jeugdjournaal,Het laatste nieuws uit binnen- en buitenland e...,Tv,https://www.npostart.nl/nos-jeugdjournaal/NOSJ...,https://images.npo.nl/tile/320x180/jeugdjourna...,https://images.npo.nl/tile/320x180/jeugdjourna...,"['Drama', 'Romantiek']"
1495,1495,Fight or Flight,Sahil Amar Aïssa portretteert twee mensen die ...,Tv,https://www.npostart.nl/fight-or-flight/BV_101...,https://images.npo.nl/tile/320x180/1903091.jpg,https://images.npo.nl/tile/320x180/1903091.jpg,"['Actie', 'Thriller']"


In [3]:
# Defining random date generation function
def randomize_time(start_timestamp,end_timestamp):
    return time.strftime('%b %d %Y %I:%M', time.localtime(randrange(start_timestamp,end_timestamp)))

In [4]:
# Prototype dataframe
seed(10)

#Time range of generation
start_timestamp = time.mktime(time.strptime('Mar 21 2022  01:33', '%b %d %Y %I:%M'))
end_timestamp = time.mktime(time.strptime('Mar 21 2023  01:33', '%b %d %Y %I:%M'))

"""
According to our survey, we identified the following personas:
    1. Documentary Lover [50]
    2. Series Streamer [100]
    3. Content (TV) devourer [50]
    4. Movie Buff [100]

This code generates synthetic data for a recommendation system. The script creates the four user personas based on the survey, 
and generates a total of 1000 users. The script then generates synthetic data for each user based on their user type and preferences, 
including information about the content they interacted with, such as whether they viewed or previewed it, and what rating and sharing values 
they assigned to it. The final output is a Pandas DataFrame called full_data that contains the synthetic data for all the generated users. 
The data can be used to train a recommendation system to suggest content based on user preferences.
"""

#Total number of users to generate
num_users, user_ids, user_types = 1000, [], []

#Generate users
user_map = {
    "Documentary Lover": ("Documentary", (1, 1, 1, 3)),
    "Series Streamer": ("Series", (1, 1, 3, 1)),
    "Content devourer": ("Tv", (3, 1, 1, 1)),
    "Movie Buff": ("Movie", (1, 3, 1, 1))
}

typeList = ["Tv", "Movie", "Series", "Documentary"]

users_types = choices(list(user_map.keys()), weights=(1, 2, 1, 2), k=num_users)

#full data container
full_data = pd.DataFrame()

for user_id, user in enumerate(users_types):
    #Setting user preferences
    fav, w = user_map[user]
    
    # Creating synthetic data for the current user
    noise = randint(0, 100)
    types = choices(typeList, weights= w, k= 50 + noise)

    data_user = pd.concat([
        pd.DataFrame({
            "user_id": user_id,
            "user_type": user,
            "date": randomize_time(start_timestamp, end_timestamp), # Create random time stamp
            "view": choices([0, 1], weights=(1, 4) if i == fav else (1, 3), k=1)[0], # Set the view value for the item depending on the user's preferences
            "prev": choices([0, 1], weights=(1, 2) if i == fav else (1, 1), k=1)[0], # Set the preview value for the item depending on the user's preferences
            **content.loc[content["Type"] == i].sample().to_dict("records")[0] # Save information about the item that the user "interacted" with
        }, index=[0])
    for i in types], axis=0)

    data_user["rating"] = data_user.apply(lambda x: 
                                            choices([-1, 0, 1],
                                            weights=(0, 1, 0) if x["prev"] == 0 else # Set the rating value to 0 if the user didn't previewed the item
                                                    (1, 4, 2) if x["Type"] == fav else # Set the rating value to 0 or 1 depending on the user's preferences, if the item is part of the favourite category of the user, the probability of being liked is higher
                                                    (1, 5, 1), k=1)[0], axis = 1) # Set the rating value to 0 or 1 depending on the user's preferences, if the item is not part of the favourite category of the user, the probability of being liked is lower
    
    data_user["shared"] = data_user.apply(lambda x: 
                                            choices([0, 1],
                                            weights=(1, 0) if x["prev"] == 0 else # Set the shared value to 0 if the user didn't previewed the item
                                            (1, 2) if x["Type"] == fav else # Set the shared value to 0 or 1 depending on the user's preferences, if the item is part of the favourite category of the user, the probability of being shared is higher
                                            (2, 4) if x["rating"] == 1 else # Set the shared value to 0 or 1 depending on the rating, if the item was "liked" by the user, the probability of being shared is higher
                                            (20, 1) if x["rating"] == -1 else # Set the shared value to 0 or 1 depending on the rating, if the item was "disliked" by the user, the probability of being shared is lower
                                            (1, 3), k=1)[0], axis = 1)

    full_data = pd.concat([full_data, data_user], axis=0)


In [5]:
cols = ["user_id", "user_type", "date", "view", "prev", "shared", "rating", "item_id", "Name", "Description", "Type", "URL", "Small_Image", "Large_Image", "Tags"]
full_data = full_data[cols]
full_data

Unnamed: 0,user_id,user_type,date,view,prev,shared,rating,item_id,Name,Description,Type,URL,Small_Image,Large_Image,Tags
0,0,Content devourer,Apr 08 2022 08:02,1,1,0,1,998,"Vrijheid, gelijkheid en broederschap","Komt ons denken over vrijheid, gelijkheid en b...",Tv,https://www.npostart.nl/vrijheid-gelijkheid-en...,https://images.npo.nl/tile/320x180/1055787.jpg,https://images.npo.nl/tile/320x180/1055787.jpg,"['Drama', 'Romantiek']"
0,0,Content devourer,May 14 2022 10:20,1,0,0,0,643,Kasper en de kerstengelen,Kasper wil astronaut worden. Wat hem nu dwars ...,Series,https://www.npostart.nl/kasper-en-de-kerstenge...,https://images.npo.nl/tile/320x180/841054.jpg,https://images.npo.nl/tile/320x180/841054.jpg,"['Animatie', 'Familie']"
0,0,Content devourer,Apr 18 2022 12:23,1,1,1,-1,1341,Musical Awards Gala,Liveshow waarin de belangrijkste prijzen op mu...,Tv,https://www.npostart.nl/musical-awards-gala/PO...,https://images.npo.nl/tile/320x180/mag_bckgrnd...,https://images.npo.nl/tile/320x180/mag_bckgrnd...,"['Familie', 'Muziek']"
0,0,Content devourer,Sep 29 2022 12:20,0,0,0,0,161,Zappbios: Mijn eigen circus,Laura trekt al haar hele leven rond met haar v...,Movie,https://www.npostart.nl/zappbios-mijn-eigen-ci...,https://images.npo.nl/tile/320x180/1569490.jpg,https://images.npo.nl/tile/320x180/1569490.jpg,['Komedie']
0,0,Content devourer,Jan 16 2023 07:17,1,0,0,0,1378,Fake It Till You Make it,In 'Fake It Till You Make It' van Het Klokhuis...,Tv,https://www.npostart.nl/fake-it-till-you-make-...,https://images.npo.nl/tile/320x180/Fake_it_til...,https://images.npo.nl/tile/320x180/Fake_it_til...,"['Horror', 'Sciencefiction']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,999,Series Streamer,Dec 31 2022 10:48,1,0,0,0,137,Tully,Met drie kinderen wordt het Marlo allemaal wat...,Movie,https://www.npostart.nl/tully/03-07-2020/POW_0...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Drama', 'Komedie']"
0,999,Series Streamer,Aug 08 2022 05:18,0,0,0,0,217,Radeloos,Als er plotseling iets verschrikkelijks gebeur...,Movie,https://www.npostart.nl/radeloos/19-03-2022/KN...,https://images.npo.nl/tile/320x180/1680822.jpg,https://images.npo.nl/tile/320x180/1680822.jpg,['Drama']
0,999,Series Streamer,Jul 03 2022 09:01,1,1,0,-1,496,The Princess,Confronterende kroniek over het publieke leven...,Documentary,https://www.npostart.nl/the-princess/17-01-202...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Actie', 'Fantasy']"
0,999,Series Streamer,Jun 18 2022 04:36,0,1,0,0,129,Boy meets gun,Telefilm over de heftige relatie tussen een do...,Movie,https://www.npostart.nl/boy-meets-gun/05-02-20...,https://weserv.moviemeter.nl/?url=https://www....,https://weserv.moviemeter.nl/?url=https://www....,"['Komedie', 'Thriller']"


In [6]:
full_data.groupby('user_type')['user_id'].apply(lambda x: list(x.unique())).reset_index()

Unnamed: 0,user_type,user_id
0,Content devourer,"[0, 2, 6, 8, 15, 21, 26, 35, 38, 46, 47, 50, 5..."
1,Documentary Lover,"[7, 13, 22, 27, 28, 29, 37, 40, 43, 49, 53, 55..."
2,Movie Buff,"[4, 5, 11, 12, 14, 18, 20, 23, 24, 25, 30, 33,..."
3,Series Streamer,"[1, 3, 9, 10, 16, 17, 19, 31, 32, 34, 36, 41, ..."


In [10]:
# Check statistics for each item
full_data.groupby("item_id", as_index=False)[["Name", "view", "prev", "shared", "rating"]].mean()

Unnamed: 0,item_id,view,prev,shared,rating
0,0,0.804878,0.597561,0.390244,0.121951
1,1,0.714286,0.571429,0.329670,0.010989
2,2,0.762376,0.584158,0.376238,0.059406
3,3,0.824176,0.659341,0.395604,0.153846
4,4,0.839080,0.517241,0.356322,0.057471
...,...,...,...,...,...
1492,1492,0.735294,0.470588,0.411765,0.058824
1493,1493,0.590909,0.500000,0.227273,-0.090909
1494,1494,0.812500,0.625000,0.312500,0.156250
1495,1495,0.789474,0.552632,0.421053,0.000000


In [8]:
# Save synthetic data to csv
full_data.to_csv(current_path + "/full_data.csv", index = False)