In [2]:
import pandas as pd
import numpy as np
import os
import re
import json
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Pre-process Participants data
- this was already done for you, just FYI

In [14]:
df_participation = pd.read_csv("/home/jonas/EasyStudy/server/data/participation.csv", index_col=0)

# filter-out test users # df_participation = df_participation.loc[31:]
df_participation.participant_email = df_participation.participant_email.fillna("")
df_participation = df_participation[~df_participation.participant_email.str.contains("testuser")]
df_participation.shape

#filter-out unfinished participations

df_participation = df_participation[df_participation.age_group.notna()]
df_completed_participation = df_participation[df_participation.time_finished.notna()]
df_uncompleted_participation = df_participation[df_participation.time_finished.isna()]

#remove sensitive data and save for further usage

df_completed_participation = df_completed_participation.drop(["participant_email","extra_data"], axis=1)
df_completed_participation.to_csv("data/participation-export_filtered.csv")

# Pre-process Interactions
- this was already done for you, just FYI

In [24]:
df_interaction = pd.read_json("/home/jonas/EasyStudy/server/data/interaction.json", encoding='utf-8')

def get_participants_interaction(df_i, df_p):
    return df_i[df_i.participation.isin(df_p.index)]


#Only get interactions of non-dummy-data participants who completed the study
df_interaction = get_participants_interaction(df_interaction, df_completed_participation)
df_interaction.to_json("data/interaction-export_filtered.json")

#Toy dataset for faster downloads (using only 20 participants)
df_interactionTop20 = get_participants_interaction(df_interaction, df_completed_participation.iloc[0:20])
df_interactionTop20.to_json("data/interaction-export_filteredSmall.json")


# Load data

In [31]:
df_interaction = pd.read_json("data/interaction.json", encoding='utf-8')
df_completed_participation = pd.read_csv("data/participation-export_filtered.csv", index_col=0)

In [32]:
df_interaction.head()

Unnamed: 0,data,id,interaction_type,participation,time
0,"{""page"": ""preference_elicitation"", ""context"": ...",1,loaded-page,1,2024-09-26 15:09:05.237971
1,"{""selected_item"": {""movieName"": ""Harry Potter ...",2,selected-item,1,2024-09-26 15:09:09.296887
2,"{""viewport"": {""left"": 0, ""top"": 0, ""width"": 19...",3,changed-viewport,1,2024-09-26 15:09:10.233974
3,"{""selected_item"": {""movieName"": ""Harry Potter ...",4,selected-item,1,2024-09-26 15:09:11.198213
4,"{""selected_item"": {""movieName"": ""Django Unchai...",5,selected-item,1,2024-09-26 15:09:11.852801


In [33]:
df_interaction.interaction_type.unique()

array(['loaded-page', 'selected-item', 'changed-viewport', 'on-input',
       'deselected-item', 'elicitation-ended', 'iteration-started',
       'iteration-ended', 'study-ended'], dtype=object)

In [26]:
df_completed_participation.head()

Unnamed: 0_level_0,age_group,gender,education,ml_familiar,user_study_id,time_joined,time_finished,uuid,language
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,21,0,3,1,11,2024-09-26 15:08:56.374223,2024-09-26 15:13:50.953686,5YMDUaGyCBusEdyjyPk2tg,en
2,21,0,3,1,11,2024-09-26 15:14:25.298064,2024-09-26 15:29:35.875462,XypvuckxjjQHxQ5HwebVGw,en
3,21,1,3,0,11,2024-09-26 15:34:42.799116,2024-09-26 15:52:07.886884,VMZT0DrJeNrBTFlMisDl0w,en
4,29,1,3,0,11,2024-09-26 15:54:57.926254,2024-09-26 16:16:38.145021,XypvuckxjjQHxQ5HwebVGw,en
5,21,0,2,1,11,2024-09-26 16:20:21.194907,2024-09-26 16:27:32.496880,jzY69jDW-94cLARGgSseZg,en


# Enrich the interactions data frame
- retrieve necessary information from "data" payload, e.g., 
  - in what iteration we are 
  - what layout was used
  - ordering of algorithms

In [34]:
N_ITERATIONS = 8
def get_iteration(x):
    return json.loads(x)["iteration"]

In [37]:
#This cell takes up to a minute or two to complete
import json
def set_iteration(row):
    if row.interaction_type == "iteration-started" or row.interaction_type == "iteration-ended":
        row['iteration'] = json.loads(row.data)['iteration']
    else:
        row['iteration'] = None
    return row

def set_result_layout(row):
    if row.interaction_type == "iteration-started":
        row['result_layout'] = json.loads(row.data)['result_layout']
    else:
        row['result_layout'] = None
    return row

#'algorithm_assignment': {'0': {'algorithm': 'relevance_based',
#   'name': 'gamma',
#   'order': 1},
#  '1': {'algorithm': 'weighted_average', 'name': 'delta', 'order': 0}},

def set_mapping(row):
    if row.interaction_type == 'iteration-started':
        dat = json.loads(row.data)['algorithm_assignment'].values()
        for mapping in dat:
            row[mapping['name'].upper()] = mapping['order']
    else:
        row['JELSA'] = None
        row['JELSA_NOPOST'] = None
    return row



d = df_interaction.copy()
d = d.set_index("id")
d = d.apply(set_iteration, axis=1).apply(set_result_layout, axis=1).apply(set_mapping, axis=1)
d['iteration'] = d.groupby(['participation'], sort=False)['iteration'].apply(lambda x: x.ffill())
d['result_layout'] = d.groupby(['participation'], sort=False)['result_layout'].apply(lambda x: x.ffill())
d['JELSA'] = d.groupby(['participation'], sort=False)['JELSA'].apply(lambda x: x.ffill())
d['JELSA_NOPOST'] = d.groupby(['participation'], sort=False)['JELSA_NOPOST'].apply(lambda x: x.ffill())
d = d[d.iteration.notna()]

d.to_json("data/interaction-export_filteredEnriched.json")

In [38]:
d = pd.read_json("data/interaction-export_filteredEnriched.json", encoding='utf-8')

In [40]:
# how does the record with iteration-started looks like?
d.loc[112]

data                {"iteration": 4, "movies": {"JELSA": {"movies"...
interaction_type                                    iteration-started
participation                                                       1
time                                       2024-09-26 15:12:28.552925
iteration                                                           4
result_layout                                                    rows
JELSA                                                               0
JELSA_NOPOST                                                        1
Name: 112, dtype: object

In [41]:
json.loads(d.loc[112]["data"])

{'iteration': 4,
 'movies': {'JELSA': {'movies': [{'movie': 'Dear Zachary: A Letter to a Son About His Father (2008)',
     'url': '/static/datasets/ml-latest/img/65188.jpg',
     'movie_idx': '679',
     'movie_id': 65188,
     'genres': ['Documentary']},
    {'movie': 'Saw II (2005)',
     'url': '/static/datasets/ml-latest/img/39446.jpg',
     'movie_idx': '491',
     'movie_id': 39446,
     'genres': ['Horror', 'Thriller']},
    {'movie': 'Allied (2016)',
     'url': '/static/datasets/ml-latest/img/164981.jpg',
     'movie_idx': '1290',
     'movie_id': 164981,
     'genres': ['Action', 'Drama', 'Romance', 'Thriller', 'War']},
    {'movie': 'Alice Through the Looking Glass (2016)',
     'url': '/static/datasets/ml-latest/img/158813.jpg',
     'movie_idx': '1252',
     'movie_id': 158813,
     'genres': ['Adventure', 'Children', 'Fantasy']},
    {'movie': 'Man on Fire (2004)',
     'url': '/static/datasets/ml-latest/img/7445.jpg',
     'movie_idx': '403',
     'movie_id': 7445,
    

In [43]:
# how does the record with iteration-ended looks like?
d.loc[87]

data                {"iteration": 2, "selected": [[758, 1044, 518,...
interaction_type                                      iteration-ended
participation                                                       1
time                                       2024-09-26 15:11:46.120504
iteration                                                           2
result_layout                                                    rows
JELSA                                                               1
JELSA_NOPOST                                                        0
Name: 87, dtype: object

In [45]:
json.loads(d.loc[87]["data"])
# d.iloc[100:120]

{'iteration': 2,
 'selected': [[758,
   1044,
   518,
   958,
   1181,
   366,
   715,
   814,
   293,
   334,
   580,
   386,
   863],
  [499,
   895,
   1112,
   571,
   750,
   1029,
   1123,
   522,
   1028,
   468,
   1451,
   850,
   1471,
   1479,
   327,
   531]],
 'selected_variants': [[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]],
 'dont_like_anything': [False, False],
 'algorithm_comparison': ['third', 'fourth'],
 'ratings': [{'JELSA': 3, 'JELSA_NOPOST': 4}, {'JELSA': 4, 'JELSA_NOPOST': 4}]}

# Filter only the information about item selections
- alternatively, similar information (without the ordering of clicks) can be collected from iteration-ended

In [46]:
# adding information on whether the selected item was displayed on an advantaged position (variant=0), or not (variant=1)
d["variant"] = -1
print(d.shape)
d.loc[d["interaction_type"] == "selected-item", "variant"] = d[d["interaction_type"] == "selected-item"].data.map(lambda x: json.loads(x)["selected_item"]).map(lambda x: x.get("variant", -1))
d = d.loc[d.iteration <= 8]
print(d.shape)

(976, 9)
(976, 9)


In [47]:
selected_item_interactions = d[d.variant >= 0].copy()
selected_item_interactions.shape

(337, 9)

In [48]:
selected_item_interactions.head()

Unnamed: 0,data,interaction_type,participation,time,iteration,result_layout,JELSA,JELSA_NOPOST,variant
36,"{""selected_item"": {""genres"": [""Drama"", ""Myster...",selected-item,1,2024-09-26 15:10:24.800955,1,rows,0,1,0
37,"{""selected_item"": {""genres"": [""Action"", ""Adven...",selected-item,1,2024-09-26 15:10:28.982732,1,rows,0,1,0
39,"{""selected_item"": {""genres"": [""Action"", ""Sci-F...",selected-item,1,2024-09-26 15:10:32.642589,1,rows,0,1,0
41,"{""selected_item"": {""genres"": [""Adventure"", ""Fa...",selected-item,1,2024-09-26 15:10:35.657857,1,rows,0,1,1
42,"{""selected_item"": {""genres"": [""Fantasy""], ""mov...",selected-item,1,2024-09-26 15:10:36.030545,1,rows,0,1,1


In [50]:
# information available for selected-item interaction_type
json.loads(selected_item_interactions.loc[36]["data"])

{'selected_item': {'genres': ['Drama', 'Mystery', 'Thriller'],
  'movie': 'Shutter Island (2010) Drama|Mystery|Thriller',
  'movie_id': 74458,
  'movie_idx': '758',
  'url': '/static/datasets/ml-latest/img/74458.jpg',
  'variant': 0},
 'selected_items': [{'genres': ['Drama', 'Mystery', 'Thriller'],
   'movie': 'Shutter Island (2010) Drama|Mystery|Thriller',
   'movie_id': 74458,
   'movie_idx': '758',
   'url': '/static/datasets/ml-latest/img/74458.jpg',
   'variant': 0}],
 'context': {'url': 'https://31aa-87-52-108-14.ngrok-free.app/fastcompare/compare-algorithms',
  'time': '2024-09-26T15:10:24.770Z',
  'viewport': {'left': 0, 'top': 0, 'width': 1902, 'height': 1806.390625},
  'extra': {'variant': 0}}}

In [51]:
# adding information on corresponding MovieID
def getSelectedMovieId(x):
    return json.loads(x)["selected_item"]["movie_id"]

selected_item_interactions["movieID"] = np.nan
selected_item_interactions.movieID = selected_item_interactions.data.map(lambda x: getSelectedMovieId(x))

In [54]:
# adding information on which algorithm is responsible for the selection
selected_item_interactions["selected_algorithm"] = "JELSA"
selected_item_interactions.loc[selected_item_interactions.variant == selected_item_interactions.JELSA_NOPOST, "selected_algorithm"] = "JELSA_NOPOST"

In [55]:
selected_item_interactions.head()

Unnamed: 0,data,interaction_type,participation,time,iteration,result_layout,JELSA,JELSA_NOPOST,variant,movieID,selected_algorithm
36,"{""selected_item"": {""genres"": [""Drama"", ""Myster...",selected-item,1,2024-09-26 15:10:24.800955,1,rows,0,1,0,74458,JELSA
37,"{""selected_item"": {""genres"": [""Action"", ""Adven...",selected-item,1,2024-09-26 15:10:28.982732,1,rows,0,1,0,108932,JELSA
39,"{""selected_item"": {""genres"": [""Action"", ""Sci-F...",selected-item,1,2024-09-26 15:10:32.642589,1,rows,0,1,0,45499,JELSA
41,"{""selected_item"": {""genres"": [""Adventure"", ""Fa...",selected-item,1,2024-09-26 15:10:35.657857,1,rows,0,1,1,98809,JELSA_NOPOST
42,"{""selected_item"": {""genres"": [""Fantasy""], ""mov...",selected-item,1,2024-09-26 15:10:36.030545,1,rows,0,1,1,135143,JELSA_NOPOST


## Task 1: which algorithm (JELSA or JELSA_NOPOST) attracted more selections?
- apply a simple groupby and count
- what kind of test would you apply to determine whether the differences are statistically significant?

In [56]:
selected_item_interactions.groupby("selected_algorithm")[["movieID"]].count()

Unnamed: 0_level_0,movieID
selected_algorithm,Unnamed: 1_level_1
JELSA,127
JELSA_NOPOST,210


JELSA has 127 participantions, JELSA_NOPOST has 210 participantions.

I would apply a chi-square test to determine whether the differences are statistically significant.

### Task 1.1: were there some differences if the algorithm was displayed at (dis)advantaged position?

In [57]:
JELSA_adv = 2178
JELSA_disadv = 1827
JELSA_NOPOST_adv = 1644
JELSA_NOPOST_disadv = 1331

print("JELSA", JELSA_adv-JELSA_disadv)
print("JELSA_NOPOST", JELSA_NOPOST_adv-JELSA_NOPOST_disadv)

JELSA_picked_more_per_adv = (JELSA_adv - JELSA_disadv)/(JELSA_adv+JELSA_disadv)
JELSA_NOPOST_picked_more_per_adv = (JELSA_NOPOST_adv - JELSA_NOPOST_disadv)/(JELSA_NOPOST_adv+JELSA_NOPOST_disadv)

print("JELSA {:.2%}".format(JELSA_picked_more_per_adv))
print("JELSA_NOPOST {:.2%}".format(JELSA_NOPOST_picked_more_per_adv))

selected_item_interactions.groupby(["selected_algorithm","variant"])[["movieID"]].count()

JELSA 351
JELSA_NOPOST 313
JELSA 8.76%
JELSA_NOPOST 10.52%


Unnamed: 0_level_0,Unnamed: 1_level_0,movieID
selected_algorithm,variant,Unnamed: 2_level_1
JELSA,0,43
JELSA,1,84
JELSA_NOPOST,0,123
JELSA_NOPOST,1,87


It seems that JELSA was picked more often when it was displayed at the first position.
The same goes for JELSA_NOPOST. 

### Task 1.2: were there some differences w.r.t. result_layout and (dis)advantaged position?

In [58]:
selected_item_interactions.groupby(["selected_algorithm","result_layout", "variant"])[["movieID"]].count()
# Calculate the percentage of selected items per variant
selected_item_interactions.groupby(["selected_algorithm","result_layout", "variant"])[["movieID"]].count().groupby(level=[0,1]).apply(lambda x: x / x.sum())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,movieID
selected_algorithm,result_layout,variant,Unnamed: 3_level_1
JELSA,rows,0,0.338583
JELSA,rows,1,0.661417
JELSA_NOPOST,rows,0,0.585714
JELSA_NOPOST,rows,1,0.414286


## Task 2: what are average DCG scores for both GAMMA and DELTA?
- you gonna need to link selections to the positions of displayed items. This can be acquired from iteration-started interaction_type

In [59]:
# Create the pair iteration_started iteration_ended


df_interaction.groupby("interaction_type")[["id"]].count()
# Select where interaction_type is iteration-started and join by iteration_ended with same id

df_interaction.groupby("interaction_type")[["id"]].count()
# df_interaction.groupby(selected_item_interactions.selected_algorithm)
# df_interaction.groupby(["interaction_type"])[["id"]].count()
# df_interaction.groupby(["interaction_type" == "iteration-started"])[["id"]].count()


Unnamed: 0_level_0,id
interaction_type,Unnamed: 1_level_1
changed-viewport,449
deselected-item,15
elicitation-ended,7
iteration-ended,33
iteration-started,34
loaded-page,61
on-input,264
selected-item,446
study-ended,6


## Task 3: does JELSA or JELSA_NOPOST substantially differ in the diversity or novelty of provided recommendations?
- consider diversity w.r.t. movie genres
- consider mean distribution year as a proxy for novelty

In [62]:
selected_item_interactions.data.map(lambda x: json.loads(x)["selected_item"]["genres"]).groupby(selected_item_interactions.selected_algorithm).apply(lambda x: pd.Series(x.sum()).value_counts())

# It seems that JELSA (with postprocessing) is more evenly distributed over genres than JELSA_NOPOST. This makes sense, since the purpose postprocessing was diversification and to gain some novelty.

# Bot however does not seem to show Documentaries much, but this might be due to the users not selecting them. Since the amount of users tested is fairly low, this might be a coincidence.

selected_algorithm             
JELSA               Action          55
                    Drama           51
                    Adventure       45
                    Sci-Fi          39
                    Comedy          35
                    Thriller        34
                    Fantasy         21
                    Mystery         15
                    Romance         14
                    Crime           14
                    IMAX            14
                    Animation       12
                    Horror           8
                    Children         7
                    War              6
                    Documentary      2
                    Musical          1
                    Western          1
JELSA_NOPOST        Adventure      102
                    Drama           89
                    Action          83
                    Thriller        53
                    Comedy          53
                    Romance         51
                    Sci-Fi      

In [64]:
# If we calculate percentage of selected items per genre, we can compare the distribution for each algorithm

grouped_counts = selected_item_interactions.data.map(lambda x: json.loads(x)["selected_item"]["genres"]).groupby(selected_item_interactions.selected_algorithm).apply(lambda x: pd.Series(x.sum()).value_counts())

grouped_counts["JELSA"] = grouped_counts["JELSA"] / grouped_counts["JELSA"].sum() * 100

grouped_counts["JELSA_NOPOST"] = grouped_counts["JELSA_NOPOST"] / grouped_counts["JELSA_NOPOST"].sum() * 100

grouped_counts

# It seems that borh algorithms has a higher bias towards selecting movies from the most popular genres, such as Action, Drama, Adventure,


selected_algorithm             
JELSA               Action         14.705882
                    Drama          13.636364
                    Adventure      12.032086
                    Sci-Fi         10.427807
                    Comedy          9.358289
                    Thriller        9.090909
                    Fantasy         5.614973
                    Mystery         4.010695
                    Romance         3.743316
                    Crime           3.743316
                    IMAX            3.743316
                    Animation       3.208556
                    Horror          2.139037
                    Children        1.871658
                    War             1.604278
                    Documentary     0.534759
                    Musical         0.267380
                    Western         0.267380
JELSA_NOPOST        Adventure      14.739884
                    Drama          12.861272
                    Action         11.994220
                    Thr

In [65]:
# Subtract the two series to get the relative difference in percentage points
grouped_counts["JELSA"] - grouped_counts["JELSA_NOPOST"]


Action         2.711663
Adventure     -2.707799
Animation     -1.415721
Children      -3.330654
Comedy         1.699329
Crime          0.130599
Documentary    0.390251
Drama          0.775092
Fantasy       -1.321443
Film-Noir           NaN
Horror         1.705511
IMAX          -0.880962
Musical       -0.744181
Mystery        1.698556
Romance       -3.626627
Sci-Fi         3.057865
Thriller       1.431950
War            0.303700
Western             NaN
Name: data, dtype: float64

In [66]:
# This is a list of all the years that the selected movies were released
years = selected_item_interactions.data.map(lambda x: int(re.findall(r'\(([0-9]+)\)', json.loads(x)["selected_item"]["movie"])[0])).groupby(selected_item_interactions.selected_algorithm)

# Mean year of all selections
mean_year = years.mean()

print(mean_year) # 2012
# The mean year of all selected movies recommended by the JELSA algorithm is ~1997
# The mean year of all selected movies recommended by the JELSA_NOPOST algorithm is ~2006, so a bit newer movies.
# This makes sense, since JELSA_NOPOST is expected to have a larger popularity bias


selected_algorithm
JELSA           1997.000000
JELSA_NOPOST    2006.209524
Name: data, dtype: float64
