In [None]:
import logging
from datetime import datetime

from pathlib import Path

# Name of this notebook; used as the sub-folder for its log files.
current_file_name = "13_Mouse_Data_Preparation"

# One log file per run, named after the start time.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# logging.basicConfig opens the file immediately and does not create missing
# parent folders, so make sure logs/<notebook name>/ exists first.
Path(log_file).parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file,filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

from scipy.interpolate import interp1d
import plotly.express as px

In [None]:
from helpers.questions import *
from helpers.constants import *
from helpers.pages import *

In [None]:
pd.set_option('display.max_columns', 500)

## Page processing

In [None]:
def get_trajecotires_from_csv(csv_df, pages):
    """Split one event log into one frame per page.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Event log; must contain a ``page_name`` column.
    pages : dict
        Mapping whose keys are the page names to extract.

    Returns
    -------
    dict
        ``{page_name: rows of csv_df whose page_name equals that key}``.
        A key with no matching rows maps to an empty frame.
    """
    return {
        name: csv_df[csv_df["page_name"] == name]
        for name in pages.keys()
    }

In [None]:
def exctract_trajectories_from_interactions(path, pages):
    """Load the interaction CSVs of every respondent folder under ``path``.

    Every sub-folder of ``path`` is scanned for ``*.csv`` files. Each CSV is
    read and split per page with ``get_trajecotires_from_csv``; the result is
    stored under the second ``_``-separated token of the folder name.

    Parameters
    ----------
    path : str
        Folder that contains one sub-folder per respondent.
    pages : dict
        Mapping whose keys are the page names to extract.

    Returns
    -------
    dict
        ``{respondent number (str): {page_name: DataFrame}}``. If a folder
        holds several CSV files, the last one (in sorted order) wins.

    Raises
    ------
    IndexError
        If a folder that contains a CSV has no ``_`` in its name.
    """
    interactions_dict = {}

    # Sorted so the result does not depend on the arbitrary order of os.listdir.
    for folder in sorted(os.listdir(path)):
        # os.path.join instead of a hard-coded "\\" keeps this usable outside Windows.
        folder_path = os.path.join(path, folder)

        # Stray files next to the respondent folders would make os.listdir raise.
        if not os.path.isdir(folder_path):
            continue

        for file in sorted(os.listdir(folder_path)):
            if file.endswith(".csv"):
                csv_df = pd.read_csv(os.path.join(folder_path, file))

                # Get number from the folder name
                number = folder.split("_")[1]
                interactions_dict[number] = get_trajecotires_from_csv(csv_df, pages)
    return interactions_dict
                

In [None]:
# Root folders of the exported mouse recordings, one per study group.
# Built with os.path.join so the paths are not tied to the Windows separator.
mouse_data_root = os.path.join("data", "3_UXtweak_Mouse_Data_Processing")
path_fg = os.path.join(mouse_data_root, "FG")
path_h = os.path.join(mouse_data_root, "H")

extracted_fg = exctract_trajectories_from_interactions(path_fg, pages)
# NOTE: this stores a plain string next to the per-respondent dicts;
# code that iterates over the dict has to skip str values.
extracted_fg["group"] = "FG"

extracted_h = exctract_trajectories_from_interactions(path_h, pages)
extracted_h["group"] = "H"

In [None]:
normal_width = 1920
normal_height = 1080

In [None]:
def get_widt_and_height(page_df):
    """Return ``(width, height)`` read from the first row of ``page_df``.

    The values come from the ``pageview_screenWidth`` and
    ``pageview_screenHeight`` columns; only the first row is consulted.
    """
    screen_width = page_df["pageview_screenWidth"].iloc[0]
    screen_height = page_df["pageview_screenHeight"].iloc[0]
    return screen_width, screen_height

In [None]:
def check_scrolling(page_df):
    """Report whether any event has ``clientX != x`` or ``clientY != y``.

    Presumably ``x``/``y`` are page coordinates and ``clientX``/``clientY``
    viewport coordinates, so a mismatch indicates a scrolled page - TODO confirm
    against the export format.

    Returns
    -------
    bool
        True if at least one row differs on either axis, False otherwise
        (including for an empty frame). A missing (NaN) value never compares
        equal, so it counts as a difference.
    """
    # Vectorised comparison instead of a Python-level loop over iterrows().
    x_differs = (page_df["clientX"] != page_df["x"]).any()
    y_differs = (page_df["clientY"] != page_df["y"]).any()
    return bool(x_differs or y_differs)

In [None]:
def restart_time(page_df):
    """Shift ``accurate_timestamp`` so that the earliest event is at time 0.

    The column is overwritten on the frame that is passed in, and that same
    frame object is returned.
    """
    timestamps = page_df["accurate_timestamp"]
    # The smallest timestamp on the page becomes the new origin.
    page_df.loc[:, "accurate_timestamp"] = timestamps - timestamps.min()
    return page_df

In [None]:
def normalize_trajectory(page_df, width, height, normal_width, normal_height):
    """Rescale ``clientX``/``clientY`` to a common reference resolution.

    ``clientX`` is scaled by ``normal_width / width``. ``clientY`` is mirrored
    first (``height - clientY``) and then scaled by ``normal_height / height``,
    so after the call Y grows in the opposite direction to the input.

    Parameters
    ----------
    page_df : pandas.DataFrame
        Frame with ``clientX`` and ``clientY`` columns. It is modified in place.
    width, height : number
        Resolution the coordinates were recorded at.
    normal_width, normal_height : number
        Target resolution.

    Returns
    -------
    pandas.DataFrame
        The same ``page_df`` object, with both columns replaced by floats.
    """
    # Replace the columns instead of writing through .loc[:, col]: the scaled
    # values are floats, and setting floats into an integer column in place is
    # an incompatible-dtype assignment that newer pandas versions warn about
    # or reject.
    page_df["clientX"] = page_df["clientX"] / width * normal_width
    page_df["clientY"] = (height - page_df["clientY"]) / height * normal_height
    return page_df

In [None]:
def get_trajectory_from_page(page_df):
    """Return only the columns needed downstream, in a fixed order.

    Kept columns: ``clientX``, ``clientY``, ``accurate_timestamp``, ``type``
    and ``text``.
    """
    trajectory_columns = ["clientX", "clientY", "accurate_timestamp", "type", "text"]
    return page_df[trajectory_columns]

In [None]:
def process_page(page, normal_width, normal_height):
    """Turn the raw events of one page into a normalized trajectory frame.

    Works on a deep copy of ``page``. The steps are: read the recording
    resolution from the first row, print ``"Scrolling"`` if
    ``check_scrolling`` reports a mismatch, re-base the timestamps, rescale
    the coordinates to ``normal_width`` x ``normal_height`` and keep only the
    trajectory columns.
    """
    working = page.copy(deep=True)

    # Must be read before the frame is reduced to the trajectory columns.
    screen_size = get_widt_and_height(working)

    scrolled = check_scrolling(working)
    if scrolled:
        print("Scrolling")

    rebased = restart_time(working)
    rescaled = normalize_trajectory(rebased, *screen_size, normal_width, normal_height)
    return get_trajectory_from_page(rescaled)

In [None]:
page = extracted_fg["26"]["page_5"]
page_df = process_page(page, normal_width, normal_height)
page_df

In [None]:
# Calculate average difference between timestamps in ms
page_df["accurate_timestamp"].diff().mean()

## Metrics

In [None]:
def calculate_distance(df):
    """Total path length: the sum of Euclidean distances between consecutive rows.

    Uses the ``clientX`` and ``clientY`` columns; the input frame is not
    modified. The result is rounded to two decimals.
    """
    step_x = df["clientX"].diff()
    step_y = df["clientY"].diff()
    # The first row has no predecessor; its NaN step is ignored by sum().
    step_lengths = np.sqrt(step_x**2 + step_y**2)
    return step_lengths.sum().round(2)

In [None]:
def calculate_axis_distance(df, axis):
    """Distance travelled along one column: sum of absolute consecutive differences.

    ``axis`` is the column name (e.g. ``"clientX"``). The input frame is not
    modified; the result is rounded to two decimals.
    """
    steps = df[axis].diff()
    return steps.abs().sum().round(2)

In [None]:
def calculate_flips(df, axis):
    """Count direction reversals of the movement along one column.

    The direction of each step is the sign of the difference to the previous
    row. Steps without movement (difference 0) and steps whose difference is
    undefined (the first row, or rows next to a missing coordinate) are
    ignored. A flip is counted whenever two consecutive remaining steps have
    opposite signs.

    Parameters
    ----------
    df : pandas.DataFrame
        Trajectory frame; it is not modified.
    axis : str
        Column to analyse, e.g. ``"clientX"``.

    Returns
    -------
    int
        Number of flips; 0 when there are fewer than two moving steps. The
        previous implementation subtracted a fixed 2 and therefore returned a
        negative count in that case.
    """
    steps = df[axis].diff()
    directions = np.sign(steps[steps.notna() & (steps != 0)]).to_numpy()

    # At least two moving steps are needed before a reversal can happen.
    if directions.size < 2:
        return 0

    return int(np.count_nonzero(directions[1:] != directions[:-1]))


In [None]:
def _ideal_trajectory_coordinates(df):
    page_df = df.copy(deep=True)

    first_move = page_df[page_df["type"] == "move"].iloc[0]
    first_x = first_move["clientX"]
    first_y = first_move["clientY"]

    last_answer = page_df[page_df["text"].notna() & (page_df["text"] != "Next")]
    last_x = last_answer["clientX"].iloc[0]
    last_y = last_answer["clientY"].iloc[0]
    last_time = last_answer["accurate_timestamp"].iloc[0]

    return first_x, first_y, last_x, last_y, last_time

In [None]:
def ideal_trajectory_length(df):
    """Straight-line distance between the ideal path's start and end points.

    The end points come from ``_ideal_trajectory_coordinates``; the result is
    rounded to two decimals.
    """
    start_x, start_y, end_x, end_y, _ = _ideal_trajectory_coordinates(df)
    delta_x = end_x - start_x
    delta_y = end_y - start_y
    return np.sqrt(delta_x**2 + delta_y**2).round(2)

In [None]:
def max_deviation(df):
    """Largest perpendicular distance of the trajectory from the ideal line.

    The ideal line runs through the two end points returned by
    ``_ideal_trajectory_coordinates``. Only events up to and including the
    end point's timestamp are considered. The input frame is not modified.

    Returns
    -------
    numpy.float64
        Maximum deviation, rounded to two decimals. When both end points
        coincide there is no line; the distance to that single point is used
        instead (the plain formula would divide 0 by 0 and return NaN).
    """
    first_x, first_y, last_x, last_y, last_time = _ideal_trajectory_coordinates(df)

    # Only consider time up to the last answer
    before_answer = df[df["accurate_timestamp"] <= last_time]

    offset_x = before_answer["clientX"] - first_x
    offset_y = before_answer["clientY"] - first_y
    line_x = last_x - first_x
    line_y = last_y - first_y
    line_length = np.sqrt(line_y**2 + line_x**2)

    if line_length == 0:
        # Degenerate line: fall back to the distance from the start point.
        deviation = np.sqrt(offset_x**2 + offset_y**2)
    else:
        # https://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line#Line_defined_by_two_points
        deviation = np.abs(line_x * offset_y - line_y * offset_x) / line_length

    return deviation.max().round(2)

In [None]:
def area_under_curve(df, mode, test=False):
    """Signed trapezoidal area of ``clientY`` over ``clientX``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trajectory frame; it is not modified.
    mode : {"real", "ideal"}
        ``"real"`` sums the trapezoids between consecutive rows.
        ``"ideal"`` returns the single trapezoid between the two end points
        from ``_ideal_trajectory_coordinates``.
    test : bool, default False
        When False, the trajectory is first cut at the end point's timestamp.
        When True, the frame is used as is and only needs the ``clientX`` and
        ``clientY`` columns; this is only meaningful for ``mode="real"``.

    Returns
    -------
    numpy.float64
        Area rounded to two decimals. It is signed: steps where ``clientX``
        decreases contribute negatively.

    Raises
    ------
    ValueError
        If ``mode`` is unknown, or if ``mode="ideal"`` is combined with
        ``test=True`` (the end points are not computed in that case).
    """
    if mode not in ("real", "ideal"):
        raise ValueError(f"Unknown mode {mode!r}; expected 'real' or 'ideal'")
    if mode == "ideal" and test:
        raise ValueError("mode='ideal' needs the trajectory end points and cannot be used with test=True")

    page_df = df
    if not test:
        first_x, first_y, last_x, last_y, last_time = _ideal_trajectory_coordinates(df)
        page_df = df[df["accurate_timestamp"] <= last_time]

    # https://en.wikipedia.org/wiki/Trapezoidal_rule
    if mode == "real":
        widths = page_df["clientX"] - page_df["clientX"].shift()
        heights = page_df["clientY"] + page_df["clientY"].shift()
        return (0.5 * widths * heights).sum().round(2)

    return (0.5 * (last_x - first_x) * (last_y + first_y)).round(2)
    

In [None]:
# https://www.emathhelp.net/en/calculators/calculus-2/trapezoidal-rule-calculator-for-a-table/?i=%5B%5B1%2C6%2C3%2C10%2C12%2C2%5D%2C%5B15%2C5%2C3%2C1%2C47%2C6%5D%5D

test_auc = pd.DataFrame({"clientX": [1, 2, 3, 6, 10, 12], "clientY": [15, 6, 3, 5, 1, 47]})
area_under_curve(test_auc, mode="real", test=True) # 87.0

In [None]:
def get_page_metrics(page_df):
    """Compute the per-page mouse metrics for one processed trajectory.

    Parameters
    ----------
    page_df : pandas.DataFrame
        Output of ``process_page`` (columns ``clientX``, ``clientY``,
        ``accurate_timestamp``, ``type``, ``text``).

    Returns
    -------
    dict
        Metric name -> value, in this order: ``total_duration``, ``init_time``,
        ``react_time``, ``number_of_clicks``, ``number_of_x_flips``,
        ``number_of_y_flips``, ``distance``, ``distance_x``, ``distance_y``,
        ``speed*``, ``acceleration*``, ``ideal_trajectory_length``,
        ``max_deviation``, ``area_under_real_curve``,
        ``area_under_ideal_curve``, ``area_difference``.

    Raises
    ------
    IndexError
        If the page has no ``"move"`` row or no answer row.
    """
    timestamps = page_df["accurate_timestamp"]
    moved = page_df["type"] == "move"
    clicked = page_df["type"] == "click"
    # An answer row has a text that is not NaN and is not "Next".
    answered = page_df["text"].notna() & (page_df["text"] != "Next")

    metrics = {
        # Timestamp of the last row of the page
        "total_duration": timestamps.iloc[-1],
        # Time until first movement
        "init_time": timestamps[moved].iloc[0],
        # Time until the first answer row
        "react_time": timestamps[answered].iloc[0],
        "number_of_clicks": len(page_df[clicked]),
        "number_of_x_flips": calculate_flips(page_df, "clientX"),
        "number_of_y_flips": calculate_flips(page_df, "clientY"),
        # Path length overall and per axis
        "distance": calculate_distance(page_df),
        "distance_x": calculate_axis_distance(page_df, "clientX"),
        "distance_y": calculate_axis_distance(page_df, "clientY"),
    }

    duration = metrics["total_duration"]
    axis_suffixes = ("", "_x", "_y")

    # Speed is defined here as distance divided by the total duration.
    for suffix in axis_suffixes:
        metrics[f"speed{suffix}"] = metrics[f"distance{suffix}"] / duration

    # Acceleration is defined here as that speed divided by the total duration again.
    for suffix in axis_suffixes:
        metrics[f"acceleration{suffix}"] = metrics[f"speed{suffix}"] / duration

    # Straight line between the first movement and the answer row
    metrics["ideal_trajectory_length"] = ideal_trajectory_length(page_df)
    metrics["max_deviation"] = max_deviation(page_df)

    real_area = area_under_curve(page_df, mode="real")
    ideal_area = area_under_curve(page_df, mode="ideal")
    metrics["area_under_real_curve"] = real_area
    metrics["area_under_ideal_curve"] = ideal_area
    metrics["area_difference"] = (real_area - ideal_area).round(2)

    return metrics

In [None]:
page_df

In [None]:
get_page_metrics(page_df)

# LSTM Preprocessing

In [None]:
def interpolate_trajectory(df, interval=10):
    """Resample a trajectory onto an evenly spaced time grid.

    The grid starts at the smallest ``accurate_timestamp`` and advances in
    steps of ``interval`` up to, but not including, the largest timestamp.
    Positions on the grid are obtained by linear interpolation and rounded to
    integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``accurate_timestamp``, ``clientX`` and ``clientY``; it is
        not modified.
    interval : number, default 10
        Grid spacing, in the unit of ``accurate_timestamp``.

    Returns
    -------
    pandas.DataFrame
        Columns ``interpolated_timestamp``, ``seconds`` (timestamp / 1000),
        ``x`` and ``y``. Empty when ``df`` has fewer than two distinct
        timestamps.
    """
    # np.interp needs increasing sample points. Duplicate timestamps are
    # collapsed to the last recorded position: a zero-width interval would
    # otherwise produce a 0/0 slope, i.e. NaN, which turns into a garbage
    # integer after astype(int).
    samples = (
        df[["accurate_timestamp", "clientX", "clientY"]]
        .sort_values("accurate_timestamp", kind="stable")
        .drop_duplicates(subset="accurate_timestamp", keep="last")
    )

    if samples.empty:
        new_timestamps = np.array([], dtype=float)
    else:
        new_timestamps = np.arange(samples["accurate_timestamp"].min(), samples["accurate_timestamp"].max(), interval)

    if new_timestamps.size:
        sample_times = samples["accurate_timestamp"].to_numpy()
        # All grid points lie inside [min, max), so no extrapolation is involved.
        interpolated_x = np.interp(new_timestamps, sample_times, samples["clientX"].to_numpy())
        interpolated_y = np.interp(new_timestamps, sample_times, samples["clientY"].to_numpy())
    else:
        interpolated_x = np.array([], dtype=float)
        interpolated_y = np.array([], dtype=float)

    # Create a new DataFrame with interpolated values, rounded to integers
    return pd.DataFrame({
        'interpolated_timestamp': new_timestamps,
        'seconds': new_timestamps / 1000,
        'x': np.round(interpolated_x).astype(int),
        'y': np.round(interpolated_y).astype(int)
    })

In [None]:
interpolated_trajectory = interpolate_trajectory(page_df, interval=25)
interpolated_trajectory

In [None]:
# Make animation
fig = px.scatter(interpolated_trajectory, x='x', y='y', animation_frame='seconds', range_x=[0, normal_width], range_y=[0, normal_height])
fig.show()

In [None]:
def calculate_deltas(df):
    """Add integer ``delta_x`` / ``delta_y`` columns with the step to the previous row.

    The first row has no predecessor; it receives the delta of the second row
    (back-fill), or 0 when the frame has a single row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``x`` and ``y`` columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with the two delta columns appended.
    """
    page_df = df.copy(deep=True)

    for source, target in (("x", "delta_x"), ("y", "delta_y")):
        # Back-fill only the delta column (not the whole frame), and fall back
        # to 0 so that astype(int) cannot hit a remaining NaN.
        page_df[target] = df[source].diff().bfill().fillna(0).astype(int)

    return page_df

In [None]:
calculate_deltas(interpolated_trajectory)

## Processing

In [None]:
# Select the pages that are treated as question pages.
# question_pages collects their names, question_indices their 0-based
# positions within `pages`.
question_pages = []
question_indices = []

# Current window of 1-based page positions, [start, end): initially 2..13.
start = 2
end = 14
# Number of pages collected from the current window.
counter = 0

for index, page_name in enumerate(pages.keys()):
    # index is 0-based, so index + 1 is the 1-based position of the page.
    if index + 1 in range(start, end):
        counter += 1
        question_pages.append(page_name)
        question_indices.append(index)
    # A full window of 12 pages was collected: move the window 17 positions on.
    if counter == 12:
        start += 17
        end += 17
        counter = 0
    # Stop once the window would start past position 80. With these constants
    # the windows are 2-13, 19-30, 36-47, 53-64 and 70-81, i.e. at most 60 pages.
    if start > 80:
        break

print(question_pages)
print(len(question_pages))

In [None]:
def create_dataset(dict_events, variant, normal_width, normal_height, question_pages):
    """Build the metrics table and the interpolated trajectories for one group.

    Parameters
    ----------
    dict_events : dict
        ``{respondent: {page_name: DataFrame}}``. Entries whose value is a
        string are skipped.
    variant : str
        Group label written to the ``variant`` column of both outputs.
    normal_width, normal_height : number
        Reference resolution passed to ``process_page``.
    question_pages : collection of str
        Page names to process; all other pages are skipped, as are pages
        without events.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(metrics, trajectories)``. ``metrics`` has one row per processed
        page. ``trajectories`` has the samples of every processed page,
        interpolated on a 25-unit time grid, stacked with a fresh index.
        Both are empty when nothing was processed.

    Notes
    -----
    Sets ``pd.options.mode.chained_assignment = None`` globally and does not
    restore it.
    """
    # Disable SettingWithCopyWarning
    pd.options.mode.chained_assignment = None

    output_metrics_list = []
    # Collected per page and concatenated once at the end; growing a frame
    # with pd.concat inside the loop copies all earlier rows on every pass.
    trajectory_frames = []

    for respondent, pages in dict_events.items():
        if isinstance(pages, str):
            continue
        for page_name, page_df in pages.items():
            if page_name not in question_pages:
                continue
            if page_df.empty:
                continue
            page_df = process_page(page_df, normal_width, normal_height)
            metrics = get_page_metrics(page_df)
            metrics["respondent"] = f"respondent_{respondent}"
            metrics["page_name"] = page_name
            metrics["variant"] = variant

            output_metrics_list.append(metrics)
            logging.info(f"Metrics processed for Participant {respondent} Page {page_name} Group {variant}")

            interpolated_trajectory = interpolate_trajectory(page_df, interval=25)
            interpolated_trajectory = calculate_deltas(interpolated_trajectory)
            interpolated_trajectory["respondent"] = f"respondent_{respondent}"
            interpolated_trajectory["page_name"] = page_name
            interpolated_trajectory["variant"] = variant

            trajectory_frames.append(interpolated_trajectory)
            logging.info(f"Trajectory interpolated for Participant {respondent} Page {page_name} Group {variant}")

    # pd.concat raises on an empty list, so keep the empty-frame result explicit.
    if trajectory_frames:
        output_trajectories_df = pd.concat(trajectory_frames, ignore_index=True)
    else:
        output_trajectories_df = pd.DataFrame()

    return pd.DataFrame(output_metrics_list), output_trajectories_df

In [None]:
processed_metrics_fg, processed_trajectories_fg = create_dataset(extracted_fg, "FG", normal_width, normal_height, question_pages)

In [None]:
processed_metrics_h, processed_trajectories_h = create_dataset(extracted_h, "H", normal_width, normal_height, question_pages)

In [None]:
processed_metrics_fg.head()

In [None]:
processed_trajectories_fg.head()

## Gender

In [None]:
fg_pre_study_questions_path = wd + "\\2 UXtweak CSVs\\[DP Lies] Final 1 FG\\[DP Lies] Final 1 FG - Pre-study questionnaire.csv"
h_pre_study_questions_path = wd + "\\2 UXtweak CSVs\\[DP Lies] Final 1 H\\[DP Lies] Final 1 H - Pre-study questionnaire.csv"
fg_pre_study_questions = pd.read_csv(fg_pre_study_questions_path)
h_pre_study_questions = pd.read_csv(h_pre_study_questions_path)

fg_pre_study_questions_path_pilot = wd_pilot + "\\2 UXtweak CSVs\\Pilot Demo 4 FG\\Pilot Demo 4 FG - Pre-study questionnaire.csv"
h_pre_study_questions_path_pilot = wd_pilot + "\\2 UXtweak CSVs\\Pilot Demo 4 H\\Pilot Demo 4 H - Pre-study questionnaire.csv"
fg_pre_study_questions_pilot = pd.read_csv(fg_pre_study_questions_path_pilot)
h_pre_study_questions_pilot = pd.read_csv(h_pre_study_questions_path_pilot)

fg_pre_study_questions = pd.concat([fg_pre_study_questions, fg_pre_study_questions_pilot])
h_pre_study_questions = pd.concat([h_pre_study_questions, h_pre_study_questions_pilot])

In [None]:
print(fg_pre_study_questions["Q1: What gender do you identify as?"].unique())
print(h_pre_study_questions["Q1: What gender do you identify as?"].unique())

In [None]:
def check_if_female(pre_study_questions):
    """Derive a per-respondent ``female`` flag from the pre-study questionnaire.

    Parameters
    ----------
    pre_study_questions : pandas.DataFrame
        Questionnaire export with a ``respondent`` column and the column
        ``"Q1: What gender do you identify as?"``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``respondent_num`` (the original respondent value),
        ``female`` (True only where the answer is exactly ``"Female"``; any
        other or missing answer gives False) and ``respondent``
        (``"respondent_<respondent_num>"``). The index of the input is kept.
    """
    # Start from a new frame so the caller's questionnaire does not silently
    # gain a "female" column, and no assignment happens on a slice.
    result = pre_study_questions[["respondent"]].rename(columns={"respondent": "respondent_num"})
    result["female"] = pre_study_questions["Q1: What gender do you identify as?"] == "Female"
    result["respondent"] = result["respondent_num"].apply(lambda x: "respondent_" + str(x))
    return result

In [None]:
fg_female = check_if_female(fg_pre_study_questions)
h_female = check_if_female(h_pre_study_questions)

In [None]:
def add_female_to_df(df, female_df):
    """Left-join the columns of ``female_df`` onto ``df`` by ``respondent``.

    Every row of ``df`` is kept; rows whose respondent is missing from
    ``female_df`` get NaN in the added columns.

    NOTE(review): if ``female_df`` lists a respondent more than once, the
    matching rows of ``df`` are duplicated - verify that respondents are unique.
    """
    return df.merge(female_df, how="left", on="respondent")

In [None]:
gendered_metrics_fg = add_female_to_df(processed_metrics_fg, fg_female)
gendered_metrics_h = add_female_to_df(processed_metrics_h, h_female)

## Remove unfinished questionnaires

In [None]:
# If number of pages for each respondent is not 60, remove them
fg_respondents_counts = gendered_metrics_fg["respondent"].value_counts()
fg_respondents_to_drop = fg_respondents_counts[fg_respondents_counts != 60].index.values.tolist()
fg_respondents_to_drop

In [None]:
# Remove unfinished questionnaires from fg
fg_metrics_len = len(gendered_metrics_fg)
fg_trajectories_len = len(processed_trajectories_fg)

gendered_metrics_fg = gendered_metrics_fg[~gendered_metrics_fg["respondent"].isin(fg_respondents_to_drop)]
processed_trajectories_fg = processed_trajectories_fg[~processed_trajectories_fg["respondent"].isin(fg_respondents_to_drop)]

print("gendered_metrics_fg difference", fg_metrics_len - len(gendered_metrics_fg))
print("processed_trajectories_fg difference", fg_trajectories_len - len(processed_trajectories_fg))

In [None]:
# If number of pages for each respondent is not 60, remove them
h_respondents_counts = gendered_metrics_h["respondent"].value_counts()
h_respondents_to_drop = h_respondents_counts[h_respondents_counts != 60].index.values.tolist()
h_respondents_to_drop

In [None]:
# Remove unfinished questionnaires from h
h_metrics_len = len(gendered_metrics_h)
h_trajectories_len = len(processed_trajectories_h)

gendered_metrics_h = gendered_metrics_h[~gendered_metrics_h["respondent"].isin(h_respondents_to_drop)]
processed_trajectories_h = processed_trajectories_h[~processed_trajectories_h["respondent"].isin(h_respondents_to_drop)]

print("gendered_metrics_h difference", h_metrics_len - len(gendered_metrics_h))
print("processed_trajectories_h difference", h_trajectories_len - len(processed_trajectories_h))

## Merge dataframes

In [None]:
# Merge the two dataframes with metrics
merged_metrics_dataframes = pd.concat([gendered_metrics_fg, gendered_metrics_h])
len(merged_metrics_dataframes)

In [None]:
# Merge the two dataframes with trajectories
merged_trajectories_dataframes = pd.concat([processed_trajectories_fg, processed_trajectories_h])
len(merged_trajectories_dataframes)

## Get Ground Truth

In [None]:
question_names = [f"question_{x}" for x in range(1, len(question_indices) + 1)]
answer_column = [f"rbfi{x}" if x in glob_reversed_questions else f"bfi{x}" for x in range(1, len(question_indices) + 1)]

print(len(glob_big5_questions), len(question_names), len(answer_column))
questions_dict_answers = {f"page_{question_indices[i] + 1}": (answer_column[i], glob_big5_questions[i]) for i in range(len(question_indices))}

questions_dict_answers["page_81"]

In [None]:
questions_dict_answers_gt = {}

for key, value in questions_dict_answers.items():
    questions_dict_answers_gt[key] = value[0] + "_gt"

questions_dict_answers_gt["page_81"]

In [None]:
elaborations_dict_reversed = {value: key for key, value in questions_dict_answers_gt.items()}
elaborations_dict_reversed["bfi60_gt"]

In [None]:
ground_truth_columns = [value for key, value in questions_dict_answers_gt.items()]

ground_truth_columns[:5]

In [None]:
ground_truth_columns_reversed = [value for key, value in elaborations_dict_reversed.items()]

ground_truth_columns_reversed[:5]

In [None]:
pairing_path = "data\\4_Pair_UXtweak_and_SurveyJS\\4_Pair_UXtweak_and_SurveyJS_data.csv"

In [None]:
# Load the pairing table and reshape it to one row per
# (variant, respondent, page_name) with the ground-truth value in "indicator_fg".
pairing_df = (
    pd.read_csv(pairing_path)
    .loc[:, ["group_evaluated", "order"] + ground_truth_columns]
    # Use the key names of the mouse-data frames
    .rename(columns={"group_evaluated": "variant", "order": "respondent"})
    # Every 0.5 in the table is replaced with 1
    .replace(0.5, 1)
    # Same "respondent_<n>" format as in the mouse-data frames
    .assign(respondent=lambda frame: "respondent_" + frame["respondent"].astype(str))
    # Ground-truth column names -> page names
    .rename(columns=elaborations_dict_reversed)
    # Wide -> long: one row per page
    .melt(id_vars=["variant", "respondent"], value_vars=ground_truth_columns_reversed, var_name="page_name", value_name="indicator_fg")
)


In [None]:
pairing_df.groupby("indicator_fg").count()

In [None]:
pairing_df

In [None]:
pairing_df[pairing_df["indicator_fg"] == 0].count()

In [None]:
pairing_df["control"] = "control"

In [None]:
pairing_df[pairing_df["variant"] == "FG"]["respondent"].nunique()

In [None]:
pairing_df[pairing_df["variant"] == "H"]["respondent"].nunique()

In [None]:
fg_pairing_df_respondents_counts = pairing_df[pairing_df["variant"] == "FG"]["respondent"].value_counts()
fg_pairing_df_respondents_to_drop = fg_pairing_df_respondents_counts[fg_pairing_df_respondents_counts != 60].index.values.tolist()
fg_pairing_df_respondents_to_drop

In [None]:
h_pairing_df_respondents_counts = pairing_df[pairing_df["variant"] == "H"]["respondent"].value_counts()
h_pairing_df_respondents_to_drop = h_pairing_df_respondents_counts[h_pairing_df_respondents_counts != 60].index.values.tolist()
h_pairing_df_respondents_to_drop

## Add Ground Truth to datasets

In [None]:
print(f"merged_metrics_dataframes {len(merged_metrics_dataframes)}")
print(f"merged_trajectories_dataframes {len(merged_trajectories_dataframes)}")

In [None]:
# Attach the ground truth to the processed mouse data.
# A left join keeps exactly the rows of the mouse-data frames. With an outer
# join, rows that exist only in pairing_df were added as well; they carry an
# "indicator_fg" and a "control" value, so they would pass the NaN and
# "control" filters applied afterwards and reach the saved files with empty
# metrics - including respondents removed earlier as unfinished.
# Mouse-data rows without ground truth still get NaN in "indicator_fg".
merge_keys = ["variant", "respondent", "page_name"]

merged_metrics_enriched = pd.merge(merged_metrics_dataframes, pairing_df, on=merge_keys, how="left")
merged_trajectories_enriched = pd.merge(merged_trajectories_dataframes, pairing_df, on=merge_keys, how="left")

print(f"merged_metrics_enriched {len(merged_metrics_enriched)}")
print(f"merged_trajectories_enriched {len(merged_trajectories_enriched)}")

In [None]:
merged_metrics_enriched[merged_metrics_enriched["indicator_fg"].isna()][["variant", "respondent"]].drop_duplicates()

In [None]:
merged_trajectories_enriched[merged_trajectories_enriched["indicator_fg"].isna()][["variant", "respondent"]].drop_duplicates()

In [None]:
# Drop rows with NaN values in ground_truth column
merged_metrics_enriched = merged_metrics_enriched.dropna(subset=["indicator_fg"])
merged_metrics_enriched = merged_metrics_enriched[merged_metrics_enriched["control"] == "control"].drop(columns=["control"])

print(f"merged_metrics_enriched {len(merged_metrics_enriched)}")
merged_metrics_enriched

In [None]:
# Drop rows with NaN values in ground_truth column
merged_trajectories_enriched = merged_trajectories_enriched.dropna(subset=["indicator_fg"])
merged_trajectories_enriched = merged_trajectories_enriched[merged_trajectories_enriched["control"] == "control"].drop(columns=["control"])

print(f"merged_trajectories_enriched {len(merged_trajectories_enriched)}")
merged_trajectories_enriched

In [None]:
print(merged_metrics_enriched[merged_metrics_enriched["variant"] == "FG"]["respondent"].nunique())
print(merged_metrics_enriched[merged_metrics_enriched["variant"] == "FG"]["respondent"].unique())

In [None]:
print(merged_metrics_enriched[merged_metrics_enriched["variant"] == "H"]["respondent"].nunique())
print(merged_metrics_enriched[merged_metrics_enriched["variant"] == "H"]["respondent"].unique())

In [None]:
print(merged_trajectories_enriched[merged_trajectories_enriched["variant"] == "FG"]["respondent"].nunique())
print(merged_trajectories_enriched[merged_trajectories_enriched["variant"] == "FG"]["respondent"].unique())

In [None]:
print(merged_trajectories_enriched[merged_trajectories_enriched["variant"] == "H"]["respondent"].nunique())
print(merged_trajectories_enriched[merged_trajectories_enriched["variant"] == "H"]["respondent"].unique())

In [None]:
print("Number of dropped metrics:", len(merged_metrics_dataframes) - len(merged_metrics_enriched))

In [None]:
print("Number of dropped trajectories:", len(merged_trajectories_dataframes) - len(merged_trajectories_enriched))

In [None]:
merged_metrics_enriched.groupby(["variant", "respondent", "page_name"]).count()["indicator_fg"]

In [None]:
merged_trajectories_enriched.groupby(["variant", "respondent", "page_name"]).count()["indicator_fg"]

In [None]:
check_metrics = merged_metrics_enriched.groupby(["variant", "respondent", "page_name"]).sum()    
check_metrics = check_metrics[check_metrics["indicator_fg"] == 0]
check_metrics.groupby(["variant", "respondent"]).count()["indicator_fg"].count()

In [None]:
check_trajectories = merged_trajectories_enriched.groupby(["variant", "respondent", "page_name"]).sum()
check_trajectories = check_trajectories[check_trajectories["indicator_fg"] == 0]
check_trajectories.groupby(["variant", "respondent"]).count()["indicator_fg"].count()

In [None]:
print(merged_metrics_enriched[merged_metrics_enriched["variant"] == "FG"]["indicator_fg"].sum(), merged_metrics_enriched[merged_metrics_enriched["variant"] == "H"]["indicator_fg"].sum(), len(merged_metrics_enriched))

In [None]:
print(merged_trajectories_enriched[merged_trajectories_enriched["variant"] == "FG"]["indicator_fg"].sum(), merged_trajectories_enriched[merged_trajectories_enriched["variant"] == "H"]["indicator_fg"].sum(), len(merged_trajectories_enriched))

In [None]:
# Table of counts of indicator_fg per variant
table_metrics = pd.pivot_table(merged_metrics_enriched, values='indicator_fg', index=['variant'], aggfunc=np.sum)
table_metrics

In [None]:
# Table of counts of indicator_fg per variant
table_trajectories = pd.pivot_table(merged_trajectories_enriched, values='indicator_fg', index=['variant'], aggfunc=np.sum)
table_trajectories

In [None]:
merged_metrics_enriched.head()

In [None]:
merged_trajectories_enriched.head()

In [None]:
print("merged_metrics_enriched", len(merged_metrics_enriched))
print("merged_trajectories_enriched", len(merged_trajectories_enriched))

## Save dataframes

In [None]:
path_to_save_metrics = "data\\13_Mouse_Data_Preparation\\metrics_data.csv"
path_to_save_trajectories = "data\\13_Mouse_Data_Preparation\\trajectories_data.csv"

In [None]:
# Save the processed metrics data
merged_metrics_enriched.to_csv(path_to_save_metrics, index=False)

In [None]:
# Save the processed trajectories data
merged_trajectories_enriched.to_csv(path_to_save_trajectories, index=False)