In [None]:
import logging
import os
from datetime import datetime

current_file_name = "3_UXtweak_Mouse_Data_Processing"

# One log file per run, named after the notebook and the start time of the run.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file right away and fails when its directory is
# missing, so make sure the directory exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    filename=log_file,
    filemode="w",
    format="%(asctime)s %(levelname)s %(message)s",
)

# Logging reference: https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import os
import json

In [None]:
from helpers.constants import *
from helpers.utils import *
from helpers.pages import *

In [None]:
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [None]:
# Input folders of the two variants ("FG" and "H"). os.path.join uses the
# separator of the current platform, so the paths are not tied to Windows.
path_fg = os.path.join("data", "2_UXtweak_Mouse_Data_Downloading", "FG")
path_h = os.path.join("data", "2_UXtweak_Mouse_Data_Downloading", "H")

In [None]:
def path_finder(path, mode, variant):
    """Collect the files under ``path`` whose name contains ``mode``.

    Args:
        path: Root directory; it is walked recursively.
        mode: Either "_baked_" or "_raw_". It is matched as a substring of
            each file name and selects the name of the path column.
        variant: Not used by this function; kept so existing calls keep working.

    Returns:
        DataFrame with a "folder" column (name of the directory that directly
        contains the file) and a "baked_file_path" or "raw_file_path" column.

    Raises:
        ValueError: If ``mode`` is neither "_baked_" nor "_raw_".
    """
    path_columns = {"_baked_": "baked_file_path", "_raw_": "raw_file_path"}

    # Validate before walking the tree so an invalid mode fails immediately.
    if mode not in path_columns:
        raise ValueError("Invalid mode")

    file_paths = []
    for root, _dirs, files in os.walk(path):
        for file in files:
            if mode in file:
                file_path = os.path.join(root, file)
                folder_name = os.path.basename(os.path.dirname(file_path))
                file_paths.append([folder_name, file_path])

    return pd.DataFrame(file_paths, columns=["folder", path_columns[mode]])

In [None]:
# Locate the baked and raw files of both variants (same call order as before:
# baked FG, raw FG, baked H, raw H).
baked_fg_paths, raw_fg_paths = (
    path_finder(path_fg, file_mode, "FG") for file_mode in ("_baked_", "_raw_")
)
baked_h_paths, raw_h_paths = (
    path_finder(path_h, file_mode, "H") for file_mode in ("_baked_", "_raw_")
)

In [None]:
# Pair each baked file with the raw file from the same folder and tag the variant.
df_fg = baked_fg_paths.merge(raw_fg_paths, on="folder").assign(type="FG")
df_h = baked_h_paths.merge(raw_h_paths, on="folder").assign(type="H")

# Stack both variants; ignore_index gives a fresh 0..n-1 index.
df = pd.concat([df_fg, df_h], ignore_index=True)

In [None]:
# Preview the first rows of the combined table of baked/raw file paths.
df.head()

In [None]:
def merge_two_dicts(x, y):
    """Return a new dict with the entries of ``x`` overlaid by those of ``y``.

    Neither input is modified; on duplicate keys the value from ``y`` wins.
    """
    return {**x, **y}

In [None]:
def get_events_from_baked_file(path):
    """Flatten the baked events of one recording file into a DataFrame.

    The JSON file is expected to hold a "pageviews" list; every pageview has
    a "baked" list of events. Events of type "move", "scroll" and "click" are
    exported, other types are ignored. Events of *all* pageviews are
    accumulated, in file order.

    - "move": one row per entry of ``args.position``; "baked_id" is the
      position's index within its move event.
    - "scroll": one row carrying ``args.maxScroll`` as "maxscroll".
    - "click": one row; "text" and "target" default to -1 when absent.

    Every row also carries the pageview_* metadata of its pageview.

    Args:
        path: Path to the baked JSON file.

    Returns:
        DataFrame with a fixed set of columns (empty when there are no events).
    """
    logging.info(f"Reading baked events from {path}")

    with open(path) as loadfile:
        data = json.load(loadfile)

    # Defined once, outside the pageview loop, so a file without pageviews
    # still yields an (empty) frame with the expected columns.
    # "pageview_inactivity" is read below but intentionally not exported.
    columns = ["type", "baked_id", "clientX", "clientY", "x",
               "y", "duration", "at", "maxscroll", "text", "target",
               "pageview_screenWidth", "pageview_screenHeight", "pageview_width",
               "pageview_height", "pageview_duration",
               "pageview_startedAt", "pageview_clientStartedAt"]

    events = []

    for pageview in data['pageviews']:
        pageview_info = {
            "pageview_screenWidth": pageview["screenWidth"],
            "pageview_screenHeight": pageview["screenHeight"],
            "pageview_width": pageview["width"],
            "pageview_height": pageview["height"],
            "pageview_duration": pageview["duration"],
            "pageview_inactivity": pageview["inactivity"],
            "pageview_startedAt": pageview["startedAt"],
            "pageview_clientStartedAt": pageview["clientStartedAt"],
        }

        for baked in pageview['baked']:
            baked_type = baked['type']

            if baked_type == 'move':
                for position_count, position in enumerate(baked['args']['position']):
                    events.append({
                        "type": "move",
                        "baked_id": position_count,
                        "clientX": position["clientX"],
                        "clientY": position["clientY"],
                        "x": position["x"],
                        "y": position["y"],
                        "duration": baked['args']['duration'],
                        "at": baked['at'],
                        **pageview_info,
                    })

            elif baked_type == 'scroll':
                events.append({
                    "type": "scroll",
                    "at": baked['at'],
                    "maxscroll": baked['args']['maxScroll'],
                    **pageview_info,
                })

            elif baked_type == 'click':
                args = baked['args']
                position = args['position']
                events.append({
                    "type": "click",
                    # -1 marks a click without text / target information.
                    "text": args.get('text', -1),
                    "clientX": position["clientX"],
                    "clientY": position["clientY"],
                    "x": position["x"],
                    "y": position["y"],
                    "target": args.get('target', -1),
                    "at": baked['at'],
                    **pageview_info,
                })

    return pd.DataFrame(events, columns=columns)

In [None]:
def add_assumed_page_index(df_out):
    """Derive page and question counters from the click events.

    Assumptions stated by the original author:
    - Each page starts with movements and ends with a click on "Next".
    - Some pages have one answer click and then a click on "Next".
    - Some pages have only the "Next" click and no answer click.
    - Some pages have several answer clicks and then a click on "Next".
    - There are 5 blocks with 12 questions each.

    Added columns:
    - "assumed_page_index": starts at 0 and increases after every "Next"
      click (the "Next" click itself still belongs to the page it closes).
    - "question_assumed_page_index": 0 until the first answer click on a
      page; from that click up to and including the following "Next" click
      it holds the running question number (starting at 1).
    - "event_index": position of the row in the frame.

    Args:
        df_out: Events with at least the columns "type" and "text".

    Returns:
        A copy of ``df_out`` with the three columns added; the input is
        left unchanged.
    """
    logging.info("Adding assumed page index")

    df = df_out.copy()

    answer_texts = ("Disagree strongly", "Disagree", "Neutral", "Agree", "Agree strongly")

    page_indices = []
    question_indices = []
    page_index = 0
    question_index = 1
    question_answered = False

    # Walk the rows by position so the result does not depend on the index labels.
    for event_type, text in zip(df["type"].tolist(), df["text"].tolist()):
        is_click = event_type == "click"
        next_clicked = is_click and text == "Next"
        if is_click and text in answer_texts:
            question_answered = True

        page_indices.append(page_index)
        question_indices.append(question_index if question_answered else 0)

        # The counters advance only after the "Next" row itself was labelled.
        if next_clicked:
            page_index += 1
            if question_answered:
                question_index += 1
                question_answered = False

    # Explicit int64 keeps the integer dtype even for an empty frame.
    df["assumed_page_index"] = pd.Series(page_indices, dtype="int64").to_numpy()
    df["question_assumed_page_index"] = pd.Series(question_indices, dtype="int64").to_numpy()
    df["event_index"] = pd.Series(range(len(df)), dtype="int64").to_numpy()

    return df


In [None]:
def get_accurate_timestamp_from_raw_data(df_baked, path):
    """Attach the raw-recording timestamp to every baked event.

    The raw file is a JSON list of events with "type", "args" and "at".
    Matching rules, as implemented here:
    - "move" rows match the next raw event of type 10 whose ``args[0]`` and
      ``args[1]`` equal the row's clientX / clientY.
    - "click" rows match the next raw event of type 14 whose ``args[1]`` and
      ``args[2]`` equal the row's clientX / clientY.
    - "scroll" rows reuse their own "at" value.
    Moves and clicks each keep their own search position, so a raw event is
    used at most once per kind. Rows without a match keep timestamp 0; their
    number is logged as a warning.

    Args:
        df_baked: Events with the columns "type", "clientX", "clientY" (and
            "at" when scroll rows are present). Modified in place.
        path: Path to the raw JSON file.

    Returns:
        The same ``df_baked`` object with an "accurate_timestamp" column.
    """
    logging.info(f"Getting accurate timestamp from raw data for {path}")

    with open(path) as loadfile:
        data = json.load(loadfile)

    position_in_data_move = 0
    position_in_data_click = 0
    timestamps = []
    unmatched = 0

    rows = zip(
        df_baked["type"].tolist(),
        df_baked["clientX"].tolist(),
        df_baked["clientY"].tolist(),
    )
    # Iterate by position so the result does not depend on the index labels.
    for i, (baked_type, clientX, clientY) in enumerate(rows):
        timestamp = 0

        if baked_type == "move":
            for j in range(position_in_data_move, len(data)):
                current = data[j]
                if current["type"] == 10 and current["args"][0] == clientX and current["args"][1] == clientY:
                    timestamp = current["at"]
                    position_in_data_move = j + 1
                    break
            else:
                unmatched += 1
        elif baked_type == "click":
            for j in range(position_in_data_click, len(data)):
                current = data[j]
                if current["type"] == 14 and current["args"][1] == clientX and current["args"][2] == clientY:
                    timestamp = current["at"]
                    position_in_data_click = j + 1
                    break
            else:
                unmatched += 1
        elif baked_type == "scroll":
            timestamp = df_baked["at"].iloc[i]

        timestamps.append(timestamp)

    if unmatched:
        logging.warning(f"{unmatched} move/click events had no matching raw event in {path}; their timestamp stays 0")

    # Start from the integer default so an empty frame keeps an integer column,
    # then write all collected values in one assignment.
    df_baked["accurate_timestamp"] = 0
    if timestamps:
        df_baked["accurate_timestamp"] = timestamps

    return df_baked
        
    

In [None]:
def page_load_timestamps(pages_df, path):
    """Look up, for every page, when it was loaded in the raw recording.

    The raw file is a JSON list of events. A page matches the next raw event
    of type 6 whose ``args[2]`` contains the page's "page_data" value. The
    search position only moves forward, so pages are matched in the order in
    which they appear in ``pages_df``. The first page is never searched for
    and keeps timestamp 0, as do pages without a match.

    Args:
        pages_df: Pages with a "page_data" column. Modified in place; an
            existing "pageload_timestamp" column is overwritten.
        path: Path to the raw JSON file.

    Returns:
        The same ``pages_df`` object with a "pageload_timestamp" column.
    """
    logging.info(f"Loading page load timestamps for {path}")

    with open(path) as loadfile:
        data = json.load(loadfile)

    page_values = pages_df["page_data"].tolist()
    timestamps = [0] * len(page_values)
    position_in_data = 0

    # Positional iteration; index 0 is skipped on purpose (see docstring).
    for i in range(1, len(page_values)):
        value = page_values[i]

        for j in range(position_in_data, len(data)):
            current = data[j]
            if current["type"] == 6 and value in current["args"][2]:
                timestamps[i] = current["at"]
                position_in_data = j + 1
                break

    # Integer default first so an empty frame keeps an integer column.
    pages_df["pageload_timestamp"] = 0
    if timestamps:
        pages_df["pageload_timestamp"] = timestamps

    return pages_df

In [None]:
def order_dataset_based_on_timestamps(df):
    """Return the events sorted by "accurate_timestamp" (ascending).

    A stable sort is used so that events sharing a timestamp keep their
    original relative order. The index labels are kept, not renumbered.

    Args:
        df: Events with an "accurate_timestamp" column.

    Returns:
        A new, sorted DataFrame; the input is left unchanged.
    """
    logging.info("Ordering dataset based on timestamps")

    # The default quicksort gives no ordering guarantee for equal keys.
    return df.sort_values(by=["accurate_timestamp"], kind="mergesort")

In [None]:
def add_page_name_timestamps(df, pages_df):
    """Assign every event to a page via the page load timestamps.

    For each event the pages are scanned in order, starting at the second
    page: the event belongs to the page *before* the first page whose load
    timestamp is greater than the event's timestamp. Events that are not
    earlier than any page load are assigned to the last page.

    Args:
        df: Events with an "accurate_timestamp" column.
        pages_df: Pages with "page_name" and "pageload_timestamp" columns.

    Returns:
        A copy of ``df`` with the columns "page_name" and "page_timestamp";
        neither input is modified.

    Raises:
        ValueError: If ``df`` has rows but ``pages_df`` is empty.
    """
    logging.info("Adding page name and page timestamp to the dataset")

    df = df.copy()

    # Plain lists avoid a pandas lookup for every event/page combination.
    page_names = pages_df["page_name"].tolist()
    page_loads = pages_df["pageload_timestamp"].tolist()

    if len(df) and not page_names:
        raise ValueError("pages_df must contain at least one page")

    names = []
    loads = []
    for event_timestamp in df["accurate_timestamp"].tolist():
        for j in range(1, len(page_loads)):
            if event_timestamp < page_loads[j]:
                page = j - 1
                break
        else:
            # The event is after the last page load: assign the last page.
            page = len(page_names) - 1
        names.append(page_names[page])
        loads.append(page_loads[page])

    # Defaults first so an empty frame still gets both columns.
    df["page_name"] = ""
    df["page_timestamp"] = 0
    if names:
        df["page_name"] = names
        df["page_timestamp"] = loads

    return df

In [None]:
@timer
def get_events_with_pages_and_timestamps(df, pages_df):
    """Process every recording listed in ``df`` and write one CSV per folder.

    For each row the baked events are read, page/question counters and raw
    timestamps are added, the events are sorted by timestamp and assigned to
    pages, and the result is saved as "<folder>_processed_events.csv". The
    output folder is the input folder with "2_UXtweak_Mouse_Data_Downloading"
    replaced by ``current_file_name``.

    A recording is skipped when its output CSV already exists. Checking the
    file rather than the folder means a recording whose previous run stopped
    before the CSV was written is processed again.

    Args:
        df: One row per recording with "baked_file_path" and "raw_file_path".
        pages_df: Pages with "page_name" and "page_data"; passed through
            ``page_load_timestamps`` for every recording.

    Returns:
        None. Results are written to disk.
    """
    total = len(df)
    recordings = zip(df["baked_file_path"].tolist(), df["raw_file_path"].tolist())

    for i, (baked_data_path, raw_data_path) in enumerate(recordings):
        logging.info(f"Processing {i}/{total}")

        folder_path = os.path.dirname(baked_data_path)
        logging.info(f"Processing {folder_path}")

        folder_name = os.path.basename(folder_path)

        output_folder_path = folder_path.replace("2_UXtweak_Mouse_Data_Downloading", current_file_name)
        output_path = os.path.join(output_folder_path, f"{folder_name}_processed_events.csv")

        # Skip only finished recordings; an output folder without the CSV is
        # the leftover of an interrupted run and must be processed again.
        if os.path.exists(output_path):
            logging.info(f"Output file exists. Skipping {folder_path}")
            continue

        os.makedirs(output_folder_path, exist_ok=True)
        logging.info(f"Output folder: {output_folder_path}")

        events = get_events_from_baked_file(baked_data_path)
        events = add_assumed_page_index(events)
        events = get_accurate_timestamp_from_raw_data(events, raw_data_path)
        events = order_dataset_based_on_timestamps(events)
        pages_df = page_load_timestamps(pages_df, raw_data_path)
        events = add_page_name_timestamps(events, pages_df)

        events.to_csv(output_path, index=False)
        logging.info(f"Saved to {output_path}")

In [None]:
# Turn the pages dictionary into a two-column table:
# the dictionary key becomes "page_name", its value becomes "page_data".
pages_df = (
    pd.DataFrame.from_dict(pages, orient="index")
    .reset_index()
    .set_axis(["page_name", "page_data"], axis=1)
)

In [None]:
# Preview the first rows of the page table (page_name / page_data).
pages_df.head()

In [None]:
# Run the processing for every recording listed in df; results are written as CSV files.
get_events_with_pages_and_timestamps(df, pages_df)

In [None]:
# 97 pages in total: 10 elaboration pages and 1 final page have no Next button, which leaves 86 pages with a Next button.

In [None]:
# For every recording, count how often each button text was clicked and
# append those counts to the recording's row.

counted_texts = ["Next", "Disagree strongly", "Disagree", "Neutral", "Agree", "Agree strongly", "Complete"]

enriches_rows = []

for _, recording in df.iterrows():
    recording_events = get_events_from_baked_file(recording["baked_file_path"])
    clicks = recording_events[recording_events["type"] == "click"]

    # reindex keeps only the listed texts and inserts 0 for texts never clicked.
    text_counts = clicks["text"].value_counts().reindex(counted_texts, fill_value=0)

    enriches_rows.append({**recording.to_dict(), **text_counts.to_dict()})

df_enriched = pd.DataFrame(enriches_rows)
df_enriched

In [None]:
# Smallest number of "Next" clicks found in any single recording.
min(df_enriched["Next"])