In [None]:
import logging
import os
from datetime import datetime

# Name of this notebook; used as the sub-folder that collects its log files.
current_file_name = "6_Elaborations_Extraction"

# One log file per run, named after the start time of the run.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# logging.basicConfig opens the log file immediately and does not create
# missing parent folders, so make sure the folder exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file, filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import os
import subprocess
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.utils import *

In [None]:
# Let pandas display up to 500 columns before it starts truncating wide frames.
pd.set_option('display.max_columns', 500)

In [None]:
# Page-name -> label lookups for the pages that delimit elaborations.
#
# There are 5 blocks, each with 2 elaborations. Within block `block` the first
# elaboration page is page number `17 * block - 2` (15, 32, 49, 66, 83) and the
# four pages from there on are, in order:
#   elaboration 1 start, elaboration 1 end, elaboration 2 start, elaboration 2 end.
elaboration_pages = {
    f"page_{17 * block - 2 + shift}": f"elaboration_{block}_{part}_{edge}"
    for block in range(1, 6)
    for shift, (part, edge) in enumerate([(1, "start"), (1, "end"), (2, "start"), (2, "end")])
}

# The page right before each elaboration start page: one page before the block's
# first elaboration page for elaboration 1, one page after it for elaboration 2.
questions_before_elaborations = {
    f"page_{17 * block - 2 + shift}": f"question_before_{block}_{part}_end"
    for block in range(1, 6)
    for part, shift in ((1, -1), (2, 1))
}

# For every elaboration start column, the column to fall back on when it is missing.
missing_values_map = {
    f"elaboration_{block}_{part}_start": f"question_before_{block}_{part}_end"
    for block in range(1, 6)
    for part in (1, 2)
}

In [None]:
def get_elaborations_from_csv(csv_df, elaboration_pages, questions_before_elaborations):
    """Return one timestamp per elaboration / question label found in ``csv_df``.

    Parameters:
        csv_df: frame with ``page_name``, ``page_timestamp`` and
            ``accurate_timestamp`` columns.
        elaboration_pages: maps page names to elaboration labels; for these
            pages the first ``page_timestamp`` of the page is taken.
        questions_before_elaborations: maps page names to question labels; for
            these pages the last ``accurate_timestamp`` of the page is taken.

    Returns:
        dict mapping each label present in the data to its timestamp.
    """
    page_names = csv_df['page_name']

    # First recorded page_timestamp of every elaboration page
    starts = (
        csv_df.loc[page_names.isin(list(elaboration_pages.keys()))]
        .groupby('page_name')
        .first()["page_timestamp"]
    )
    # Last recorded accurate_timestamp of every question page
    ends = (
        csv_df.loc[page_names.isin(list(questions_before_elaborations.keys()))]
        .groupby('page_name')
        .last()["accurate_timestamp"]
    )

    # Swap the page names for their labels
    starts.index = starts.index.map(elaboration_pages)
    ends.index = ends.index.map(questions_before_elaborations)

    # label -> {"page_timestamp": ..., "accurate_timestamp": ...}
    stamps_by_label = pd.concat([starts, ends], axis=1).T.to_dict()

    # Keep page_timestamp when it is present, otherwise accurate_timestamp
    return {
        label: (
            stamps['accurate_timestamp']
            if pd.isna(stamps['page_timestamp'])
            else stamps['page_timestamp']
        )
        for label, stamps in stamps_by_label.items()
    }

In [None]:
def fix_missing_values(df, missing_values_map):
    """Fill gaps in selected columns from their fallback columns.

    ``missing_values_map`` maps a column name to the name of the column whose
    values replace that column's NaN entries. ``df`` is modified in place and
    also returned.
    """
    for target, fallback in missing_values_map.items():
        df[target] = df[target].fillna(df[fallback])

    return df

In [None]:
def exctract_big_5_answers_from_interactions(path, elaboration_pages, questions_before_elaborations, missing_values_map):
    """Build a table of elaboration start/end times from the CSV files under ``path``.

    Every sub-folder of ``path`` is scanned for ``.csv`` files; each file yields
    one row. The respondent number (``order``) is the second ``_``-separated
    part of the sub-folder name.

    Parameters:
        path: directory that contains one sub-folder per respondent.
        elaboration_pages: page name -> elaboration label.
        questions_before_elaborations: page name -> question label.
        missing_values_map: elaboration column -> question column used to fill
            the elaboration column where it is missing.

    Returns:
        DataFrame with ``order`` (int), one column per elaboration label
        (timestamp divided by 1000) and ``<label>_minutes`` / ``<label>_seconds``
        columns. Rows that still contain a missing value are dropped.
    """
    elaboration_columns = list(elaboration_pages.values())
    question_columns = list(questions_before_elaborations.values())
    columns = ["order", *elaboration_columns, *question_columns]

    # Collect plain dicts and build the frame once, instead of concatenating
    # a one-row frame per file.
    records = []
    # Sorted so the row order does not depend on the file system's listing order.
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            # Stray files next to the respondent folders are not sessions.
            continue

        for file in sorted(os.listdir(folder_path)):
            if not file.endswith(".csv"):
                continue

            csv_df = pd.read_csv(os.path.join(folder_path, file))
            elaborations = get_elaborations_from_csv(csv_df, elaboration_pages, questions_before_elaborations)
            # Get number from the folder name
            elaborations["order"] = folder.split("_")[1]
            records.append(elaborations)

    # `columns=` guarantees every expected column exists even if no file had it.
    df = pd.DataFrame(records, columns=columns)

    df = fix_missing_values(df, missing_values_map)
    df = df.drop(columns=question_columns)

    # Convert milliseconds to seconds (everything except "order" is a timestamp)
    df[elaboration_columns] = df[elaboration_columns] / 1000

    # Add columns with minutes and seconds
    for column in elaboration_columns:
        df[column + "_minutes"] = df[column] // 60
        df[column + "_seconds"] = df[column] % 60

    # Remove NaN rows
    df = df.dropna()

    # Make order integer
    df["order"] = df["order"].astype(int)

    return df
                

In [None]:
# Getting manually annotated data - offset
# Offset is the first whole second after clicking yellow "Let's start" button
# os.path.join keeps the path valid on every OS (no hard-coded backslashes).
path_to_sessions = os.path.join("data", "0_Raw_Data", "uxtweak_sessions.csv")
sessions = pd.read_csv(path_to_sessions, delimiter=";")

logging.info("Sessions loaded")
logging.info(f"Sessions shape: {sessions.shape}")

# Only these three columns are used by the rest of the notebook.
sessions = sessions[["Variant", "Respondent", "Offset"]]

In [None]:
# Display the loaded sessions table (Variant, Respondent, Offset) for a visual check.
sessions

In [None]:
# Folders with the processed interaction CSVs of each group.
# os.path.join keeps the paths valid on every OS (no hard-coded backslashes).
path_fg = os.path.join("data", "3_UXtweak_Mouse_Data_Processing", "FG")
path_h = os.path.join("data", "3_UXtweak_Mouse_Data_Processing", "H")

extracted_fg = exctract_big_5_answers_from_interactions(path_fg, elaboration_pages, questions_before_elaborations, missing_values_map)
extracted_fg["group"] = "FG"
logging.info("FG Big 5 extracted")
logging.info(f"FG shape: {extracted_fg.shape}")

extracted_h = exctract_big_5_answers_from_interactions(path_h, elaboration_pages, questions_before_elaborations, missing_values_map)
extracted_h["group"] = "H"
logging.info("H Big 5 extracted")
logging.info(f"H shape: {extracted_h.shape}")

In [None]:
def _attach_offset(extracted, sessions, variant):
    """Add the ``Respondent`` and ``Offset`` columns of one variant to ``extracted``.

    Rows are matched on ``extracted["order"] == sessions["Respondent"]`` (inner
    join), using only the sessions whose ``Variant`` equals ``variant``.
    """
    offsets = sessions.loc[sessions["Variant"] == variant, ["Respondent", "Offset"]]
    # Drop the columns a previous run of this cell may have added; otherwise a
    # re-run would produce Offset_x / Offset_y and break later row["Offset"] lookups.
    base = extracted.drop(columns=["Respondent", "Offset"], errors="ignore")
    return base.merge(offsets, left_on="order", right_on="Respondent")


extracted_fg = _attach_offset(extracted_fg, sessions, "FG")
extracted_h = _attach_offset(extracted_h, sessions, "H")

In [None]:
# Show the FG rows ordered by respondent number; this returns a sorted copy
# and leaves extracted_fg itself untouched.
extracted_fg.sort_values("order")

In [None]:
# Show the H rows ordered by respondent number; this returns a sorted copy
# and leaves extracted_h itself untouched.
extracted_h.sort_values("order")

In [None]:
def _run_ffmpeg(arguments, description):
    """Run ``ffmpeg -y <arguments>`` and return True when it exits with code 0.

    Failures (ffmpeg cannot be started, or a non-zero exit code) are written to
    the log as errors instead of being silently ignored.
    """
    try:
        # Argument list instead of a shell string: paths with spaces or quotes
        # need no manual quoting. -y overwrites existing outputs without asking,
        # and stdin is detached so ffmpeg never waits for keyboard input.
        completed = subprocess.run(["ffmpeg", "-y", *arguments], stdin=subprocess.DEVNULL)
    except OSError as error:
        logging.error(f"Could not start ffmpeg to {description}: {error}")
        return False

    if completed.returncode != 0:
        logging.error(f"ffmpeg failed to {description} (exit code {completed.returncode})")
        return False
    return True


@timer
def extract_video_chunks(video_path, start_end, save_path, name, padding=5):
    """Cut one chunk out of a video and export its audio as .aac and .wav.

    Parameters:
        video_path: path of the source video.
        start_end: ``(start, end)`` of the chunk in seconds; both are rounded
            to whole seconds before the padding is applied.
        save_path: existing folder that receives the output files.
        name: file name (without extension) of the three outputs
            ``<name>.mp4``, ``<name>.aac`` and ``<name>.wav``.
        padding: seconds added before the start and after the end
            (default 5, the value that used to be hard-coded).

    Returns:
        None. If cutting the video chunk fails, the audio exports are skipped.
    """
    logging.info(f"Extracting video chunk from {video_path} to {save_path} with name {name}")

    start, end = start_end

    # A chunk that begins within the first `padding` seconds must not ask
    # ffmpeg for a negative start time.
    start = max(0.0, round(start, 0) - padding)
    end = round(end, 0) + padding

    chunk_path = os.path.join(save_path, f"{name}.mp4")
    aac_path = os.path.join(save_path, f"{name}.aac")
    wav_path = os.path.join(save_path, f"{name}.wav")

    cut_arguments = [
        "-i", video_path,
        "-ss", str(start),
        "-to", str(end),
        "-c:v", "libx264",
        "-c:a", "aac",
        chunk_path,
    ]
    if not _run_ffmpeg(cut_arguments, f"cut video chunk {name}"):
        # Without the chunk there is nothing to extract audio from.
        return
    logging.info(f"Extracted video chunk from {video_path} to {save_path} with name {name} - {start} - {end}")

    # Copy the audio stream of the chunk as-is into an .aac file
    if _run_ffmpeg(["-i", chunk_path, "-vn", "-acodec", "copy", aac_path], f"extract aac sound from video chunk {name}"):
        logging.info(f"Extracted sound from video chunk {name}")

    # Extract sound only from video in wav format without any switches
    # (an existing wav file is overwritten thanks to -y)
    if _run_ffmpeg(["-i", chunk_path, wav_path], f"extract wav sound from video chunk {name}"):
        logging.info(f"Extracted sound from video chunk {name}")

In [None]:
def extract_elaborations_from_video(df):
    """Cut every elaboration of every respondent in ``df`` out of their video.

    Each row must provide ``group``, ``order``, ``Offset`` and the
    ``elaboration_<i>_<j>_start`` / ``elaboration_<i>_<j>_end`` columns for
    i in 1..5 and j in 1..2 (seconds). ``Offset`` is added to every start and
    end before the chunk is extracted.

    A respondent whose output folder already exists is skipped. A respondent
    whose source video is missing is skipped as well, and no output folder is
    created for them, so a later run can still pick them up.
    """
    for _, row in df.iterrows():
        group = row["group"]
        order = row["order"]

        # os.path.join keeps the paths valid on every OS (no hard-coded backslashes)
        video_path = os.path.join("data", "0_Raw_Data", "All_original_videos", f"{group}", f"Respondent_{order}.mp4")
        save_path = os.path.join("data", "6_Elaborations_Extraction", f"{group}", f"respondent_{order}")

        if os.path.exists(save_path):
            logging.info(f"Elaborations for respondent {group} {order} already extracted")
            continue

        # Check the source before creating the output folder: an empty folder
        # would make this respondent look "already extracted" on the next run.
        if not os.path.isfile(video_path):
            logging.warning(f"Video for respondent {group} {order} not found: {video_path}")
            continue

        os.makedirs(save_path)

        logging.info(f"Extracting elaborations for respondent {group} {order}: {video_path}")

        offset = row["Offset"]

        # elaboration name -> (start, end), both shifted by the offset
        elaborations = {
            f"elaboration_{i}_{j}": (
                row[f"elaboration_{i}_{j}_start"] + offset,
                row[f"elaboration_{i}_{j}_end"] + offset,
            )
            for i in range(1, 6)
            for j in range(1, 3)
        }

        logging.info(f"Extracted timestamps for respondent {group} {order}: {video_path}")

        for key, start_end in elaborations.items():
            extract_video_chunks(video_path, start_end, save_path, key)

        logging.info(f"Extracted elaborations for respondent {group} {order}: {video_path}")

In [None]:
# Cut the elaboration chunks for every respondent of the FG group.
extract_elaborations_from_video(extracted_fg)

In [None]:
# Cut the elaboration chunks for every respondent of the H group.
extract_elaborations_from_video(extracted_h)