In [None]:
import logging
import os
from datetime import datetime

current_file_name = "4_Pair_UXtweak_and_SurveyJS"

# One log file per run, named after the timestamp at which this cell executed.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"
# logging.basicConfig opens the log file immediately and raises FileNotFoundError
# when the target folder is missing, so make sure the folder exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file,filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

In [None]:
from helpers.questions import *
from helpers.constants import *

In [None]:
# Let wide frames render up to 500 columns instead of being truncated
pd.options.display.max_columns = 500

In [None]:
# Load the respondent exports of both groups (FG and H); `wd` comes from the helpers imports
respondents_fg_path = wd + '2 UXtweak CSVs\\[DP Lies] Final 1 FG\\[DP Lies] Final 1 FG - Respondents.csv'
respondents_h_path = wd + '2 UXtweak CSVs\\[DP Lies] Final 1 H\\[DP Lies] Final 1 H - Respondents.csv'

respondents_fg = pd.read_csv(respondents_fg_path)
respondents_h = pd.read_csv(respondents_h_path)

logging.info("Data read in")

In [None]:
def clean_respondents(respondents_out, min_time_taken='00:10:00'):
    """Filter and tidy a respondents table.

    Keeps only rows whose 'location' is not 'SK', whose 'status' is
    'completed', whose 'included in analysis' flag is True and whose
    'time taken' is strictly longer than ``min_time_taken``. Adds an
    'ended_at' column ('started at' + 'time taken'), drops the identifier,
    ip, status and task/question counter columns, and sorts by 'ended_at'.

    Parameters
    ----------
    respondents_out : pandas.DataFrame
        Raw respondents table; it is not modified.
    min_time_taken : str or pandas.Timedelta, default '00:10:00'
        Exclusive lower bound on the session duration.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy, ordered by 'ended_at'.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    respondents = respondents_out.copy(deep=True)

    respondents = respondents[respondents['location'] != 'SK']
    respondents = respondents[respondents['status'] == 'completed']
    respondents = respondents[respondents['included in analysis'] == True]

    # Parse the duration once; it is used for both the end time and the filter.
    time_taken = pd.to_timedelta(respondents['time taken'])
    # assign() returns a new frame, so nothing is written into a filtered slice.
    respondents = respondents.assign(
        ended_at=pd.to_datetime(respondents['started at']) + time_taken
    )

    drop_cols = ['identifier', 'ip', 'status', 'included in analysis', 'questions answered',
                 'tasks completed', 'tasks skipped', 'tasks closed', 'tasks successful']
    respondents = respondents.drop(drop_cols, axis=1)

    # Compare real durations: a plain string comparison is only correct when
    # every value is a zero-padded HH:MM:SS string (e.g. it rejects
    # '0 days 00:12:00' although that is longer than ten minutes).
    respondents = respondents[time_taken > pd.to_timedelta(min_time_taken)]

    respondents = respondents.sort_values(by='ended_at')

    return respondents

# Apply the same cleaning to both groups, then log the resulting frame shapes
clean_respondents_fg, clean_respondents_h = (
    clean_respondents(raw_respondents) for raw_respondents in (respondents_fg, respondents_h)
)

logging.info("Respondents cleaned")
for shape_label, cleaned_respondents in (
    ("Respondents FG: ", clean_respondents_fg),
    ("Respondents H: ", clean_respondents_h),
):
    logging.info(shape_label + str(cleaned_respondents.shape))

In [None]:
# Preview the first five cleaned FG respondents
clean_respondents_fg.head(5)

In [None]:
# Preview the first five cleaned H respondents
clean_respondents_h.head(5)

In [None]:
# Load the CSV stored under data/1_SurveyJS_Big5_Data_Processing.
# os.path.join builds the same path on Windows and keeps it valid on other
# systems, where a hard-coded backslash is not a path separator.
evaluated_big_5 = pd.read_csv(
    os.path.join('data', '1_SurveyJS_Big5_Data_Processing', '1_SurveyJS_Big5_Data_Processing_data.csv')
)
evaluated_big_5.head()

In [None]:
def encode_answers(df, glob_all_columns):
    """Return a copy of ``df`` with integer answer columns and an 'encoded' key.

    The columns listed in ``glob_all_columns`` are cast to int, and their
    values are concatenated, in the given column order, into one string per
    row stored in the new 'encoded' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``glob_all_columns``. It is
        left untouched; the original implementation modified it in place,
        which also triggered SettingWithCopyWarning on filtered slices.
    glob_all_columns : sequence of str
        Answer columns, in the order they are concatenated.

    Returns
    -------
    pandas.DataFrame
        The converted copy including the 'encoded' column.

    Raises
    ------
    ValueError
        If a value in the answer columns cannot be converted to int
        (for example a missing answer).
    """
    columns = list(glob_all_columns)
    encoded_df = df.copy()

    # Convert once and build the string from the converted values, so the
    # encoding never depends on the dtype the column keeps after assignment.
    answers_as_int = encoded_df[columns].astype(int)
    encoded_df[columns] = answers_as_int

    # A list comprehension also handles an empty frame, where a row-wise
    # apply() would return a DataFrame instead of a Series.
    answers_as_text = answers_as_int.astype(str).to_numpy()
    encoded_df['encoded'] = [''.join(row) for row in answers_as_text]
    return encoded_df

In [None]:
# Add the 'encoded' answer string to the loaded Big Five table, then split it by group
evaluated_big_5_encoded = encode_answers(evaluated_big_5, glob_all_columns)

evaluated_big_5_encoded_fg = evaluated_big_5_encoded.loc[evaluated_big_5_encoded['group'].eq('FG')]
evaluated_big_5_encoded_h = evaluated_big_5_encoded.loc[evaluated_big_5_encoded['group'].eq('H')]

In [None]:
def get_answers_from_csv(csv_df, glob_all_columns, max_page_number=81):
    """Map the last Likert answer clicked on each page to the answer columns.

    Only rows with ``type == 'click'`` whose ``text`` is one of the five
    Likert labels are considered. For every ``page_name`` the last such click
    is kept, pages numbered above ``max_page_number`` are discarded, and the
    remaining answers are paired positionally with ``glob_all_columns``.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Interaction log with 'type', 'text' and 'page_name' columns; it is
        not modified.
    glob_all_columns : sequence of str
        Answer column names, paired with the answers in page order.
    max_page_number : int, default 81
        Highest page number that still counts as an answer page.

    Returns
    -------
    dict
        ``{column name: answer text}``. ``zip`` truncates to the shorter
        side, so the dict is shorter than ``glob_all_columns`` when fewer
        answers were found and empty when there were none.

    Raises
    ------
    ValueError
        If a kept ``page_name`` contains no digits.
    """
    likert_labels = ["Disagree strongly", "Disagree", "Neutral", "Agree", "Agree strongly"]
    is_answer_click = (csv_df['type'] == 'click') & csv_df['text'].isin(likert_labels)

    # From each page get only the last answer
    last_clicks = csv_df[is_answer_click].drop_duplicates(subset=['page_name'], keep='last')

    # Parse the first run of digits in the page name as the page number.
    # assign() avoids writing a new column into a filtered slice.
    page_numbers = last_clicks['page_name'].str.extract(r'(\d+)', expand=False).astype(int)
    last_clicks = last_clicks.assign(page_number=page_numbers)

    last_clicks = last_clicks[last_clicks['page_number'] <= max_page_number]

    # Rows are still in order of each page's *last* click, so a respondent who
    # returned to an earlier page would shift every answer by one column.
    # Restore page order before the positional pairing; this assumes the
    # columns in glob_all_columns follow ascending page numbers.
    # mergesort is stable, so pages sharing a number keep their log order.
    last_clicks = last_clicks.sort_values('page_number', kind='mergesort')

    answers = last_clicks['text'].tolist()
    return dict(zip(glob_all_columns, answers))

In [None]:
def exctract_big_5_answers_from_interactions(path, glob_all_columns, glob_normal_columns, glob_reversed_columns, glob_normal_likert, glob_reverse_likert):
    """Collect the answers found in every interaction CSV below ``path``.

    Each sub-folder of ``path`` is scanned for ``.csv`` files. Every file is
    turned into one row of answers by ``get_answers_from_csv``; files that
    yield no answers are skipped. The answer texts are then replaced through
    ``glob_normal_likert`` (normal columns) and ``glob_reverse_likert``
    (reversed columns).

    Parameters
    ----------
    path : str
        Directory containing one sub-folder per respondent.
    glob_all_columns : list of str
        All answer columns, in answer order.
    glob_normal_columns, glob_reversed_columns : iterable of str
        Columns mapped with the normal / reversed Likert mapping.
    glob_normal_likert, glob_reverse_likert : dict
        Answer text -> value mappings.

    Returns
    -------
    pandas.DataFrame
        One row per CSV with answers; columns are ``glob_all_columns`` plus
        'order', the second underscore-separated token of the folder name
        (kept as a string).

    Raises
    ------
    IndexError
        If a folder holding a CSV has no underscore in its name.
    """
    records = []

    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        # A stray file next to the respondent folders would make listdir fail.
        if not os.path.isdir(folder_path):
            continue

        for file in sorted(os.listdir(folder_path)):
            if not file.endswith(".csv"):
                continue

            csv_df = pd.read_csv(os.path.join(folder_path, file))

            # Get number from the folder name
            number = folder.split("_")[1]
            answers_dictionary = get_answers_from_csv(csv_df, glob_all_columns)
            # Skip files without any usable answer (empty dict or only missing
            # values); pd.isna is used because `is np.nan` only matches the
            # np.nan singleton itself.
            if all(pd.isna(value) for value in answers_dictionary.values()):
                continue
            answers_dictionary['order'] = number
            records.append(answers_dictionary)

    # Build the frame once instead of concatenating inside the loop (quadratic).
    df = pd.DataFrame(records, columns=list(glob_all_columns) + ['order'])

    # update() writes only non-missing mapped values, so a text without a
    # mapping entry is left unchanged.
    df.update(df[list(glob_normal_columns)].apply(lambda col: col.map(glob_normal_likert)))
    df.update(df[list(glob_reversed_columns)].apply(lambda col: col.map(glob_reverse_likert)))

    return df
                

In [None]:
# Per-group input folders. os.path.join yields the same paths on Windows and
# keeps them valid elsewhere, where a backslash is not a path separator.
path_fg = os.path.join('data', '3_UXtweak_Mouse_Data_Processing', 'FG')
path_h = os.path.join('data', '3_UXtweak_Mouse_Data_Processing', 'H')

# Extract the answers of each group and tag every row with its group label.
extracted_fg = exctract_big_5_answers_from_interactions(path_fg, glob_all_columns, glob_normal_columns, glob_reversed_columns, glob_normal_likert, glob_reverse_likert)
extracted_fg["group"] = "FG"

extracted_h = exctract_big_5_answers_from_interactions(path_h, glob_all_columns, glob_normal_columns, glob_reversed_columns, glob_normal_likert, glob_reverse_likert)
extracted_h["group"] = "H"

In [None]:
# TODO: fix respondent H 40 - until then, rows whose 'order' is '40' are left out of group H
extracted_h = extracted_h.loc[extracted_h['order'].ne('40')]

In [None]:
# Add the 'encoded' answer string to the extracted answers of both groups
extracted_fg_encoded, extracted_h_encoded = (
    encode_answers(extracted_group, glob_all_columns) for extracted_group in (extracted_fg, extracted_h)
)

In [None]:
# Inner join on the 'encoded' column: only rows whose encoded answers appear
# on both sides are kept; clashing column names get the two suffixes.
merged_fg = evaluated_big_5_encoded_fg.merge(
    extracted_fg_encoded, how='inner', on='encoded', suffixes=('_evaluated', '_extracted')
)
merged_h = evaluated_big_5_encoded_h.merge(
    extracted_h_encoded, how='inner', on='encoded', suffixes=('_evaluated', '_extracted')
)

In [None]:
# The empty last component keeps the trailing separator, so on Windows the
# value is identical to the previous hard-coded "data\\...\\" string while
# staying valid on other systems.
path_to_save = os.path.join("data", "4_Pair_UXtweak_and_SurveyJS", "")
# to_csv does not create missing folders; make sure the target exists.
os.makedirs(path_to_save, exist_ok=True)

# Stack the paired rows of both groups and write them to one CSV.
concatenated = pd.concat([merged_fg, merged_h], ignore_index=True)

concatenated.to_csv(os.path.join(path_to_save, "4_Pair_UXtweak_and_SurveyJS_data.csv"), index=False)

In [None]:
# Number of paired rows across both groups
print(concatenated.shape[0])

In [None]:
# FG row counts: evaluated side, extracted side, rows kept by the merge
print(*(len(frame) for frame in (evaluated_big_5_encoded_fg, extracted_fg_encoded, merged_fg)))

In [None]:
# Display the merged FG frame for manual inspection
merged_fg

In [None]:
# H row counts: evaluated side, extracted side, rows kept by the merge
print(*(len(frame) for frame in (evaluated_big_5_encoded_h, extracted_h_encoded, merged_h)))

In [None]:
# Display the merged H frame for manual inspection
merged_h