In [None]:
import logging
import os
from datetime import datetime

# Name of this notebook; it namespaces the log folder here and the output folder further down.
current_file_name = "1_SurveyJS_Big5_Data_Processing"

# One log file per run, named after the start time of the run.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file straight away and does not create missing
# parent folders, so make sure the folder exists (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file, filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

In [None]:
from helpers.questions import *
from helpers.constants import *

In [None]:
# Raise the display limit to 500 columns so wide frames are shown without column truncation.
pd.set_option('display.max_columns', 500)

In [None]:
# Read excel sheet results
# Paths are assembled with os.path.join so the separator matches the host OS
# (the previous hard-coded '\\' only resolved on Windows).
# wd / wd_pilot come from the star-imports above (presumably helpers.constants — confirm).
results_dir = os.path.join(wd, '3 SurveyJS Results')
results_dir_pilot = os.path.join(wd_pilot, '3 SurveyJS Results')

survey_fg = pd.read_excel(os.path.join(results_dir, 'DP-Final-1-FG.xlsx'))
survey_h = pd.read_excel(os.path.join(results_dir, 'DP-Final-1-H.xlsx'))

survey_fg_pilot = pd.read_excel(os.path.join(results_dir_pilot, 'DP-Pilot-1-FG.xlsx'))
survey_h_pilot = pd.read_excel(os.path.join(results_dir_pilot, 'DP-Pilot-1-H.xlsx'))

logging.info("SurveyJS results read in")

In [None]:
# Row counts of the four raw exports: FG main, H main, FG pilot, H pilot.
print(len(survey_fg), len(survey_h), len(survey_fg_pilot), len(survey_h_pilot))

In [None]:
# Concatenate pilot and main data
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# main and pilot rows keep their own labels and the index contains duplicates.
# NOTE: this cell rebinds survey_fg / survey_h, so running it twice appends the
# pilot rows twice — re-run the loading cell first when re-executing.
survey_fg = pd.concat([survey_fg, survey_fg_pilot], ignore_index=True)
survey_h = pd.concat([survey_h, survey_h_pilot], ignore_index=True)

In [None]:
# Row counts per group after the pilot rows were appended.
print(len(survey_fg), len(survey_h))

In [None]:
def clean_survey(survey_out, glob_big5_questions, glob_all_columns, glob_normal_columns, glob_reversed_columns, glob_normal_likert, glob_reverse_likert, gt_map=None):
    """Rename, validate and numerically code one raw SurveyJS export.

    Works on a deep copy; ``survey_out`` is not modified.

    Steps, in order:
      1. Replace each question text in the column names with its short item name
         and shorten the ' - Answer' / ' - Ground Truth' suffixes to '_a' / '_gt'.
      2. Drop columns whose name starts with 'instructions' or 'elaboration'.
      3. Check that every item column equals its '_a' duplicate, then drop the
         '_a' columns.
      4. Map text answers to numbers (normal / reversed Likert, ground truth).
      5. Parse 'Submitted' to timezone-naive datetimes, sort by it, and drop
         submissions made between 2024-01-10 and 2024-01-25 (test submissions).

    Parameters
    ----------
    survey_out : pandas.DataFrame
        Raw export; must contain a 'Submitted' column.
    glob_big5_questions : sequence of str
        Question texts as they appear inside the raw column names.
    glob_all_columns : sequence of str
        Short item names, in the same order as ``glob_big5_questions``.
    glob_normal_columns, glob_reversed_columns : iterable of str
        Item names coded with ``glob_normal_likert`` / ``glob_reverse_likert``.
    glob_normal_likert, glob_reverse_likert : mapping
        Answer text -> number.
    gt_map : mapping, optional
        Ground-truth text -> number. Defaults to the module-level
        ``glob_gt_map``, which is what this function always used before.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, sorted by 'Submitted'.

    Raises
    ------
    ValueError
        If an item column differs from its '_a' duplicate.
    """
    if gt_map is None:
        # Existing callers do not pass a mapping; keep using the module-level one.
        gt_map = glob_gt_map

    survey = survey_out.copy(deep=True)

    # In column names, replace question text with the short item name.
    # regex=False: the texts are literal strings; pandas < 2.0 defaults to
    # regex=True and would read characters such as '.', '(' or '?' as regex syntax.
    for i in range(len(glob_big5_questions)):
        survey.columns = survey.columns.str.replace(glob_big5_questions[i], glob_all_columns[i], regex=False)

    # Remove the shared matrix-question prefix from column names
    survey.columns = survey.columns.str.replace('Choose honestly whether each answer is a Lie, a Half-truth, or a Truth - ', '', regex=False)
    # Replace ' - Answer' with '_a'
    survey.columns = survey.columns.str.replace(' - Answer', '_a', regex=False)
    # Replace ' - Ground Truth' with '_gt'
    survey.columns = survey.columns.str.replace(' - Ground Truth', '_gt', regex=False)

    # Remove columns starting with 'instructions' and 'elaboration'
    survey = survey.loc[:, ~survey.columns.str.startswith('instructions')]
    survey = survey.loc[:, ~survey.columns.str.startswith('elaboration')]

    # Compare each question column with its _a column and throw an error if they differ
    for column in glob_all_columns:
        if not survey[column].equals(survey[column + '_a']):
            print("Error in column " + column)
            print(survey[column], survey[column + '_a'])
            raise ValueError('Columns are not the same')

    # Remove _a columns (validated duplicates of the item columns)
    survey = survey.loc[:, ~survey.columns.str.endswith('_a')]

    # Replace text answers with numbers.
    # DataFrame.update skips NaN, so an answer with no entry in the mapping is
    # left as its original text rather than being blanked out.
    survey.update(survey[list(glob_normal_columns)].apply(lambda col: col.map(glob_normal_likert)))
    survey.update(survey[list(glob_reversed_columns)].apply(lambda col: col.map(glob_reverse_likert)))

    # Replace text answers in _gt columns with numbers
    all_gt_columns = [col + '_gt' for col in glob_all_columns]
    survey.update(survey[all_gt_columns].apply(lambda col: col.map(gt_map)))

    # Submitted column to datetime without timezone
    survey['Submitted'] = pd.to_datetime(survey['Submitted']).dt.tz_localize(None)

    # Order by Submitted (once, on the parsed datetimes; the earlier sort on the
    # unparsed column was overridden by this one and has been removed)
    survey = survey.sort_values(by='Submitted')

    # Remove submissions between 2024-01-10 00:00:00 and 2024-01-25 00:00:00 as they are test submissions
    test_window_start = pd.Timestamp('2024-01-10 00:00:00')
    test_window_end = pd.Timestamp('2024-01-25 00:00:00')
    survey = survey[(survey['Submitted'] < test_window_start) | (survey['Submitted'] > test_window_end)]

    return survey

# Clean both groups' exports with the same question/column/Likert definitions.
survey_fg_clean = clean_survey(
    survey_fg,
    glob_big5_questions,
    glob_all_columns,
    glob_normal_columns,
    glob_reversed_columns,
    glob_normal_likert,
    glob_reverse_likert,
)
survey_h_clean = clean_survey(
    survey_h,
    glob_big5_questions,
    glob_all_columns,
    glob_normal_columns,
    glob_reversed_columns,
    glob_normal_likert,
    glob_reverse_likert,
)

# Record the resulting shapes in the run log.
logging.info("SurveyJS results cleaned")
logging.info(f"Survey FG: {survey_fg_clean.shape}")
logging.info(f"Survey H: {survey_h_clean.shape}")

In [None]:
# Row counts per group after cleaning (test submissions removed).
print(len(survey_fg_clean), len(survey_h_clean))

In [None]:
# Display the cleaned H-group frame for visual inspection.
survey_h_clean

In [None]:
# Item columns belonging to each facet; names starting with 'r' are the
# reverse-keyed items (they are coded with the reversed Likert mapping).

# Extraversion facets
bfi2_e_sociability_columns = ["bfi1", "rbfi16", "rbfi31", "bfi46"]
bfi2_e_assertiveness_columns = ["bfi6", "bfi21", "rbfi36", "rbfi51"]
bfi2_e_energy_level_columns = ["rbfi11", "rbfi26", "bfi41", "bfi56"]

# Agreeableness facets
bfi2_a_compassion_columns = ["bfi2", "rbfi17", "bfi32", "rbfi47"]
bfi2_a_respectfulness_columns = ["bfi7", "rbfi22", "rbfi37", "bfi52"]
bfi2_a_trust_columns = ["rbfi12", "bfi27", "rbfi42", "bfi57"]

# Conscientiousness facets
bfi2_c_organization_columns = ["rbfi3", "bfi18", "bfi33", "rbfi48"]
bfi2_c_productiveness_columns = ["rbfi8", "rbfi23", "bfi38", "bfi53"]
bfi2_c_responsibility_columns = ["bfi13", "rbfi28", "bfi43", "rbfi58"]

# Neuroticism facets
bfi2_n_anxiety_columns = ["rbfi4", "bfi19", "bfi34", "rbfi49"]
bfi2_n_depression_columns = ["rbfi9", "rbfi24", "bfi39", "bfi54"]
bfi2_n_emotional_volatility_columns = ["bfi14", "rbfi29", "rbfi44", "bfi59"]

# Openness facets
bfi2_o_intellectual_curiosity_columns = ["bfi10", "rbfi25", "bfi40", "rbfi55"]
bfi2_o_aesthetic_sensitivity_columns = ["rbfi5", "bfi20", "bfi35", "rbfi50"]
bfi2_o_creative_imagination_columns = ["bfi15", "rbfi30", "rbfi45", "bfi60"]

# Each domain is the concatenation of its three facets, in the order listed above.
bfi2_e_columns = [
    *bfi2_e_sociability_columns,
    *bfi2_e_assertiveness_columns,
    *bfi2_e_energy_level_columns,
]
bfi2_a_columns = [
    *bfi2_a_compassion_columns,
    *bfi2_a_respectfulness_columns,
    *bfi2_a_trust_columns,
]
bfi2_c_columns = [
    *bfi2_c_organization_columns,
    *bfi2_c_productiveness_columns,
    *bfi2_c_responsibility_columns,
]
bfi2_n_columns = [
    *bfi2_n_anxiety_columns,
    *bfi2_n_depression_columns,
    *bfi2_n_emotional_volatility_columns,
]
bfi2_o_columns = [
    *bfi2_o_intellectual_curiosity_columns,
    *bfi2_o_aesthetic_sensitivity_columns,
    *bfi2_o_creative_imagination_columns,
]

In [None]:
def evaluate_big5(clean_survey_out, glob_all_columns):
    """Add total, domain, facet and ground-truth summary columns to a cleaned survey.

    Works on a deep copy of ``clean_survey_out``. The new columns are placed in
    front of the existing ones, in this order: total_points, mean_points,
    gt_lies, gt_half_truths, gt_truths, mean_points_check, the five domain
    means (bfi2_e, bfi2_a, bfi2_c, bfi2_n, bfi2_o) and the fifteen facet means.

    The facet/domain item lists are read from the module-level
    ``bfi2_*_columns`` variables.

    Parameters
    ----------
    clean_survey_out : pandas.DataFrame
        Frame holding the item columns and their '<item>_gt' companions.
    glob_all_columns : list of str
        Names of all item columns.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the summary columns inserted.
    """
    scored = clean_survey_out.copy(deep=True)

    # Overall score across every item.
    scored.insert(0, "total_points", scored.loc[:, glob_all_columns].sum(axis=1))
    scored.insert(1, "mean_points", scored.loc[:, glob_all_columns].mean(axis=1))

    # Facet means, inserted at positions 2..16 in this order.
    facet_items = (
        ("bfi2_e_sociability", bfi2_e_sociability_columns),
        ("bfi2_e_assertiveness", bfi2_e_assertiveness_columns),
        ("bfi2_e_energy_level", bfi2_e_energy_level_columns),
        ("bfi2_a_compassion", bfi2_a_compassion_columns),
        ("bfi2_a_respectfulness", bfi2_a_respectfulness_columns),
        ("bfi2_a_trust", bfi2_a_trust_columns),
        ("bfi2_c_organization", bfi2_c_organization_columns),
        ("bfi2_c_productiveness", bfi2_c_productiveness_columns),
        ("bfi2_c_responsibility", bfi2_c_responsibility_columns),
        ("bfi2_n_anxiety", bfi2_n_anxiety_columns),
        ("bfi2_n_depression", bfi2_n_depression_columns),
        ("bfi2_n_emotional_volatility", bfi2_n_emotional_volatility_columns),
        ("bfi2_o_intellectual_curiosity", bfi2_o_intellectual_curiosity_columns),
        ("bfi2_o_aesthetic_sensitivity", bfi2_o_aesthetic_sensitivity_columns),
        ("bfi2_o_creative_imagination", bfi2_o_creative_imagination_columns),
    )
    for position, (facet_name, item_columns) in enumerate(facet_items, start=2):
        scored.insert(position, facet_name, scored.loc[:, item_columns].mean(axis=1))

    # Domain means go in front of the facets (positions 2..6).
    domain_items = (
        ("bfi2_e", bfi2_e_columns),
        ("bfi2_a", bfi2_a_columns),
        ("bfi2_c", bfi2_c_columns),
        ("bfi2_n", bfi2_n_columns),
        ("bfi2_o", bfi2_o_columns),
    )
    for position, (domain_name, item_columns) in enumerate(domain_items, start=2):
        scored.insert(position, domain_name, scored.loc[:, item_columns].mean(axis=1))

    # Mean of the five domain means; placed next to mean_points for comparison.
    domain_names = [domain_name for domain_name, _ in domain_items]
    scored.insert(2, "mean_points_check", scored.loc[:, domain_names].mean(axis=1))

    # Per-respondent occurrences of 1, 0.5 and 0 in the ground-truth columns.
    # The code counts 1 as a lie and 0 as a truth — presumably matching
    # glob_gt_map; confirm against helpers.constants.
    all_gt_columns = [col + '_gt' for col in glob_all_columns]
    gt_codes = (("gt_lies", 1), ("gt_half_truths", 0.5), ("gt_truths", 0))
    for position, (count_name, code) in enumerate(gt_codes, start=2):
        scored.insert(position, count_name, scored[all_gt_columns].eq(code).sum(axis=1))

    return scored

# Score each group separately with the same item list.
evaluated_fg = evaluate_big5(clean_survey_out=survey_fg_clean, glob_all_columns=glob_all_columns)
evaluated_h = evaluate_big5(clean_survey_out=survey_h_clean, glob_all_columns=glob_all_columns)

# Record the resulting shapes in the run log.
logging.info("Big5 evaluated")
logging.info(f"Evaluated FG: {evaluated_fg.shape}")
logging.info(f"Evaluated H: {evaluated_h.shape}")

In [None]:
# Display the scored H-group frame for visual inspection.
evaluated_h

In [None]:
def merge_fg_h(evaluated_fg_out, evaluated_h_out):
    """Stack the two scored group frames into one frame with a leading 'group' column.

    Both inputs are deep-copied first, so neither is modified.

    Parameters
    ----------
    evaluated_fg_out : pandas.DataFrame
        Scored frame of the FG group; its rows are labelled 'FG'.
    evaluated_h_out : pandas.DataFrame
        Scored frame of the H group; its rows are labelled 'H'.

    Returns
    -------
    pandas.DataFrame
        FG rows followed by H rows, with 'group' as the first column and a
        fresh 0..n-1 index.
    """
    evaluated_fg = evaluated_fg_out.copy(deep=True)
    evaluated_h = evaluated_h_out.copy(deep=True)

    evaluated_fg.insert(0, "group", 'FG')
    evaluated_h.insert(0, "group", 'H')

    # ignore_index=True: the two groups carry overlapping row labels, which would
    # otherwise leave duplicate index values in the merged frame and make any
    # label-based selection or alignment on it ambiguous.
    merged = pd.concat([evaluated_fg, evaluated_h], ignore_index=True)

    return merged

# Combine both scored groups into the single analysis frame used below.
data = merge_fg_h(evaluated_fg_out=evaluated_fg, evaluated_h_out=evaluated_h)

# Record the merged shape in the run log.
logging.info("Data merged")
logging.info(f"Data: {data.shape}")

In [None]:
# Save the data
# The path is assembled with os.path.join so the separator matches the host OS
# (a hard-coded '\\' only forms a folder path on Windows), and the output folder
# is created first because to_csv does not create missing folders.
output_dir = os.path.join('data', current_file_name)
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f'{current_file_name}_data.csv')

# index=False: the row index is not written to the file.
data.to_csv(output_file, index=False)

logging.info("Data saved to " + output_file)

In [None]:
# Display the merged frame for visual inspection.
data

In [None]:
# Totals of the per-respondent ground-truth counts across the whole sample.
for label, column in (("Lies", "gt_lies"), ("Half-truths", "gt_half_truths"), ("Truths", "gt_truths")):
    print(f"{label}: {data[column].sum()}")

In [None]:
def plot_boxplots(data):
    """Draw one boxplot per Big Five domain mean, split by group, in a single row.

    Expects ``data`` to hold a 'group' column and the domain columns
    'bfi2_e', 'bfi2_a', 'bfi2_c', 'bfi2_n' and 'bfi2_o'.
    """
    # (score column, panel title), left to right
    domain_panels = (
        ('bfi2_e', 'Extraversion'),
        ('bfi2_a', 'Agreeableness'),
        ('bfi2_c', 'Conscientiousness'),
        ('bfi2_n', 'Neuroticism'),
        ('bfi2_o', 'Openness'),
    )

    # One row of five panels
    fig, axes = plt.subplots(1, 5, figsize=(15, 5))

    for ax, (score_column, title) in zip(axes, domain_panels):
        sns.boxplot(ax=ax, x='group', y=score_column, data=data)
        ax.set_title(title)

plot_boxplots(data)

In [None]:
def plot_boxplots_facets(data):
    """Draw one boxplot per facet mean, split by group, on a 5 x 3 grid.

    Each grid row holds the three facets of one domain. Expects ``data`` to
    hold a 'group' column and the fifteen 'bfi2_<domain>_<facet>' columns.
    """
    # (score column, panel title) in row-major grid order
    facet_panels = (
        ("bfi2_e_sociability", "Extraversion - Sociability"),
        ("bfi2_e_assertiveness", "Extraversion - Assertiveness"),
        ("bfi2_e_energy_level", "Extraversion - Energy level"),
        ("bfi2_a_compassion", "Agreeableness - Compassion"),
        ("bfi2_a_respectfulness", "Agreeableness - Respectfulness"),
        ("bfi2_a_trust", "Agreeableness - Trust"),
        ("bfi2_c_organization", "Conscientiousness - Organization"),
        ("bfi2_c_productiveness", "Conscientiousness - Productiveness"),
        ("bfi2_c_responsibility", "Conscientiousness - Responsibility"),
        ("bfi2_n_anxiety", "Neuroticism - Anxiety"),
        ("bfi2_n_depression", "Neuroticism - Depression"),
        ("bfi2_n_emotional_volatility", "Neuroticism - Emotional volatility"),
        ("bfi2_o_intellectual_curiosity", "Openness - Intellectual curiosity"),
        ("bfi2_o_aesthetic_sensitivity", "Openness - Aesthetic sensitivity"),
        ("bfi2_o_creative_imagination", "Openness - Creative imagination"),
    )

    # Five rows (domains) by three columns (facets)
    fig, axes = plt.subplots(5, 3, figsize=(15, 20))

    # axes.flat walks the grid row by row, matching the order of facet_panels.
    for ax, (score_column, title) in zip(axes.flat, facet_panels):
        sns.boxplot(ax=ax, x="group", y=score_column, data=data)
        ax.set_title(title)

plot_boxplots_facets(data)

In [None]:
def plot_gt_counts(data):
    """Draw one bar plot per ground-truth count column, split by group.

    Expects ``data`` to hold a 'group' column and the count columns
    'gt_lies', 'gt_half_truths' and 'gt_truths'.
    """
    # (count column, panel title), left to right
    count_panels = (
        ('gt_lies', 'Lies'),
        ('gt_half_truths', 'Half-truths'),
        ('gt_truths', 'Truths'),
    )

    # One row of three panels
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    for ax, (count_column, title) in zip(axes, count_panels):
        sns.barplot(ax=ax, x='group', y=count_column, data=data)
        ax.set_title(title)

plot_gt_counts(data)

In [None]:
def plot_ground_truths(data_out):
    """Boxplot the mean ground-truth code per Big Five domain, split by group.

    For every respondent the '<item>_gt' values of a domain's items are
    averaged; the five resulting columns are plotted in one row of panels.
    Works on a deep copy, so ``data_out`` is not modified.

    NOTE(review): evaluate_big5 counts a code of 1 as a lie and 0 as a truth,
    while the previous comment here listed (Lie, Half-truth, Truth) as
    (0, 0.5, 1) — confirm the coding against glob_gt_map before interpreting
    the direction of these plots.
    """
    data = data_out.copy(deep=True)

    # (new column, item columns of the domain, panel title), left to right
    domain_panels = (
        ("bfi2_e_gt", bfi2_e_columns, 'Extraversion'),
        ("bfi2_a_gt", bfi2_a_columns, 'Agreeableness'),
        ("bfi2_c_gt", bfi2_c_columns, 'Conscientiousness'),
        ("bfi2_n_gt", bfi2_n_columns, 'Neuroticism'),
        ("bfi2_o_gt", bfi2_o_columns, 'Openness'),
    )

    # Mean ground-truth code per domain, added to the local copy only.
    for gt_mean_column, item_columns, _ in domain_panels:
        gt_columns = [x + '_gt' for x in item_columns]
        data.insert(0, gt_mean_column, data.loc[:, gt_columns].mean(axis=1))

    # One row of five panels
    fig, axes = plt.subplots(1, 5, figsize=(15, 5))

    for ax, (gt_mean_column, _, title) in zip(axes, domain_panels):
        sns.boxplot(ax=ax, x='group', y=gt_mean_column, data=data)
        ax.set_title(title)

plot_ground_truths(data)


In [None]:
def stats_ground_truths_in_verbal_elaborations(data_out):
    """Print ground-truth totals for the ten items listed below.

    Prints, in order: the number of rows, the per-column sum of the ten
    '<item>_gt' columns after recoding 0.5 as 1, and the grand total of those
    sums. Works on a deep copy, so ``data_out`` is not modified; returns None.

    The function name suggests these are the items with verbal elaborations —
    the selection itself is hard-coded here.
    """
    frame = data_out.copy(deep=True)

    print(len(frame))

    elaboration_items = (
        'rbfi4', 'rbfi8', 'bfi15', 'bfi18', 'rbfi30',
        'bfi32', 'bfi39', 'bfi41', 'rbfi51', 'bfi52',
    )
    elaboration_columns = [item + '_gt' for item in elaboration_items]

    # Fold 0.5 into 1 so that the column sums count both codes equally.
    frame[elaboration_columns] = frame[elaboration_columns].replace(0.5, 1)

    per_item_totals = frame[elaboration_columns].sum()
    print(per_item_totals)
    print(frame[elaboration_columns].sum().sum())


# Print the row count and the recoded ground-truth totals for the merged data.
stats_ground_truths_in_verbal_elaborations(data)