In [None]:
import logging
import os
from datetime import datetime

current_file_name = "17_Graphs_and_Charts"

# One log file per run, named after the start timestamp.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file right away and does not create missing parent
# folders, so make sure the per-notebook log directory exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file,filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

import shap

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.questions import *
from helpers.utils import *
from helpers.machine_learning import *

In [None]:
# Output folder for the figures and tables produced by this notebook. Built with
# os.path.join so the separator matches the host OS; the trailing "" keeps the
# trailing separator that the `data_path_base + "<file name>"` expressions rely on.
data_path_base = os.path.join("data", "17_Graphs_and_Charts", "")
# plt.savefig does not create missing folders, so create the folder up front.
os.makedirs(data_path_base, exist_ok=True)

## Train Test Split

In [None]:
# Fixed train/test assignment of respondents, listed separately for the two
# variants (FG and H). The "respondent_" prefix is replaced by the variant code
# further down in this cell, so the entries can be matched against the
# "<Variant>_<number>" ID columns built later in the notebook.
train_fg_respondents = ['respondent_43', 'respondent_26', 'respondent_35', 'respondent_31', 'respondent_53', 'respondent_21', 'respondent_22', 'respondent_50', 'respondent_42', 'respondent_55', 'respondent_54', 'respondent_16', 'respondent_9', 'respondent_105', 'respondent_37', 'respondent_58', 'respondent_38', 'respondent_51', 'respondent_106', 'respondent_15', 'respondent_52', 'respondent_25', 'respondent_12', 'respondent_56', 'respondent_46', 'respondent_36']
train_h_respondents = ['respondent_8', 'respondent_24', 'respondent_42', 'respondent_17', 'respondent_29', 'respondent_108', 'respondent_30', 'respondent_39', 'respondent_58', 'respondent_10', 'respondent_19', 'respondent_53', 'respondent_45', 'respondent_52', 'respondent_33', 'respondent_16', 'respondent_21', 'respondent_32', 'respondent_23', 'respondent_35', 'respondent_47', 'respondent_48', 'respondent_31', 'respondent_20']
test_fg_respondents = ['respondent_104', 'respondent_18', 'respondent_34', 'respondent_40', 'respondent_45', 'respondent_48', 'respondent_49']
test_h_respondents = ['respondent_107', 'respondent_110', 'respondent_22', 'respondent_27', 'respondent_50', 'respondent_57', 'respondent_9']

def remove_prefix_from_list(list, prefix, variant):
    """Turn raw respondent labels into "<variant>_<suffix>" identifiers.

    Args:
        list: Iterable of strings such as "respondent_43". (The parameter name
            shadows the builtin; it is kept so existing callers keep working.)
        prefix: Leading text to strip from each item, e.g. "respondent_".
        variant: Variant code placed in front of each stripped item, e.g. "FG".

    Returns:
        A new list holding one "<variant>_<item without prefix>" string per
        input item. Items that do not start with `prefix` are kept whole.
    """
    # Strip only a *leading* prefix; str.replace would also delete occurrences
    # that appear later in the label.
    stripped = (x[len(prefix):] if x.startswith(prefix) else x for x in list)
    return [f"{variant}_{item}" for item in stripped]

# Re-key every split from "respondent_<n>" to "<variant>_<n>" (e.g. "FG_43").
train_fg_respondents, test_fg_respondents = (
    remove_prefix_from_list(split, "respondent_", "FG")
    for split in (train_fg_respondents, test_fg_respondents)
)
train_h_respondents, test_h_respondents = (
    remove_prefix_from_list(split, "respondent_", "H")
    for split in (train_h_respondents, test_h_respondents)
)

# Split sizes, one per line: train FG, train H, test FG, test H.
print(
    *map(len, (train_fg_respondents, train_h_respondents, test_fg_respondents, test_h_respondents)),
    sep="\n",
)

## Demography

In [None]:
# Load the "Prod" sheet of the respondent workbook. The path is assembled with
# os.path.join so the notebook also runs where "\" is not a path separator.
demographics = pd.read_excel(os.path.join("data", "0_Raw_Data", "Respondents.xlsx"), sheet_name="Prod")
# Keep only the columns used by the demographic charts.
demographics = demographics[["Variant", "Respondent", "Gender", "Age", "Highest level of education", "Current employment status", "Input device"]]

In [None]:
# Translate the column headers to Slovak ("Variant" and "Respondent" map to themselves).
demographics = demographics.rename(
    columns={
        "Variant": "Variant",
        "Respondent": "Respondent",
        "Gender": "Pohlavie",
        "Age": "Vek",
        "Highest level of education": "Najvyššie dosiahnuté vzdelanie",
        "Current employment status": "Aktuálny zamestnanecký status",
        "Input device": "Zariadenie",
    }
)

In [None]:
# Unique respondent key of the form "<Variant>_<Respondent>".
demographics["ID"] = demographics["Variant"].add("_").add(demographics["Respondent"].astype(str))

In [None]:
# Display the demographics table (rich notebook output of the last expression).
demographics

In [None]:
# Bucket ages into labelled groups.
# The bins are right-inclusive so every label means what it says: "18-30"
# covers 18..30, "31-40" covers 31..40, and so on up to 80. include_lowest keeps
# age 18 inside the first bin. (With left-closed bins a 30-year-old would be
# filed under "31-40" and an 80-year-old would fall outside every bin.)
age_bins = [18, 30, 40, 50, 60, 70, 80]
age_labels = ['18-30', '31-40', '41-50', '51-60', '61-70', '71-80']
demographics['Veková skupina'] = pd.cut(demographics['Vek'], bins=age_bins, labels=age_labels, right=True, include_lowest=True)

In [None]:
# English answer values -> Slovak labels shown in the charts.
rename_dict = {
    "Male": "Muž",
    "Female": "Žena",
    "Non-binary": "Nebinárna osoba",
    "High School": "Stredoškolské",
    "Bachelor's Degree": "Bakalárske",
    "Master's Degree": "Magisterské",
    "Ph.D. or higher": "PhD. a vyššie",
    "Trade School": "Odborné",
    "Other": "Iné",
    "Employed Full-Time": "Plný úväzok",
    "Employed Part-Time": "Čiastočný úväzok",
    # Spelling fixed: the label is printed on the employment-status chart.
    "Seeking opportunities": "Nezamestnaný",
    "Retired": "Na dôchodku",
    "Prefer not to say": "Nechcem uviesť",
    "Mouse": "Myš",
    "Touchpad": "Touchpad",
    "Smartphone Touchscreen": "Dotyková obrazovka smartfónu",
    "Tablet Stylus or Touchscreen": "Tabletový stylus alebo dotyková obrazovka",
}

# replace() matches whole cell values in every column of the frame.
demographics = demographics.replace(rename_dict)

In [None]:
# Custom function to format labels with percentages and respondent counts
def autopct_with_counts(pct, values):
    """Format a pie-wedge label as "<percent>% (<count>)".

    Args:
        pct: Share of the wedge in percent (0-100), as handed to an
            `autopct` callable by matplotlib.
        values: The counts behind the whole pie; their sum is the total.

    Returns:
        A string such as "33.3% (1)".
    """
    total = sum(values)
    # Round instead of truncating: pct is a float, so pct * total / 100 can end
    # up just below the true integer count (e.g. 0.9999999 instead of 1).
    count = int(round(pct * total / 100.0))
    return f'{pct:.1f}% ({count})'

def plot_demographics_pie_chart(demographics_out, path_to_save):
    """Draw a 2x3 grid of donut charts of the demographic columns and save it.

    One chart is drawn for each of "Variant", "Pohlavie", "Zariadenie",
    "Najvyššie dosiahnuté vzdelanie", "Veková skupina" and
    "Aktuálny zamestnanecký status"; every wedge is labelled with its share and
    respondent count. After the figure is saved and shown, the mean and the
    standard deviation of the "Vek" column are printed.

    Args:
        demographics_out: DataFrame with the columns listed above plus "Vek".
            It is copied, so the caller's frame is not modified.
        path_to_save: File path handed to plt.savefig.
    """
    demographics = demographics_out.copy()

    columns_to_plot = [
        "Variant",
        "Pohlavie",
        "Zariadenie",
        "Najvyššie dosiahnuté vzdelanie",
        "Veková skupina",
        "Aktuálny zamestnanecký status",
    ]

    # Set up the figure and axes
    fig, axs = plt.subplots(2, 3, figsize=(16, 10))

    # Colors for the pie charts
    # https://coolors.co/palettes/popular/6%20colors
    colors = ["#6667ab", "#f18aad", "#ea6759", "#f88f58", "#f3c65f", "#8bc28c"]

    # Plot each pie chart
    for ax, column in zip(axs.flatten(), columns_to_plot):
        values = demographics[column].value_counts()
        # value_counts() on a categorical column (such as the pd.cut age groups)
        # also lists unobserved categories with a count of 0; drop them so the
        # chart has no empty wedges with labels stacked on top of each other.
        values = values[values > 0]
        ax.pie(
            values,
            labels=values.index,
            autopct=lambda pct: autopct_with_counts(pct, values),
            startangle=25,
            colors=colors,
            wedgeprops=dict(width=0.3, edgecolor='w'),
            explode=[0.05] * len(values),  # one offset per wedge
            pctdistance=0.45,
            textprops={'fontsize': 12},
        )
        ax.set_title(column, fontsize=16)

    # Adjust layout
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.2, hspace=0.1)

    # Save the plot to a file
    plt.savefig(path_to_save, bbox_inches='tight')

    # Show the plot, then release the figure so repeated calls do not pile up
    plt.show()
    plt.close(fig)

    # Print mean age and standard deviation
    mean_age = demographics["Vek"].mean()
    std_age = demographics["Vek"].std()
    print(f"Mean age: {mean_age:.1f}")
    print(f"Standard deviation: {std_age:.1f}")

In [None]:
# Donut charts for the full (unfiltered) demographics table; saved under data_path_base.
plot_demographics_pie_chart(demographics, data_path_base + "0_demographics_pie_charts_all_data.png")

In [None]:
# Restrict to rows whose Variant is "FG" and chart them.
demographics_fg = demographics.loc[demographics["Variant"].eq("FG")]
plot_demographics_pie_chart(demographics_fg, f"{data_path_base}1_demographics_pie_charts_fg.png")

In [None]:
# Restrict to rows whose Variant is "H" and chart them.
demographics_h = demographics.loc[demographics["Variant"].eq("H")]
plot_demographics_pie_chart(demographics_h, f"{data_path_base}2_demographics_pie_charts_h.png")

In [None]:
# Demographic rows of the respondents assigned to the training split (FG and H together).
train_demographics = demographics.loc[
    demographics["ID"].isin([*train_fg_respondents, *train_h_respondents])
]
train_demographics.shape[0]

In [None]:
# Donut charts for the training-split respondents only.
plot_demographics_pie_chart(train_demographics, data_path_base + "3_demographics_pie_charts_train_set.png")

In [None]:
# Demographic rows of the respondents assigned to the test split (FG and H together).
test_demographics = demographics.loc[
    demographics["ID"].isin([*test_fg_respondents, *test_h_respondents])
]
test_demographics.shape[0]

In [None]:
# Donut charts for the test-split respondents only.
plot_demographics_pie_chart(test_demographics, data_path_base + "4_demographics_pie_charts_test_set.png")

## Big 5

In [None]:
# Load the paired dataset. os.path.join keeps the path valid on systems where
# "\" is not a path separator.
paired_data = pd.read_csv(os.path.join("data", "4_Pair_UXtweak_and_SurveyJS", "4_Pair_UXtweak_and_SurveyJS_data.csv"))

In [None]:
# Unique respondent key of the form "<group_evaluated>_<order>".
paired_data["ID"] = paired_data["group_evaluated"].add("_").add(paired_data["order"].astype(str))

In [None]:
# Display the paired data, now including the ID column.
paired_data

In [None]:
# Number of non-null values in every column, per evaluated group.
paired_data.groupby("group_evaluated").count()

### Advanced analytics

In [None]:
# Columns used for the descriptive statistics: the group label, the point
# totals, the gt_* counts and the bfi2_* trait and sub-trait scores.
# .copy() makes this an independent frame: a later cell adds a column to it,
# which would otherwise write into a slice of paired_data and trigger pandas'
# SettingWithCopyWarning.
aa_paired_data = paired_data[['group_evaluated',
 'total_points',
 'mean_points',
 'gt_lies',
 'gt_half_truths',
 'gt_truths',
 'mean_points_check',
 'bfi2_e',
 'bfi2_a',
 'bfi2_c',
 'bfi2_n',
 'bfi2_o',
 'bfi2_e_sociability',
 'bfi2_e_assertiveness',
 'bfi2_e_energy_level',
 'bfi2_a_compassion',
 'bfi2_a_respectfulness',
 'bfi2_a_trust',
 'bfi2_c_organization',
 'bfi2_c_productiveness',
 'bfi2_c_responsibility',
 'bfi2_n_anxiety',
 'bfi2_n_depression',
 'bfi2_n_emotional_volatility',
 'bfi2_o_intellectual_curiosity',
 'bfi2_o_aesthetic_sensitivity',
 'bfi2_o_creative_imagination',]].copy()
aa_paired_data

In [None]:
# Non-null count of every column, restricted to the FG rows.
aa_paired_data[aa_paired_data['group_evaluated'] == "FG"].count()

In [None]:
# Boolean target: True for rows evaluated in the FG group. assign() returns a
# new frame instead of writing a column into a subset of another frame, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
aa_paired_data = aa_paired_data.assign(is_fg_group=aa_paired_data["group_evaluated"] == "FG")
# Non-null counts of the FG rows; the column is already boolean, so it is used
# directly as the mask.
aa_paired_data[aa_paired_data['is_fg_group']].count()

In [None]:
# detect_categorical_columns is not defined in this notebook (it comes from the
# helpers star-imports); presumably it splits the columns into categorical and
# continuous ones — confirm against helpers. Only the first list is displayed.
categorical_cols, continuous_cols = detect_categorical_columns(aa_paired_data)
categorical_cols

In [None]:
aa_categorical_features = []
aa_target = "is_fg_group"
# Columns that are not continuous features: the group label, the target and
# any categorical feature.
aa_remove = ["group_evaluated", aa_target] + aa_categorical_features
# Everything else counts as a continuous feature.
aa_continuous_features = [column for column in aa_paired_data.columns if column not in aa_remove]
aa_continuous_features

In [None]:
# Output workbook for the descriptive statistics. It is placed in the
# notebook's output folder (relative to the working directory, like every other
# data path here) instead of under one user's absolute home directory, so the
# notebook runs on other machines.
aa_path = data_path_base + "aa_big5_paired_data.xlsx"

In [None]:
# Project helper from the helpers star-imports; presumably computes descriptive
# statistics of the features against the target and writes them to aa_path —
# TODO confirm against helpers.
calculate_advanced_descriptive_stats(aa_target, aa_continuous_features, aa_categorical_features, aa_paired_data, aa_path)

In [None]:
def _boxplot_metrics(values):
    """Return the statistics of one group that are annotated under a boxplot."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # A whisker ends at the most extreme observation that still lies within
    # 1.5 * IQR of the box, not at the fence value itself.
    within_fences = values[(values >= q1 - 1.5 * iqr) & (values <= q3 + 1.5 * iqr)]
    return {
        "Median": values.median(),
        "Mean": values.mean(),
        "Q1": q1,
        "Q3": q3,
        "Min": within_fences.min(),
        "Max": within_fences.max(),
    }


def plot_boxplots(data_out, path_to_save):
    """Plot per-group boxplots of the five bfi2_* trait scores and save the figure.

    One panel is drawn per trait column (bfi2_e, bfi2_a, bfi2_c, bfi2_n,
    bfi2_o) with one box per value of "group_evaluated". Under each panel the
    median, mean, quartiles and whisker ends of every group are printed, groups
    in order of first appearance in the data.

    Note: bfi2_n is plotted as 6 - bfi2_n.

    Args:
        data_out: DataFrame with "group_evaluated" and the five bfi2_* columns.
            A deep copy is used, so the caller's frame is left untouched.
        path_to_save: File path handed to plt.savefig.
    """
    data = data_out.copy(deep=True)

    # Invert bfi2_n values
    data['bfi2_n'] = 6 - data['bfi2_n']

    # Create figure with 5 subplots
    fig, axes = plt.subplots(1, 5, figsize=(14, 8))

    # Define colors for each group
    palette = {"FG": "#6667ab", "H": "#f18aad"}

    # Trait column -> panel title, in panel order
    trait_titles = {
        'bfi2_e': 'Extraverzia',
        'bfi2_a': 'Prívetivosť',
        'bfi2_c': 'Svedomitosť',
        'bfi2_n': 'Negatívna emocionalita',
        'bfi2_o': 'Otvorenosť',
    }
    metric_names = ("Median", "Mean", "Q1", "Q3", "Min", "Max")

    for i, (ax, (trait, title)) in enumerate(zip(axes, trait_titles.items())):
        sns.boxplot(ax=ax, x='group_evaluated', y=trait, data=data, palette=palette, hue='group_evaluated')
        ax.set_title(title, fontsize=16)

        ax.set_ylim(0.75, 5.25)
        ax.set_ylabel('')  # Disable y-axis label
        ax.set_xlabel('')  # Disable x-axis label
        ax.tick_params(axis='both', which='major', labelsize=12)  # Set tick labels size to 12

        if i != 0:
            ax.set_yticks([])  # Disable y-axis ticks for all except the first

        # One metrics dict per group, in order of first appearance
        group_metrics = [
            _boxplot_metrics(data.loc[data['group_evaluated'] == group, trait])
            for group in data['group_evaluated'].unique()
        ]

        # One text line per metric, the groups side by side
        combined_text = '\n'.join(
            '   '.join(f"{name}: {metrics[name]:.2f} " for metrics in group_metrics)
            for name in metric_names
        )
        ax.annotate(combined_text, xy=(0.5, -0.07), xycoords='axes fraction', ha='center', va='top', fontsize=12)

    # Adjust layout
    plt.tight_layout()

    # Save the plot to a file
    plt.savefig(path_to_save, bbox_inches='tight')

    plt.show()

In [None]:
# Trait boxplots for all rows of paired_data.
plot_boxplots(paired_data, data_path_base + "5_boxplots_all_data.png")

In [None]:
# Rows of paired_data that belong to the training-split respondents (FG and H together).
train_paired_data = paired_data.loc[
    paired_data["ID"].isin([*train_fg_respondents, *train_h_respondents])
]
train_paired_data.shape[0]

In [None]:
# Trait boxplots for the training-split rows only.
plot_boxplots(train_paired_data, data_path_base + "6_boxplots_train_set.png")

In [None]:
# Rows of paired_data that belong to the test-split respondents (FG and H together).
test_paired_data = paired_data.loc[
    paired_data["ID"].isin([*test_fg_respondents, *test_h_respondents])
]
test_paired_data.shape[0]

In [None]:
# Trait boxplots for the test-split rows only.
plot_boxplots(test_paired_data, data_path_base + "7_boxplots_test_set.png")

## Lies

In [None]:
# Display paired_data again; the gt_* columns plotted in this section live in this frame.
paired_data

In [None]:
def plot_gt_counts(data_out, path_to_save):
    """Draw three side-by-side bar plots of the gt_* columns per group and save them.

    The panels show "gt_lies", "gt_half_truths" and "gt_truths" (titled
    'Lži', 'Polopravdy', 'Pravdy'), each with one bar per value of
    "group_evaluated". Bar heights come from seaborn's default barplot
    aggregation.

    Args:
        data_out: DataFrame with "group_evaluated" and the three gt_* columns.
            A deep copy is plotted, so the caller's frame is left untouched.
        path_to_save: File path handed to plt.savefig.
    """
    frame = data_out.copy(deep=True)

    # Bar colour per group
    group_colors = {"FG": "#6667ab", "H": "#f18aad"}

    # (column, panel title) for each of the three panels, left to right
    panels = (
        ('gt_lies', 'Lži'),
        ('gt_half_truths', 'Polopravdy'),
        ('gt_truths', 'Pravdy'),
    )

    # One row of three subplots
    fig, axes = plt.subplots(1, 3, figsize=(12, 5))

    for axis, (column, title) in zip(axes, panels):
        sns.barplot(ax=axis, x='group_evaluated', y=column, data=frame, palette=group_colors, hue='group_evaluated')
        axis.set_title(title, fontsize=16)
        # No axis labels; the panel title already names the quantity
        axis.set_ylabel('')
        axis.set_xlabel('')
        axis.tick_params(axis='both', which='major', labelsize=12)

    # Tidy the layout, write the figure to disk, then display it
    plt.tight_layout()
    plt.savefig(path_to_save, bbox_inches='tight')
    plt.show()

In [None]:
# gt_* bar plots for all rows of paired_data.
plot_gt_counts(paired_data, data_path_base + "8_gt_counts_all_data.png")

In [None]:
# gt_* bar plots for the training-split rows only.
plot_gt_counts(train_paired_data, data_path_base + "9_gt_counts_train_set.png")

In [None]:
# gt_* bar plots for the test-split rows only.
plot_gt_counts(test_paired_data, data_path_base + "10_gt_counts_test_set.png")

## Voice metrics

In [None]:
# Load train and test data (paths built with os.path.join so they also work
# where "\" is not a path separator)
voice_metrics_dir = os.path.join("data", "12_PDU_Aggregations_and_Models", "datasets")
voice_metrics_train = pd.read_csv(os.path.join(voice_metrics_dir, "20240514_160028_train.csv"))
voice_metrics_test = pd.read_csv(os.path.join(voice_metrics_dir, "20240514_160028_test.csv"))
# ignore_index gives the combined frame a fresh 0..n-1 index; without it the row
# labels of the two files repeat, and duplicate labels can trip up label-based
# alignment in plotting code.
voice_metrics_preprocessed = pd.concat([voice_metrics_train, voice_metrics_test], ignore_index=True)

In [None]:
# show_box_boxwithout_hist is a project helper from the helpers star-imports;
# presumably it draws box plots and histograms of the listed columns split by
# 'indicator_fg' and saves them to path_to_save — TODO confirm against helpers.
show_box_boxwithout_hist('indicator_fg', ['word_speach_rate', 'personal_pronoun', 'medianF0Hz'], voice_metrics_preprocessed, kde=True, path_to_save=data_path_base + "11_boxplots_voice_metrics.png")

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Remove object columns
voice_metrics_preprocessed_matrix_df = voice_metrics_preprocessed.select_dtypes(exclude=['object'])

df_corr = voice_metrics_preprocessed_matrix_df.corr()
# Select the 5 columns with the largest (signed) correlation with the target;
# the target itself is normally among them, since its self-correlation is 1.
cols = df_corr.nlargest(5, 'indicator_fg')['indicator_fg'].index
# Reuse the pandas correlation matrix instead of recomputing it with
# np.corrcoef, which yields NaN for any column that contains a missing value.
cm = df_corr.loc[cols, cols].values
# Draw on the axes created above rather than relying on the implicit current axes.
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values, ax=ax)
plt.savefig(data_path_base + "12_heatmap_voice_metrics.png", bbox_inches='tight')
plt.show()

## Mouse metrics

In [None]:
# Load train and test data (paths built with os.path.join so they also work
# where "\" is not a path separator)
mouse_metrics_dir = os.path.join("data", "14_Mouse_Model", "datasets")
mouse_metrics_train = pd.read_csv(os.path.join(mouse_metrics_dir, "20240514_160051_train.csv"))
mouse_metrics_test = pd.read_csv(os.path.join(mouse_metrics_dir, "20240514_160051_test.csv"))
# ignore_index gives the combined frame a fresh 0..n-1 index; without it the row
# labels of the two files repeat, and duplicate labels can trip up label-based
# alignment in plotting code.
mouse_metrics_preprocessed = pd.concat([mouse_metrics_train, mouse_metrics_test], ignore_index=True)

In [None]:
# show_box_boxwithout_hist is a project helper from the helpers star-imports;
# presumably it draws box plots and histograms of the listed columns split by
# 'indicator_fg' and saves them to path_to_save — TODO confirm against helpers.
show_box_boxwithout_hist('indicator_fg', ['number_of_x_flips', 'distance_y', 'area_difference'], mouse_metrics_preprocessed, kde=True, path_to_save=data_path_base + "13_boxplots_mouse_metrics.png")

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Remove object columns
mouse_metrics_preprocessed_matrix_df = mouse_metrics_preprocessed.select_dtypes(exclude=['object'])

df_corr = mouse_metrics_preprocessed_matrix_df.corr()
# Select the 5 columns with the largest (signed) correlation with the target;
# the target itself is normally among them, since its self-correlation is 1.
cols = df_corr.nlargest(5, 'indicator_fg')['indicator_fg'].index
# Reuse the pandas correlation matrix instead of recomputing it with
# np.corrcoef, which yields NaN for any column that contains a missing value.
cm = df_corr.loc[cols, cols].values
# Draw on the axes created above rather than relying on the implicit current axes.
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values, ax=ax)
plt.savefig(data_path_base + "14_heatmap_mouse_metrics.png", bbox_inches='tight')
plt.show()