### T-Testing ARAUS

**Remarkable Groups:**
- Pleasant = 5
- Eventful = 5
- Vibrant = 5

**Comparison Group** 
- Pleasant = 1


In [1]:
# Setup
import warnings
# NOTE(review): with no category/module argument this silences every warning
# raised for the rest of the kernel session — consider narrowing the filter
# so that warnings from later cells are not hidden.
warnings.filterwarnings("ignore")

import pandas as pd
from scipy.stats import ttest_ind, ttest_ind_from_stats

# CSV is addressed by a relative path, i.e. it is resolved against the
# working directory of the kernel.
file_path = "sorted_affective.csv"
data = pd.read_csv(file_path)

# Last expression of the cell: preview the first rows of the loaded frame.
data.head()

Unnamed: 0,participant,fold_r,soundscape,masker,smr,stimulus_index,time_taken,is_attention,pleasant,eventful,...,M04000_0_r,M05000_0_r,M06300_0_r,M08000_0_r,M10000_0_r,M12500_0_r,M16000_0_r,M20000_0_r,Leq_L_r,Leq_R_r
0,ARAUS_00009,4,R0087_segment_binaural_44100_1.wav,silence_00004.wav,3,28,35.592,0,5,5,...,46.13,40.68,38.51,33.42,25.83,21.02,20.67,22.7,73.761966,75.353091
1,ARAUS_00021,1,R0081_segment_binaural_44100_2.wav,traffic_00029.wav,0,4,37.439,0,5,5,...,47.66,42.56,41.86,42.69,37.15,36.5,30.99,19.27,74.202644,73.4938
2,ARAUS_00021,1,R0046_segment_binaural_44100_2.wav,bird_00047.wav,3,8,37.833,0,5,5,...,43.73,39.67,35.29,33.46,27.51,19.27,18.67,12.37,67.246896,68.127026
3,ARAUS_00021,1,R0080_segment_binaural_44100_2.wav,traffic_00006.wav,3,11,33.782,0,5,5,...,44.57,43.3,43.81,36.82,31.24,28.05,23.03,17.66,67.395837,68.006605
4,ARAUS_00021,1,R0115_segment_binaural_44100_1.wav,bird_00059.wav,0,12,37.663,0,5,5,...,37.86,31.16,25.78,19.9,16.79,13.98,14.23,12.36,65.505416,66.575806


In [2]:
# Function to create groups with filters
def create_group(data: pd.DataFrame, filter_cols: list, filter_vals: list) -> pd.DataFrame:
    """
    Return the rows of `data` that satisfy every column == value filter.

    `filter_cols[i]` is compared for equality against `filter_vals[i]`; the
    filters are applied one after another, so a row must match all of them.
    The input frame is copied first and is never modified. With no filters
    at all, a plain copy of `data` is returned.

    RAISES: AssertionError if the two filter lists differ in length.
    """
    assert len(filter_cols) == len(filter_vals), "Filter columns and values must have the same length"

    subset = data.copy()
    for column, wanted in zip(filter_cols, filter_vals):
        keep = subset[column] == wanted
        subset = subset[keep]
    return subset


# Group definitions: group label -> the columns to filter on ("filter_cols")
# and the value each of those columns must equal ("filter_vals").
remarkable_groups = {
    "R_pleasant_5": dict(filter_cols=["pleasant"], filter_vals=[5]),
    "R_eventful_5": dict(filter_cols=["eventful"], filter_vals=[5]),
    "R_vibrant_5": dict(filter_cols=["vibrant"], filter_vals=[5]),
}

comparison_groups = {
    "C_pleasant_1": dict(filter_cols=["pleasant"], filter_vals=[1]),
}

# Data column name -> display label (with unit) used when printing results.
columns_of_interest = dict(
    Savg_r="Average Sharpness (acum)",
    Smax_r="Peak Sharpness (acum)",
    Navg_r="Average Loudness (sone)",
    Nmax_r="Peak Loudness (sone)",
    Favg_r="Average Fluctuation Strength (vacil)",
    Fmax_r="Peak Fluctuation Strength (vacil)",
    Ravg_r="Average Roughness (asper)",
    Rmax_r="Peak Roughness (asper)",
    Tavg_r="Average Tonality (tonality units)",
    Tmax_r="Peak Tonality (tonality units)",
)

In [24]:
def process_groups(groups: dict, data: pd.DataFrame, columns: list) -> dict:
    """
    Build one filtered sub-frame per group definition.

    Every value in `groups` must provide "filter_cols" and "filter_vals";
    these are handed to create_group() together with `data`, and the rows
    that come back are then restricted to `columns`.

    RETURNS: dict of "group name" : rows of that group, limited to `columns`
    """
    return {
        label: create_group(data, spec["filter_cols"], spec["filter_vals"])[columns]
        for label, spec in groups.items()
    }

# Testing
def calculate_statistics(group: pd.DataFrame):
    """
    Summarise a group with its mean, standard deviation, size and variance.

    Mean, standard deviation and variance come from the pandas methods
    .mean(), .std() and .var() called with their default arguments;
    "group size" is len(group).

    RETURNS: dict with the keys "mean", "standard deviation", "group size"
    and "variance", in that order.
    """
    summary = {}
    summary["mean"] = group.mean()
    summary["standard deviation"] = group.std()
    summary["group size"] = len(group)
    summary["variance"] = group.var()
    # Further statistics can be added here as extra keys
    return summary

def process_statistics(group_data: dict):
    """
    Apply calculate_statistics() to every group in `group_data`.

    RETURNS: dict of "group name" : statistics dict for that group, with the
    groups in the same order as in `group_data`.
    """
    return {
        label: calculate_statistics(frame)
        for label, frame in group_data.items()
    }

def significance_test(group_one_stats: dict, group_two_stats: dict):
    """
    Welch's (unequal-variance) t-test between two groups, from summary statistics.

    Both arguments are statistics dicts with the keys "mean",
    "standard deviation" and "group size" (the shape produced by
    calculate_statistics()). The test is computed independently for every
    column, using each group's own mean, standard deviation and size.

    NOTE(review): "group size" is used as the number of observations for
    every column. If a column contains missing values, the mean/std were
    computed on fewer rows than that — confirm the data has no NaNs in the
    columns being tested.

    RETURNS: (t_statistic, p_value). When the means are pandas Series, both
    results are Series indexed by the column labels of group one; otherwise
    they are returned as produced by scipy.
    """
    mean_1, mean_2 = group_one_stats["mean"], group_two_stats["mean"]
    std_1, std_2 = group_one_stats["standard deviation"], group_two_stats["standard deviation"]
    size_1, size_2 = group_one_stats["group size"], group_two_stats["group size"]

    labels = mean_1.index if isinstance(mean_1, pd.Series) else None
    if labels is not None:
        # scipy works positionally, so align every Series on group one's
        # column labels before dropping down to plain arrays.
        std_1 = std_1.reindex(labels)
        mean_2 = mean_2.reindex(labels)
        std_2 = std_2.reindex(labels)
        mean_1, std_1, mean_2, std_2 = (
            series.to_numpy(dtype=float) for series in (mean_1, std_1, mean_2, std_2)
        )

    # Compare the groups column by column from their statistics. (Feeding the
    # two vectors of column means to ttest_ind would instead treat unrelated
    # metrics as the observations of a single sample.)
    result = ttest_ind_from_stats(
        mean1=mean_1,
        std1=std_1,
        nobs1=size_1,
        mean2=mean_2,
        std2=std_2,
        nobs2=size_2,
        equal_var=False,
    )
    t_statistic, p_value = result.statistic, result.pvalue

    if labels is not None:
        # Restore the column labels so results can be looked up by name.
        t_statistic = pd.Series(t_statistic, index=labels, name="t_statistic")
        p_value = pd.Series(p_value, index=labels, name="p_value")

    return t_statistic, p_value

# Column names (the dict keys) of the metrics to analyse
col_interest_titles = list(columns_of_interest.keys())

# Build the filtered per-group frames, then compute the statistics of each group
remarkable_group_data = process_groups(remarkable_groups, data, col_interest_titles)
comparison_group_data = process_groups(comparison_groups, data, col_interest_titles)
remarkable_group_stats = process_statistics(remarkable_group_data)
comparison_group_stats = process_statistics(comparison_group_data)

# Debug output (disabled):
# print(remarkable_group_stats)
# print(comparison_group_stats)

{'R_pleasant_5': {'mean': Savg_r     1.401446
Smax_r     1.895222
Navg_r    12.932157
Nmax_r    20.431250
Favg_r     0.021601
Fmax_r     0.107063
Ravg_r     0.027561
Rmax_r     0.080859
Tavg_r     0.234625
Tmax_r     1.354519
dtype: float64, 'standard deviation': Savg_r    0.271485
Smax_r    0.416037
Navg_r    5.630724
Nmax_r    9.468881
Favg_r    0.020273
Fmax_r    0.052535
Ravg_r    0.007402
Rmax_r    0.050241
Tavg_r    0.263747
Tmax_r    0.908111
dtype: float64, 'group size': 2536, 'variance': Savg_r     0.073704
Smax_r     0.173087
Navg_r    31.705049
Nmax_r    89.659715
Favg_r     0.000411
Fmax_r     0.002760
Ravg_r     0.000055
Rmax_r     0.002524
Tavg_r     0.069563
Tmax_r     0.824665
dtype: float64}, 'R_eventful_5': {'mean': Savg_r     1.407588
Smax_r     1.898087
Navg_r    22.280359
Nmax_r    35.780414
Favg_r     0.030152
Fmax_r     0.131155
Ravg_r     0.035263
Rmax_r     0.105956
Tavg_r     0.483619
Tmax_r     2.280637
dtype: float64, 'standard deviation': Savg_r     0.20946

In [13]:
def select_group(group_stats: dict, group_name: str) -> dict:
    """
    Pick a single group out of `group_stats`.

    RETURNS: a new one-entry dict {group_name: statistics of that group}, so
    the result has the same shape as `group_stats` itself.
    RAISES: KeyError if `group_name` is not a key of `group_stats`.
    """
    chosen = group_stats[group_name]
    return {group_name: chosen}

def print_group_stats(group_stats: dict, columns_of_interest: dict):
    """
    Print the statistics of every group as markdown-style bullet lines.

    For each group a header with its name and size is printed, followed by
    one block per entry of `columns_of_interest` (column key -> display
    label). Within a block, each printable statistic is shown in the order
    in which it appears in the group's statistics dict: Series/DataFrame
    statistics are looked up by column key and formatted to four decimals,
    any other value is printed as-is and labelled with the group name.
    """
    # Statistics to show; pick from the keys returned by calculate_statistics()
    printable_stats = ["mean", "standard deviation", "variance"]

    for group_name, stats in group_stats.items():
        print(f"### Group: {group_name} - size: {stats['group size']}")

        # Keep only the printable statistics, preserving their order in `stats`
        shown = [(name, value) for name, value in stats.items() if name in printable_stats]

        for col_key, label in columns_of_interest.items():
            for name, value in shown:
                if isinstance(value, (pd.DataFrame, pd.Series)):
                    # Per-column statistic: look up this column's value
                    print(f"    - **{name.capitalize()} {label}**: {value[col_key]:.4f}")
                else:
                    # Plain (non-pandas) statistic: print it unformatted
                    print(f"    - **{name.capitalize()} of {group_name}**: {value}")
            print()

        print("\n")


# Show only the "R_pleasant_5" remarkable group
print_group_stats(select_group(remarkable_group_stats, "R_pleasant_5"), columns_of_interest)

# Alternative (disabled): print every remarkable group
# print_group_stats(remarkable_group_stats, columns_of_interest)
# Show all comparison groups
print_group_stats(comparison_group_stats, columns_of_interest)

### Group: R_pleasant_5 - size: 2536
    - **Mean Average Sharpness (acum)**: 1.4014
    - **Standard deviation Average Sharpness (acum)**: 0.2715

    - **Mean Peak Sharpness (acum)**: 1.8952
    - **Standard deviation Peak Sharpness (acum)**: 0.4160

    - **Mean Average Loudness (sone)**: 12.9322
    - **Standard deviation Average Loudness (sone)**: 5.6307

    - **Mean Peak Loudness (sone)**: 20.4313
    - **Standard deviation Peak Loudness (sone)**: 9.4689

    - **Mean Average Fluctuation Strength (vacil)**: 0.0216
    - **Standard deviation Average Fluctuation Strength (vacil)**: 0.0203

    - **Mean Peak Fluctuation Strength (vacil)**: 0.1071
    - **Standard deviation Peak Fluctuation Strength (vacil)**: 0.0525

    - **Mean Average Roughness (asper)**: 0.0276
    - **Standard deviation Average Roughness (asper)**: 0.0074

    - **Mean Peak Roughness (asper)**: 0.0809
    - **Standard deviation Peak Roughness (asper)**: 0.0502

    - **Mean Average Tonality (tonality units)**: