### T-Testing ARAUS

We'll start by setting up our dataset

In [80]:
# Setup
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

file_path = "datasets/ARAUS_precleaned.csv"
data = pd.read_csv(file_path)
print(len(data["participant"].unique()), "unique participants")

data.head()


749 unique participants


Unnamed: 0,participant,fold_r,soundscape,masker,smr,stimulus_index,time_taken,is_attention,pleasant,eventful,...,M04000_0_r,M05000_0_r,M06300_0_r,M08000_0_r,M10000_0_r,M12500_0_r,M16000_0_r,M20000_0_r,Leq_L_r,Leq_R_r
0,ARAUS_00009,4,R0087_segment_binaural_44100_1.wav,silence_00004.wav,3,28,35.592,0,5,5,...,46.13,40.68,38.51,33.42,25.83,21.02,20.67,22.7,73.761966,75.353091
1,ARAUS_00021,1,R0081_segment_binaural_44100_2.wav,traffic_00029.wav,0,4,37.439,0,5,5,...,47.66,42.56,41.86,42.69,37.15,36.5,30.99,19.27,74.202644,73.4938
2,ARAUS_00021,1,R0046_segment_binaural_44100_2.wav,bird_00047.wav,3,8,37.833,0,5,5,...,43.73,39.67,35.29,33.46,27.51,19.27,18.67,12.37,67.246896,68.127026
3,ARAUS_00021,1,R0080_segment_binaural_44100_2.wav,traffic_00006.wav,3,11,33.782,0,5,5,...,44.57,43.3,43.81,36.82,31.24,28.05,23.03,17.66,67.395837,68.006605
4,ARAUS_00021,1,R0115_segment_binaural_44100_1.wav,bird_00059.wav,0,12,37.663,0,5,5,...,37.86,31.16,25.78,19.9,16.79,13.98,14.23,12.36,65.505416,66.575806


Below, we define all the values we will use for our analysis

### DEFINITIONS

In [95]:
#Define which columns from cleaned_ARAUS.csv to use for the analysis
# Format: {col_name in dataset: column name used for printing results}
columns_of_interest = {
    "Savg_r": "Average Sharpness (acum)",
    "Smax_r": "Peak Sharpness (acum)",
    "Navg_r": "Average Loudness (sone)",
    "Nmax_r": "Peak Loudness (sone)",
    "Favg_r": "Average Fluctuation Strength (vacil)",
    "Fmax_r": "Peak Fluctuation Strength (vacil)",
    "Ravg_r": "Average Roughness (asper)",
    "Rmax_r": "Peak Roughness (asper)",
    "Tavg_r": "Average Tonality (tonality units)",
    "Tmax_r": "Peak Tonality (tonality units)",
}
# Define which columns from ARAUS_cleaned.csv to include in ARAUS_relevant.csv
necessary_context = ["participant", "soundscape", "masker", "time_taken"]
necessary_affective = ["pleasant", "eventful", "chaotic", "vibrant", "uneventful", "calm", "annoying", "monotonous"]
relevant_data = data.loc[:, [*necessary_context, *necessary_affective, *list(columns_of_interest.keys())]]

# Write ARAUS csv with only relevant columns to new csv
relevant_data.to_csv("datasets/ARAUS_relevant.csv", index=False)


# TO GROUP OUR DATA:
# Combine participant strings for each soundscape and masker
# Count number of participants combined for each soundscape and masker
# Mean every numerical column (time taken, all afffective ratings, all acoustic features)

# Prime a new column for counting the rows grouped together
relevant_data.insert(0, "merge_count", 1)

# Create aggregations dictionary to define aggregation logic for each column
aggregations = {
    "participant": lambda x: ', '.join(x),
    "merge_count": "sum",
}

# Select only the columns that contain numerical data (and can be averaged)
numeric_cols = list(relevant_data.select_dtypes(include=['Float64', 'Int64']).columns)

for col_name in numeric_cols:
    if col_name not in aggregations:
        aggregations[col_name] = "mean"

merged_data = relevant_data.groupby(["soundscape", "masker"]).agg(aggregations)
print(f"Data compressed from {len(relevant_data)} entries to {len(merged_data)} entries in the merged table.")
print("Average grouping size:", merged_data['merge_count'].mean().round(3), "soundscapes.")
print("Largest group merger count: {size:.1f} soundscapes".format(size=merged_data['merge_count'].max()))
print("\nGroup size distribution:")
print(merged_data["merge_count"].value_counts().sort_index(), "\n")
# merged_data.sort_values(by=["merge_count"], ascending=False).head()

Data compressed from 32232 entries to 15729 entries in the merged table.
Average grouping size: 2.049 soundscapes.
Largest group merger count: 136.0 soundscapes

Group size distribution:
merge_count
1      7696
2      4600
3      3048
4        32
5       112
19       16
20      129
21       82
22        7
32        2
136       5
Name: count, dtype: int64 



In [108]:
class Group:
    def __init__(self, remarkable: bool, short_hand: str, filter_cols: list, percentile: float, data: pd.DataFrame = merged_data):
        self.remarkable = remarkable
        self.short_hand = short_hand
        self.filter_cols = filter_cols
        self.percentile = percentile
        self.data = data  # DataFrame to perform analysis on
        self.descriptive_name = self.create_extended_name()
        self.filtered_data = self.create_filtered_group()  # Store the filtered data as an attribute
    
    def create_extended_name(self):
        """Creates the descriptive name saying top/bottom {percentile} of *filter_cols."""
        prefix = "top" if self.percentile > 0.5 else "tottom"
        readable_percentile = round(100 * (1 - self.percentile if self.percentile > 0.5 else self.percentile))
        filters_description = " & ".join(self.filter_cols)
        return f"{prefix} {readable_percentile}% of {filters_description} soundscapes"

    def create_filtered_group(self):
        """Filters the DataFrame based on provided filters and percentile, storing the result."""
        assert 0 <= self.percentile <= 1, "Percentile must be between 0 and 1"

        ascend = self.percentile < 0.5
        
        # Sort and filter the DataFrame
        group_data = self.data.sort_values(by=self.filter_cols + ["merge_count"], ascending=[ascend] * len(self.filter_cols) + [False])

        slice_index = int(len(group_data) * self.percentile)
        return group_data.iloc[:slice_index, :]

    def __str__(self):
        return self.descriptive_name

# Remarkable groups
# High percentile (.9) means the TOP 10% of the data
# Order by which groups sorting will be prioritized (ex: if pleasant and vibrant listed, pleasant will be sorted first, then vibrant)
remarkable_groups = [
    Group(True, "R_pleasant_90", ["pleasant", "vibrant"], 0.9),
    Group(True, "R_eventful_90", ["eventful"], 0.9),
    Group(True, "R_vibrant_90", ["vibrant"], 0.9),
]

# Comparison groups
# Low percentile (10) means BOTTOM 10% of the data
comparison_groups = [
    Group(False, "C_pleasant_10", ["pleasant"], 0.1),
    Group(False, "C_eventful_10", ["eventful"], 0.1),
    Group(False, "C_vibrant_10", ["vibrant"], 0.1),
]

# Printing for testing
print(remarkable_groups[0])
# print(remarkable_groups[0].filtered_data.head())



top 10% of pleasant & vibrant soundscapes


In [94]:
def create_group(data : pd.DataFrame, filter_cols : list, percentile: float) -> pd.DataFrame:
    """
    Creates a dataframe of a group of data based on the provided filters and a percentile
    
    Args:
        data: the main, large, dataframe to filter
        filter_cols: a list of columns to filter the data by
        percentile: a value between 0 and 1. Determines the cutoff point for the group

    Percentiles above 50% are treated as "high" percentiles, and only the data above the percentile is kept
    Conversely, percentiles below 50% are treated as "low" percentiles, and only the data below the percentile is kept

    Returns:
        pd.DataFrame - A dataframe of the sorted and trimmed group data
    """
    assert 0 <= percentile <= 1, "Percentile must be between 0 and 1"

    ascend = False
    if percentile < 0.5:
        ascend = True

    print([[ascend] * len(filter_cols), False])
    # Prioritizes larger groups (more representative data, but only really has an effect when there's a single filter)
    group_data = data.copy().sort_values(by=[*filter_cols, "merge_count"], ascending=[ascend] * len(filter_cols) + [False])
    slice_index = int(len(group_data) * percentile)
    group_data = group_data.iloc[:slice_index, :]

    return group_data

# print(create_group(merged_data, ["pleasant", "eventful"], 0.99).head())

def process_groups(groups: dict, data: pd.DataFrame, columns: list) -> dict:
    """
    Creates groups from list of filters

    Returns:
      dict - "group names" : [relevant group data]
    """
    group_data = {}
    # items are the key value pairs of each group in dict
    for group_name, group_info in groups.items():
        # Create each group using the provided filters
        single_group_datum = create_group(
            data, group_info["filter_cols"], group_info["filter_vals"]
        )
        # Calculate statistics for each group and store them
        group_data[group_name] = single_group_datum[columns]

        # Write group data to csv
        if group_name[0] == "R":
            path = f"datasets/remarkable_groups/{group_name}.csv"
        else:
            path = f"datasets/comparison_groups/{group_name}.csv"

        single_group_datum[necessary_context + columns].to_csv(path, index=False)
    return group_data


def calculate_statistics(group: pd.DataFrame):
    """Calculate mean, standard deviation and size all columns in a group"""
    return {
        "mean": group.mean(),
        "standard deviation": group.std(),
        "group size": len(group),
        "variance": group.var(),
        # Add more statistics as needed
    }

def process_statistics(group_data: dict):
    """Calculate statistics for all groups in a dictionary"""
    group_stats = {}
    for group_name, group in group_data.items():
        group_stats[group_name] = calculate_statistics(group)
    return group_stats


col_interest_titles = list(columns_of_interest.keys())

# # Process all groups and store their statistics
remarkable_group_data = process_groups(remarkable_groups, data, col_interest_titles)
comparison_group_data = process_groups(comparison_groups, data, col_interest_titles)
remarkable_group_stats = process_statistics(remarkable_group_data)
comparison_group_stats = process_statistics(comparison_group_data)

# print(remarkable_group_stats)
# print(comparison_group_stats)

[[False, False], False]
                                                    participant  merge_count  \
soundscape                         masker                                      
R0011_segment_binaural_44100_1.wav water_00057.wav  ARAUS_00271            1   
R0011_segment_binaural_44100_2.wav bird_00028.wav   ARAUS_00436            1   
                                   water_00071.wav  ARAUS_00326            1   
R0012_segment_binaural_44100_2.wav bird_00008.wav   ARAUS_00208            1   
R0018_segment_binaural_44100_2.wav bird_00055.wav   ARAUS_00446            1   

                                                    time_taken  pleasant  \
soundscape                         masker                                  
R0011_segment_binaural_44100_1.wav water_00057.wav      35.953       5.0   
R0011_segment_binaural_44100_2.wav bird_00028.wav       63.752       5.0   
                                   water_00071.wav      51.469       5.0   
R0012_segment_binaural_44100_2.wav 

KeyError: 'filter_vals'

In [68]:
def select_group(group_data: dict, group_name: str) -> dict:
    """
    Selects a group from the given group_data dictionary (which contains at least 1 group) and returns 
    a dictionary containing only the selected group's statistics or data.

    Works taking in both group_data and group_stats dictionaries.
    """
    return {group_name: group_data[group_name]}

def print_group_stats(group_stats: dict, columns_of_interest: dict, printable_stats: list = ["mean", "standard deviation", "group size"]):
    """
    Prints the statistics of each group in a formatted manner, using the labels from group_stats.
    """
    for group_name, stats in group_stats.items():
        group_size = stats["group size"]
        print(f"### Group: {group_name} - size: {group_size}")
        for col_key, label in columns_of_interest.items():
            for stat_key, stat_value in stats.items():
                if stat_key in printable_stats:
                    # Check if stat_value is a DataFrame or series
                    if isinstance(stat_value, (pd.DataFrame, pd.Series)):
                        formatted_label = f"{stat_key.capitalize()} {label}"
                        print(f"    - **{formatted_label}**: {stat_value[col_key]:.4f}")
                    else:  # For non-DataFrame/Series statistics (like 'size')
                        if stat_key != "group size":
                            formatted_label = f"{stat_key.capitalize()} of {group_name}"
                            print(f"    - **{formatted_label}**: {stat_value}")
            print()

        print("\n")



In [69]:
from scipy.stats import ttest_ind


def significance_test(one_data: pd.DataFrame, two_data: pd.DataFrame) -> tuple:
    """
    Calculate the t-statistic and p-value for the two given groups.

    Args:
        one_data (pd.DataFrame): Dataframe containing data for the first group (with only relevant columns).
        two_data (pd.DataFrame): Dataframe containing data for the second group.

    Returns:
        tuple: A tuple of two dictionaries containing t-statistics and p-values for each column.
    """

    # Initializing dictionaries for results
    t_statistics = {}
    p_values = {}

    # Iterating over each statistic and calculating t-statistic and p-value
    for stat_name in one_data.columns:
        t_statistic, p_value = ttest_ind(
            one_data[stat_name], 
            two_data[stat_name], 
            equal_var=False, 
            nan_policy="omit"
        )
        t_statistics[stat_name] = t_statistic
        p_values[stat_name] = p_value

    return t_statistics, p_values


def verbose_compare_groups(
    group_one_data: dict, 
    group_two_data: dict,
    columns_of_interest: dict, 
    file,
    test_p: float = 0.01, 
    include_insignificant: bool = True
):
    """
    Compares two groups, prints the comparison of their means, and calculates the t-test for significant difference in means.

    Args:
        group_one_data (dict): Dictionary containing statistics for the first group.
        group_two_data (dict): Dictionary containing statistics for the second group.
        columns_of_interest (dict): Dictionary mapping data column keys to their descriptive labels.
        file (Python I/O file object): Open file that can be written to 
        test_p (float): Value for testing significance of t-tests (typically 0.05 or 0.01)
        include_insignificant (bool): Whether or not to include insignificant t-tests in the output
    """

    # Retrieve group names
    group_one_name, group_two_name = list(group_one_data.keys())[0], list(group_two_data.keys())[0]
    one_data, two_data = list(group_one_data.values())[0], list(group_two_data.values())[0]

    file.write(f"## COMPARISON BETWEEN {group_one_name} AND {group_two_name} GROUPS\n\n")

    means_one = calculate_statistics(one_data)['mean']
    means_two = calculate_statistics(two_data)['mean']

    # Perform t-test and compare means
    t_stats, p_values = significance_test(one_data, two_data)

    underscore_counter = 0

    # Write comparison of means and summary of t-tests
    for col_key, label in columns_of_interest.items():
        group_one_stat = means_one[col_key]
        group_two_stat = means_two[col_key]

        higher_lower = "**HIGHER**" if group_one_stat > group_two_stat else "**LOWER**"
        inverse_higher_lower = "**LOWER**" if higher_lower == "**HIGHER**" else "**HIGHER**"

        file.write(f"### PARAMETER: {label}\n")
        file.write(f"- *{group_one_name}* MEAN: {group_one_stat:.4f} - {higher_lower}\n")
        file.write(f"- *{group_two_name}* MEAN: {group_two_stat:.4f} - {inverse_higher_lower}\n")

        significance = "**STATISTICALLY SIGNIFICANT**" if p_values[col_key] < test_p else "NOT A STATISTICALLY SIGNIFICANT"
        if include_insignificant or p_values[col_key] < test_p:
            file.write(f"> {significance} DIFFERENCE WITH P={test_p}: p-value: {p_values[col_key]:.4f}, t-value: {t_stats[col_key]:.4f}\n\n")

        underscore_counter += 1
        if underscore_counter % 2 == 0 and underscore_counter != 10:
            file.write("-------------------\n")


# Compare all remarkable and comparison groups, output to total_comparisons.md file
with open("t-test-outputs/total_comparisons.md", "w") as file:
    # Generate table of contents
    file.write("# Table of Contents\n")
    for r_group in remarkable_group_data:
        for c_group in comparison_group_data:
            file.write(f"- [{r_group} vs. {c_group}](#comparison-between-{r_group.lower()}-and-{c_group.lower()}-groups)\n")
    file.write("\n")
    for r_group in remarkable_group_data:
        for c_group in comparison_group_data:
            verbose_compare_groups(
                select_group(remarkable_group_data, r_group),
                select_group(comparison_group_data, c_group),
                columns_of_interest,
                file
            )
        file.write("______________________________________</br></br></br></br>\n\n")


# Compare corresponding remarkable and comparison groups (pleasant vs. pleasant, vibrant vs. vibrant, etc.)
with open("t-test-outputs/corresponding_comparisons.md", "w") as file:
    file.write("# Table of Contents\n")
    for r_group, c_group in zip(remarkable_group_data, comparison_group_data):
        file.write(f"- [{r_group} vs. {c_group}](#comparison-between-{r_group.lower()}-and-{c_group.lower()}-groups)\n")
    file.write("\n")
    for r_group, c_group in zip(remarkable_group_data, comparison_group_data):
        verbose_compare_groups(
            select_group(remarkable_group_data, r_group),
            select_group(comparison_group_data, c_group),
            columns_of_interest,
            file
        )
        file.write("______________________________________</br></br></br></br>\n\n")

