## Interpret data collected in experiment.csv, and run statistics and meaningful plots

### 1 - Creation of the data file (already done)

In [1]:
import csv
import os

# Column names of the experiment log, in the order the rows are written
headers = [
    "DateTime", "NumEpochs", "SwitchEpoch", "AlphaForEpochSup140", "Condition", "Digits", "SilhouetteScore", "NMI",
    "acc_0", "acc_1", "acc_2", "acc_3", "acc_4", "acc_5", "acc_6", "acc_7", "acc_8", "acc_9",
    "acc_mean", "SSIM_mean", "x_class_as_y", "y_class_as_x"
]

file_name = 'experiment.csv'

# The file is opened in append mode so existing rows are never lost; the header
# is therefore written only when the file is missing or empty, otherwise
# re-running this cell would insert a second header row in the middle of the data.
needs_header = not os.path.exists(file_name) or os.path.getsize(file_name) == 0

with open(file_name, 'a', newline='') as file:
    if needs_header:
        writer = csv.writer(file)
        writer.writerow(headers)
        print(f"Created {file_name} with headers.")
    else:
        print(f"{file_name} already exists; headers were not written again.")


Created experiment.csv with headers.


### 2 - Adding values to it (already done during experiments)

In [None]:
# Append the results of one experiment run as a single CSV row.
# The order of the values follows the header row of experiment.csv.
with open('experiment.csv', mode='a', newline='') as data_file:
    timestamp = datetime.now()

    # One list per digit class (0-9): that digit's accuracy at every epoch
    per_digit_acc = [[acc_epoch[digit] for acc_epoch in test_acc] for digit in range(10)]
    # Accuracy averaged over all digits, per epoch
    mean_acc = [statistics.mean(epoch.values()) for epoch in test_acc]
    # Tracked class counts per epoch (or np.nan if normal condition)
    first_tracked = [class_epoch[0] for class_epoch in tracked_class]
    second_tracked = [class_epoch[1] for class_epoch in tracked_class]

    row = [
        timestamp,
        num_epochs,
        swap_epoch,
        AlphaForEpochSup140,
        Condition,
        Digits,
        silhouette,
        nmi_score,
        *per_digit_acc,
        mean_acc,
        test_ssim,
        first_tracked,
        second_tracked,
    ]
    csv.writer(data_file).writerow(row)

### 2 - Curating data

In [38]:
import pandas as pd
import numpy as np
import ast

# Function to safely evaluate string as list
def safe_eval(x):
    """Parse the text form of a Python literal (e.g. ``"[0.1, 0.2]"``) into its value.

    Args:
        x: Value read from a CSV cell, normally a string.

    Returns:
        The evaluated literal, or an empty list when ``x`` is not a valid
        literal (for example ``"nan"`` or a non-string such as a float NaN).
    """
    try:
        return ast.literal_eval(x)
    # Only the errors literal_eval raises for bad input are treated as
    # "no data"; KeyboardInterrupt and other unrelated errors still propagate.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

# Function to calculate mean of a specific range in a list
def calculate_range_mean(lst, start, end):
    """Average the entries of ``lst`` from position ``start`` to ``end``.

    Positions are 1-based and inclusive. Returns NaN when ``lst`` holds fewer
    than ``end`` entries.
    """
    if len(lst) < end:
        return np.nan
    window = lst[start - 1:end]
    return np.mean(window)

# Function to trim list to last 251 elements
def trim_list(x, condition):
    """Return the last 251 entries of ``x`` for the 'Apathy' condition.

    Lists belonging to any other condition, or holding 251 entries or fewer,
    are returned unchanged.
    """
    keep = 251
    too_long = condition == 'Apathy' and len(x) > keep
    return x[-keep:] if too_long else x

In [39]:
# Read the raw experiment log into a DataFrame
df = pd.read_csv(filepath_or_buffer='experiment.csv')

In [40]:
# Columns whose cells hold the text form of a Python list; parse each cell
# back into a real list (cells that cannot be parsed become an empty list).
list_columns = [
    "SilhouetteScore", "NMI",
    *(f"acc_{digit}" for digit in range(10)),
    "acc_mean", "SSIM_mean", "x_class_as_y", "y_class_as_x",
]
for col in list_columns:
    parsed = df[col].apply(safe_eval)
    df[col] = parsed

In [41]:
# For rows of the Apathy condition, keep only the last 251 entries of the
# x_class_as_y and y_class_as_x lists; other rows are left untouched.
for tracked_col in ('x_class_as_y', 'y_class_as_x'):
    df[tracked_col] = df.apply(
        lambda row, name=tracked_col: trim_list(row[name], row['Condition']),
        axis=1,
    )

In [42]:
# Map every base column to the names of its derived columns: one per epoch
# window, named "<prefix>_<start>-<end>".
_window_labels = ('132-141', '142-151', '242-251')
_derived_prefixes = {
    'SilhouetteScore': 'SilhouetteScore',
    'NMI': 'NMI',
    'acc_mean': 'acc_mean',
    'SSIM_mean': 'SSIM_mean',
    'x_class_as_y': 'x_as_y',
    'y_class_as_x': 'y_as_x',
}
new_columns = {
    base: [f"{prefix}_{label}" for label in _window_labels]
    for base, prefix in _derived_prefixes.items()
}

In [43]:
# Fill each derived column with the mean of its base list over the epoch
# window encoded at the end of the column name ("..._<start>-<end>").
for base_col, new_cols in new_columns.items():
    for new_col in new_cols:
        window_label = new_col.rsplit('_', 1)[-1]
        start, end = (int(bound) for bound in window_label.split('-'))
        df[new_col] = df[base_col].apply(
            lambda x, lo=start, hi=end: calculate_range_mean(x, lo, hi)
        )

In [44]:
# Move each group of derived columns so it sits directly after its base column
for base_col, new_cols in new_columns.items():
    insert_index = df.columns.get_loc(base_col) + 1
    for position, new_col in enumerate(new_cols, start=insert_index):
        # pop() removes the column and hands back its data for re-insertion
        df.insert(position, new_col, df.pop(new_col))

In [45]:
# Write the curated table to a new CSV file, leaving out the row index
df.to_csv(path_or_buf='curated_data.csv', index=False)

print("CSV file has been modified and saved as 'curated_data.csv'")

CSV file has been modified and saved as 'curated_data.csv'


### 3 - Stats analysis on key values

Sort by, and run statistics between the categories:
- Condition: "Normal", "Apathy", "Delusion"
- AlphaForEpochSup140: 0.2; 0.5; 0.8
- +/- Digits (for Conditions "Apathy" and "Delusion"): see if the Digits has an effect on the results (hypothesis: yes)

Variables to compare:
- 'SilhouetteScore_132-141' to 'SilhouetteScore_142-151' and to 'SilhouetteScore_242-251'
- 'NMI_132-141' to 'NMI_142-151' and to 'NMI_242-251'
- 'acc_mean_132-141' to 'acc_mean_142-151' and to 'acc_mean_242-251'
- 'SSIM_mean_132-141' to 'SSIM_mean_142-151' and to 'SSIM_mean_242-251'
- 'x_as_y_132-141' to 'x_as_y_142-151' and to 'x_as_y_242-251'
- 'y_as_x_132-141' to 'y_as_x_142-151' and to 'y_as_x_242-251'

In [52]:
# Settings shared by the statistics and plotting cells
condition_pairs = [
    ("Normal", "Apathy"),
    ("Normal", "Delusion"),
]
alpha_values = [0.2, 0.5, 0.8]
variables = ['SilhouetteScore', 'NMI', 'acc_mean', 'SSIM_mean']
# Same metrics plus the two tracked-class series
variable_spe_condition = variables + ['x_class_as_y', 'y_class_as_x']
ranges = ['132-141', '142-151', '242-251']
digit_groups = [(0, 1), (4, 9), (3, 8)]

# Coerce 'Digits' to numbers; entries that cannot be parsed become NaN.
# NOTE(review): if 'Digits' stores digit pairs such as "(0, 1)", every entry
# would be coerced to NaN here — confirm the stored format.
digits_numeric = pd.to_numeric(df['Digits'], errors='coerce')
df['Digits'] = digits_numeric

In [53]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns



def check_normality(data):
    """Report whether ``data`` is compatible with a normal distribution.

    Runs the Shapiro-Wilk test and compares its p-value with 0.05.

    Args:
        data: Sequence (list, array or Series) of numeric observations.

    Returns:
        True when the Shapiro-Wilk p-value is above 0.05. False otherwise,
        including when there are fewer than three observations.
    """
    # Shapiro-Wilk is undefined for fewer than three observations; treat such
    # samples as "normality not established" instead of letting the test fail.
    if len(data) < 3:
        return False
    _, p_value = stats.shapiro(data)
    return bool(p_value > 0.05)


def run_statistical_tests(group1, group2):
    """Compare two independent samples and return ``(p_value, test_name)``.

    An independent-samples t-test is used when both samples pass
    ``check_normality``; otherwise the Mann-Whitney U test is used. Both tests
    are two-sided.

    Args:
        group1: First sample of numeric observations.
        group2: Second sample of numeric observations.

    Returns:
        Tuple of the p-value and the name of the test that produced it.
    """
    if check_normality(group1) and check_normality(group2):
        _, p_value = stats.ttest_ind(group1, group2)
        test_name = "t-test"
    else:
        # Ask for the two-sided alternative explicitly so the p-value is
        # comparable with the (two-sided) t-test whatever the SciPy version:
        # older releases returned a one-sided p-value when it was omitted.
        _, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
        test_name = "Mann-Whitney U test"
    return p_value, test_name


def analyze_variable(df, variable, condition_pair):
    """Print the p-values comparing two conditions for one variable.

    For every alpha in ``alpha_values`` and every epoch window in ``ranges``,
    the ``{variable}_{window}`` column of the first condition is tested
    against that of the second with ``run_statistical_tests``.

    Args:
        df: DataFrame with 'Condition', 'AlphaForEpochSup140' and the
            ``{variable}_{window}`` columns.
        variable: Base name of the column family to test.
        condition_pair: Tuple of the two 'Condition' values to compare.
    """
    normal_condition, other_condition = condition_pair
    print(f"\nAnalysis for {variable}: {normal_condition} vs {other_condition}")

    def rows_for(condition_name, alpha_value):
        # Rows of one condition at one alpha value
        is_condition = df['Condition'] == condition_name
        is_alpha = df['AlphaForEpochSup140'] == alpha_value
        return df[is_condition & is_alpha]

    for alpha in alpha_values:
        print(f"  Alpha = {alpha}:")
        normal_subset = rows_for(normal_condition, alpha)
        other_subset = rows_for(other_condition, alpha)
        for range_val in ranges:
            var = f"{variable}_{range_val}"
            p_value, test_name = run_statistical_tests(normal_subset[var], other_subset[var])
            print(f"    {var}: p-value = {p_value:.4f} ({test_name})")

def analyze_digits_effect(df, variable, condition):
    """Unfinished draft: prints the analysis headings but runs no statistical test.

    NOTE(review): this function name is defined again in later cells of the
    notebook; whichever definition was executed last is the one in effect.
    """
    print(f"\nAnalysis of Digits effect for {variable} in {condition} condition")
    for alpha in alpha_values:
        print(f"  Alpha = {alpha}:")
        # Rows of the requested condition at the current alpha value
        subset = df[(df['Condition'] == condition) & (df['AlphaForEpochSup140'] == alpha)]
        for digit_group in digit_groups:
            # NOTE(review): `subsets` is overwritten on every iteration and is
            # never read afterwards, so this loop has no visible effect.
            subsets = subset[subset['Digits'] == digit_group]

        for range_val in ranges:
            # Name of the windowed-mean column, e.g. "<variable>_132-141"
            var = f"{variable}_{range_val}"
            print(f"    {var}:")



In [None]:
def analyze_digits_effect(df, variable, condition):
    """Draft of a per-digit analysis within one condition.

    NOTE(review): this draft cannot run as written — see the notes below. The
    same function name is also defined in other cells of the notebook.
    """
    print(f"\nAnalysis of Digits effect for {variable} in {condition} condition")
    for alpha in alpha_values:
        print(f"  Alpha = {alpha}:")
        # Rows of the requested condition at the current alpha value
        subset = df[(df['Condition'] == condition) & (df['AlphaForEpochSup140'] == alpha)]
        for digit_group in digit_groups:
            # NOTE(review): overwritten on every iteration, so only the result
            # for the last digit group survives the loop.
            subsets = subset[subset['Digits'] == digit_group]      
        for range_val in ranges:
            var = f"{variable}_{range_val}"
            print(f"    {var}:")
            # NOTE(review): `digit_group` here is the value left over from the
            # loop above, i.e. always the last entry of `digit_groups`.
            for digit in digit_group:
                digit_subset = subsets[subsets['Digits'] == digit]
                print(f"      Digit {digit}:")
                # NOTE(review): this inner loop reuses `range_val` and `var`,
                # shadowing the variables of the enclosing loop.
                for range_val in ranges:
                    var = f"{variable}_{range_val}"
                    # NOTE(review): `other_subset` is not defined anywhere in
                    # this function; unless a global of that name exists, this
                    # line raises NameError.
                    p_value, test_name = run_statistical_tests(digit_subset[var], other_subset[var])
                    print(f"        {var}: p-value = {p_value:.4f} ({test_name})")

In [None]:
def analyze_digits_effect(df, variable, condition):
    """Print pairwise tests between digit groups for one variable and condition.

    For every alpha in ``alpha_values`` and every epoch window in ``ranges``,
    the rows of ``condition`` are split according to ``digit_groups`` and each
    pair of groups is compared with ``run_statistical_tests``.

    Args:
        df: DataFrame with 'Condition', 'AlphaForEpochSup140', a numeric
            'Digits' column and the ``{variable}_{window}`` columns.
        variable: Base name of the column family to test.
        condition: Value of the 'Condition' column to analyse.
    """
    # The Shapiro-Wilk normality check needs at least three observations per
    # sample, so smaller groups are reported as insufficient instead of tested.
    min_samples = 3

    print(f"\nAnalysis of Digits effect for {variable} in {condition} condition")
    for alpha in alpha_values:
        print(f"  Alpha = {alpha}:")
        subset = df[(df['Condition'] == condition) & (df['AlphaForEpochSup140'] == alpha)]
        for range_val in ranges:
            var = f"{variable}_{range_val}"
            print(f"    {var}:")
            # Every unordered pair of digit groups is compared exactly once.
            # NOTE(review): each group is applied as an inclusive (low, high)
            # bound on 'Digits'; the bounds (4, 9) and (3, 8) overlap, so one
            # row can fall into both samples — confirm that range matching,
            # rather than matching the digit pair itself, is intended.
            for i, (low1, high1) in enumerate(digit_groups):
                for low2, high2 in digit_groups[i + 1:]:
                    group1 = subset[(subset['Digits'] >= low1) & (subset['Digits'] <= high1)][var]
                    group2 = subset[(subset['Digits'] >= low2) & (subset['Digits'] <= high2)][var]
                    if len(group1) >= min_samples and len(group2) >= min_samples:
                        p_value, test_name = run_statistical_tests(group1, group2)
                        print(f"      Digits {low1}-{high1} vs {low2}-{high2}: p-value = {p_value:.4f} ({test_name})")
                    else:
                        print(f"      Digits {low1}-{high1} vs {low2}-{high2}: Insufficient data for analysis")

In [54]:
# Start the analysis from the curated file saved on disk
df = pd.read_csv(filepath_or_buffer='curated_data.csv')

# Coerce 'Digits' to numbers; entries that cannot be parsed become NaN.
# NOTE(review): if 'Digits' stores digit pairs such as "(0, 1)", every entry
# would be coerced to NaN here — confirm the stored format.
digits_numeric = pd.to_numeric(df['Digits'], errors='coerce')
df['Digits'] = digits_numeric

In [55]:
# Compare the Normal and Apathy conditions for every metric of interest
condition_pair = "Normal", "Apathy"
for variable in variables:
    analyze_variable(df=df, variable=variable, condition_pair=condition_pair)


Analysis for SilhouetteScore: Normal vs Apathy
  Alpha = 0.2:
    SilhouetteScore_132-141: p-value = 0.9373 (t-test)
    SilhouetteScore_142-151: p-value = 0.0000 (t-test)
    SilhouetteScore_242-251: p-value = 0.0036 (t-test)
  Alpha = 0.5:
    SilhouetteScore_132-141: p-value = 0.2055 (t-test)
    SilhouetteScore_142-151: p-value = 0.0015 (Mann-Whitney U test)
    SilhouetteScore_242-251: p-value = 0.0048 (t-test)
  Alpha = 0.8:
    SilhouetteScore_132-141: p-value = 0.7392 (t-test)
    SilhouetteScore_142-151: p-value = 0.0459 (t-test)
    SilhouetteScore_242-251: p-value = 0.2470 (t-test)

Analysis for NMI: Normal vs Apathy
  Alpha = 0.2:
    NMI_132-141: p-value = 0.2408 (t-test)
    NMI_142-151: p-value = 0.0085 (t-test)
    NMI_242-251: p-value = 0.0001 (t-test)
  Alpha = 0.5:
    NMI_132-141: p-value = 0.4445 (Mann-Whitney U test)
    NMI_142-151: p-value = 0.0012 (t-test)
    NMI_242-251: p-value = 0.0655 (Mann-Whitney U test)
  Alpha = 0.8:
    NMI_132-141: p-value = 0.9328 

In [None]:
# TODO: to do later because of a problem in the analysis

condition = "Apathy"
for variable in variables:
    analyze_digits_effect(df=df, variable=variable, condition=condition)

In [70]:
def plot_variable_across_conditions(df, variable, condition_pair):
    """Save one box plot per epoch window comparing two conditions.

    Each plot shows ``{variable}_{window}`` by condition, split by
    'AlphaForEpochSup140', and is written under ``./Plots/Stats/{variable}/``
    with the suffix ``_0.png``. The y-axis limits are left automatic.

    Args:
        df: DataFrame with 'Condition', 'AlphaForEpochSup140' and the
            ``{variable}_{window}`` columns.
        variable: Base name of the column family to plot.
        condition_pair: Tuple of the two 'Condition' values to show.
    """
    normal_condition, other_condition = condition_pair
    # The rows to plot do not depend on the window, so select them once
    subset = df[df['Condition'].isin([normal_condition, other_condition])]
    # Readable y-axis labels; other variables keep the column name as label
    y_labels = {'acc_mean': "Accuracy (%)", 'NMI': "NMI", 'SSIM_mean': "SSIM"}

    for range_val in ranges:
        var = f"{variable}_{range_val}"
        # Create the figure inside the loop: each figure is closed after it is
        # saved, so every window needs its own 8x6 canvas.
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.boxplot(x='Condition', y=var, hue='AlphaForEpochSup140', data=subset, ax=ax)
        if variable in y_labels:
            ax.set_ylabel(y_labels[variable])
        ax.set_title(f"Stats_{var}: {normal_condition} vs {other_condition}")
        fig.savefig(f"./Plots/Stats/{variable}/Stats_{var}_{normal_condition}_vs_{other_condition}_0.png")
        plt.close(fig)

def plot_individual_variable(df, variable, condition_pair, range_val):
    """Save a box plot of one variable in one epoch window for two conditions.

    The plot is split by 'AlphaForEpochSup140' and uses fixed y-axis limits for
    'acc_mean', 'NMI' and 'SSIM_mean'. It is written under
    ``./Plots/Stats/{variable}/`` with the suffix ``_1.png``.
    """
    normal_condition, other_condition = condition_pair
    var = f"{variable}_{range_val}"
    # (y-limits, y-label) for the variables that get a fixed scale
    axis_settings = {
        'acc_mean': ((65, 100), "Accuracy (%)"),
        'NMI': ((0.4, 1), "NMI"),
        'SSIM_mean': ((0.1, 0.2), "SSIM"),
    }

    plt.figure(figsize=(8, 6))
    shown_conditions = [normal_condition, other_condition]
    subset = df[df['Condition'].isin(shown_conditions)]
    sns.boxplot(data=subset, x='Condition', y=var, hue='AlphaForEpochSup140')
    if variable in axis_settings:
        limits, label = axis_settings[variable]
        plt.ylim(*limits)
        plt.ylabel(label)
    plt.title(f"Stats_{var}: {normal_condition} vs {other_condition}")
    plt.savefig(f"./Plots/Stats/{variable}/Stats_{var}_{normal_condition}_vs_{other_condition}_1.png")
    plt.close()

def plot_digits_effect(df, variable, condition, range_val):
    """Save a box plot of one variable by 'Digits' within a single condition.

    The plot is split by 'AlphaForEpochSup140' and written under
    ``./Plots/Stats/{variable}/``.
    """
    var = f"{variable}_{range_val}"
    condition_rows = df.loc[df['Condition'] == condition]

    plt.figure(figsize=(10, 6))
    sns.boxplot(data=condition_rows, x='Digits', y=var, hue='AlphaForEpochSup140')
    plt.title(f"Stats_{var}: Digit Effect in {condition} Condition")
    output_path = f"./Plots/Stats/{variable}/Stats_{var}_{condition}_digits_effect_boxplot.png"
    plt.savefig(output_path)
    plt.close()

In [71]:
# Box plots for Normal vs Apathy: for every metric, one pass over all epoch
# windows with automatic y-limits, then one plot per window with fixed y-limits.
condition_pair = ("Normal", "Apathy")
for variable in variables:
    plot_variable_across_conditions(df, variable, condition_pair)
    for range_val in ranges:
        plot_individual_variable(df, variable, condition_pair, range_val)

In [None]:
# To work on the visualization function
# Only the first condition ("Apathy") is plotted for now. The list is sliced
# with [:1] so the loop iterates over condition names; indexing with [0] would
# yield the string "Apathy" and the loop would run once per character.
for condition in ["Apathy", "Delusion"][:1]:
    for variable in variables:
        for range_val in ranges:
            plot_digits_effect(df, variable, condition, range_val)

### 5 - Plotting examples

- Plot the mean with standard deviation of the values
- For each variable of interest (SilhouetteScore, NMI, acc_mean, SSIM_mean): histogram of the 3 timepoints (before condition, after condition, end of training) for each condition. 

Looks like 12 bars in 4 well-spaced batches: the first 3 bars are for Condition "Normal", at timepoints "132-141", then "142-151", then "242-251"; 

the next 3 bars are for Condition "Apathy" and Digits "(0,1)", at timepoints "132-141", then "142-151", then "242-251"; 

the next 3 bars are for Condition "Apathy" and Digits "(4,9)", at timepoints "132-141", then "142-151", then "242-251"; 

and the last 3 bars are for Condition "Apathy" and Digits "(3,8)", at timepoints "132-141", then "142-151", then "242-251"

- do the same with delusion